Skip to content

Commit 5ec0cf7

Browse files
adam900710kdave
authored andcommitted
btrfs-progs: rescue: add new command fix-data-checksum
It's a long known problem that direct IO can lead to data checksum mismatches if the user space is also modifying its buffer during writeback. Although it's being fixed in the recent v6.15 release (and backported to older kernels), we still need a user friendly way to fix those problems. This patch introduces the dryrun version of "btrfs rescue fix-data-checksum", which reports the logical bytenr and corrupted mirrors. Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent 3354928 commit 5ec0cf7

File tree

5 files changed

+365
-1
lines changed

5 files changed

+365
-1
lines changed

Documentation/btrfs-rescue.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,25 @@ fix-device-size <device>
5050
5151
WARNING: CPU: 3 PID: 439 at fs/btrfs/ctree.h:1559 btrfs_update_device+0x1c5/0x1d0 [btrfs]
5252
53+
fix-data-checksum <device>
54+
fix data checksum mismatch
55+
56+
There is a long-standing problem that if a user space program is doing
57+
direct IO and modifies the buffer before the write back finished, it
58+
can lead to data checksum mismatches.
59+
60+
This problem was known but not fixed until upstream release v6.15
61+
(backported to older kernels). So it's possible to hit false data
62+
checksum mismatches on any long-running btrfs filesystem.
63+
64+
In that case this command can be used to repair such problems.
65+
66+
``Options``
67+
68+
-r|--readonly
69+
readonly mode, only scan for and report data checksum mismatches,
70+
do not repair
71+
5372
.. _man-rescue-clear-ino-cache:
5473

5574
clear-ino-cache <device>

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ cmds_objects = cmds/subvolume.o cmds/subvolume-list.o \
256256
cmds/inspect.o cmds/balance.o cmds/send.o cmds/receive.o \
257257
cmds/quota.o cmds/qgroup.o cmds/replace.o check/main.o \
258258
cmds/restore.o cmds/rescue.o cmds/rescue-chunk-recover.o \
259-
cmds/rescue-super-recover.o \
259+
cmds/rescue-super-recover.o cmds/rescue-fix-data-checksum.o \
260260
cmds/property.o cmds/filesystem-usage.o cmds/inspect-dump-tree.o \
261261
cmds/inspect-dump-super.o cmds/inspect-tree-stats.o cmds/filesystem-du.o \
262262
cmds/reflink.o \

cmds/rescue-fix-data-checksum.c

Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
/*
2+
* This program is free software; you can redistribute it and/or
3+
* modify it under the terms of the GNU General Public
4+
* License v2 as published by the Free Software Foundation.
5+
*
6+
* This program is distributed in the hope that it will be useful,
7+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
8+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
9+
* General Public License for more details.
10+
*
11+
* You should have received a copy of the GNU General Public
12+
* License along with this program; if not, write to the
13+
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
14+
* Boston, MA 021110-1307, USA.
15+
*/
16+
17+
#include "kerncompat.h"
18+
#include "kernel-shared/disk-io.h"
19+
#include "kernel-shared/ctree.h"
20+
#include "kernel-shared/volumes.h"
21+
#include "common/messages.h"
22+
#include "common/open-utils.h"
23+
#include "cmds/rescue.h"
24+
25+
/*
26+
* Record one corrupted data block.
27+
*
28+
* We do not report immediately, this is for future file deleting support.
29+
*/
30+
struct corrupted_block {
31+
struct list_head list;
32+
/* The logical bytenr of the exact corrupted block. */
33+
u64 logical;
34+
35+
/* The amount of mirrors above logical have. */
36+
unsigned int num_mirrors;
37+
38+
/*
39+
* Which mirror failed.
40+
*
41+
* Note, bit 0 means mirror 1, since mirror 0 means choosing a
42+
* live mirror, and we never utilized that mirror 0.
43+
*/
44+
unsigned long *error_mirror_bitmap;
45+
};
46+
47+
static int global_repair_mode;
48+
LIST_HEAD(corrupted_blocks);
49+
50+
/*
 * Record that @mirror of the data block at @logical is corrupted.
 *
 * Only the last entry of corrupted_blocks is checked for a matching @logical,
 * so all bad mirrors of one block must be reported back to back to end up in
 * the same record.
 *
 * @fs_info:     not used by this function
 * @logical:     logical bytenr of the corrupted block
 * @mirror:      the failing mirror number, starting from 1
 * @num_mirrors: total number of mirrors of the block, sizes the bitmap
 *
 * Return 0 on success, -ENOMEM if a new record could not be allocated.
 */
static int add_corrupted_block(struct btrfs_fs_info *fs_info, u64 logical,
			       unsigned int mirror, unsigned int num_mirrors)
{
	struct corrupted_block *last;

	if (!list_empty(&corrupted_blocks)) {
		last = list_entry(corrupted_blocks.prev, struct corrupted_block, list);
		/* The last entry is the same block, just update its error mirror bitmap. */
		if (last->logical == logical) {
			UASSERT(last->error_mirror_bitmap);
			/* Bit 0 means mirror 1, same as for a newly added record. */
			set_bit(mirror - 1, last->error_mirror_bitmap);
			return 0;
		}
	}

	last = calloc(1, sizeof(*last));
	if (!last)
		return -ENOMEM;

	/*
	 * BITS_TO_LONGS() is a count of longs, not of bytes, so it must be
	 * multiplied by the size of a long to get a usable bitmap.
	 */
	last->error_mirror_bitmap = calloc(BITS_TO_LONGS(num_mirrors),
					   sizeof(unsigned long));
	if (!last->error_mirror_bitmap) {
		free(last);
		return -ENOMEM;
	}
	set_bit(mirror - 1, last->error_mirror_bitmap);
	last->logical = logical;
	last->num_mirrors = num_mirrors;

	list_add_tail(&last->list, &corrupted_blocks);
	return 0;
}
81+
82+
/*
83+
* Verify all mirrors for @logical.
84+
*
85+
* If something critical happened, return <0 and should end the run immediately.
86+
* Otherwise return 0, including data checksum mismatch or read failure.
87+
*/
88+
static int verify_one_data_block(struct btrfs_fs_info *fs_info,
89+
struct extent_buffer *leaf,
90+
unsigned long leaf_offset, u64 logical,
91+
unsigned int num_mirrors)
92+
{
93+
const u32 blocksize = fs_info->sectorsize;
94+
const u32 csum_size = fs_info->csum_size;
95+
u8 *buf;
96+
u8 csum[BTRFS_CSUM_SIZE];
97+
u8 csum_expected[BTRFS_CSUM_SIZE];
98+
int ret = 0;
99+
100+
buf = malloc(blocksize);
101+
if (!buf)
102+
return -ENOMEM;
103+
104+
for (int mirror = 1; mirror <= num_mirrors; mirror++) {
105+
u64 read_len = blocksize;
106+
107+
ret = read_data_from_disk(fs_info, buf, logical, &read_len, mirror);
108+
if (ret < 0) {
109+
/* IO error, add one record. */
110+
ret = add_corrupted_block(fs_info, logical, mirror, num_mirrors);
111+
if (ret < 0)
112+
break;
113+
}
114+
/* Verify the data checksum. */
115+
btrfs_csum_data(fs_info, fs_info->csum_type, buf, csum, blocksize);
116+
read_extent_buffer(leaf, csum_expected, leaf_offset, csum_size);
117+
if (memcmp(csum_expected, csum, csum_size) != 0) {
118+
ret = add_corrupted_block(fs_info, logical, mirror, num_mirrors);
119+
if (ret < 0)
120+
break;
121+
}
122+
}
123+
124+
free(buf);
125+
return ret;
126+
}
127+
128+
static int iterate_one_csum_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path)
129+
{
130+
struct btrfs_key key;
131+
const unsigned long item_ptr_off = btrfs_item_ptr_offset(path->nodes[0],
132+
path->slots[0]);
133+
const u32 blocksize = fs_info->sectorsize;
134+
int num_mirrors;
135+
u64 data_size;
136+
u64 cur;
137+
char *buf;
138+
int ret = 0;
139+
140+
buf = malloc(blocksize);
141+
if (!buf)
142+
return -ENOMEM;
143+
144+
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
145+
data_size = btrfs_item_size(path->nodes[0], path->slots[0]) /
146+
fs_info->csum_size * blocksize;
147+
num_mirrors = btrfs_num_copies(fs_info, key.offset, data_size);
148+
149+
for (cur = 0; cur < data_size; cur += blocksize) {
150+
const unsigned long leaf_offset = item_ptr_off +
151+
cur / blocksize * fs_info->csum_size;
152+
153+
ret = verify_one_data_block(fs_info, path->nodes[0], leaf_offset,
154+
key.offset + cur, num_mirrors);
155+
if (ret < 0)
156+
break;
157+
}
158+
free(buf);
159+
return ret;
160+
}
161+
162+
static int iterate_csum_root(struct btrfs_fs_info *fs_info, struct btrfs_root *csum_root)
163+
{
164+
struct btrfs_path path = { 0 };
165+
struct btrfs_key key;
166+
int ret;
167+
168+
key.objectid = 0;
169+
key.type = 0;
170+
key.offset = 0;
171+
172+
ret = btrfs_search_slot(NULL, csum_root, &key, &path, 0, 0);
173+
if (ret < 0) {
174+
errno = -ret;
175+
error("failed to get the first tree block of csum tree: %m");
176+
return ret;
177+
}
178+
UASSERT(ret > 0);
179+
while (true) {
180+
btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
181+
if (key.type != BTRFS_EXTENT_CSUM_KEY)
182+
goto next;
183+
ret = iterate_one_csum_item(fs_info, &path);
184+
if (ret < 0)
185+
break;
186+
next:
187+
ret = btrfs_next_item(csum_root, &path);
188+
if (ret > 0) {
189+
ret = 0;
190+
break;
191+
}
192+
if (ret < 0) {
193+
errno = -ret;
194+
error("failed to get next csum item: %m");
195+
}
196+
}
197+
btrfs_release_path(&path);
198+
return ret;
199+
}
200+
201+
static void report_corrupted_blocks(void)
202+
{
203+
struct corrupted_block *entry;
204+
205+
if (list_empty(&corrupted_blocks)) {
206+
pr_verbose(LOG_DEFAULT, "no data checksum mismatch found\n");
207+
return;
208+
}
209+
210+
list_for_each_entry(entry, &corrupted_blocks, list) {
211+
bool has_printed = false;
212+
213+
pr_verbose(LOG_DEFAULT, "logical=%llu corrtuped mirrors=", entry->logical);
214+
/* Open coded bitmap print. */
215+
for (int i = 0; i < entry->num_mirrors; i++) {
216+
if (test_bit(i, entry->error_mirror_bitmap)) {
217+
if (has_printed)
218+
pr_verbose(LOG_DEFAULT, ",");
219+
/*
220+
* Bit 0 means mirror 1, thus we need to increase
221+
* the value by 1.
222+
*/
223+
pr_verbose(LOG_DEFAULT, "%d", i + 1);
224+
has_printed=true;
225+
}
226+
}
227+
pr_verbose(LOG_DEFAULT, "\n");
228+
}
229+
}
230+
231+
static void free_corrupted_blocks(void)
232+
{
233+
while (!list_empty(&corrupted_blocks)) {
234+
struct corrupted_block *entry;
235+
236+
entry = list_entry(corrupted_blocks.next, struct corrupted_block, list);
237+
list_del_init(&entry->list);
238+
free(entry->error_mirror_bitmap);
239+
free(entry);
240+
}
241+
}
242+
243+
int btrfs_recover_fix_data_checksum(const char *path, enum btrfs_fix_data_checksum_mode mode)
244+
{
245+
struct btrfs_fs_info *fs_info;
246+
struct btrfs_root *csum_root;
247+
struct open_ctree_args oca = { 0 };
248+
int ret;
249+
250+
if (mode >= BTRFS_FIX_DATA_CSUMS_LAST)
251+
return -EINVAL;
252+
253+
ret = check_mounted(path);
254+
if (ret < 0) {
255+
errno = -ret;
256+
error("could not check mount status: %m");
257+
return ret;
258+
}
259+
if (ret > 0) {
260+
error("%s is currently mounted", path);
261+
return -EBUSY;
262+
}
263+
264+
global_repair_mode = mode;
265+
oca.filename = path;
266+
oca.flags = OPEN_CTREE_WRITES;
267+
fs_info = open_ctree_fs_info(&oca);
268+
if (!fs_info) {
269+
error("failed to open btrfs at %s", path);
270+
return -EIO;
271+
}
272+
csum_root = btrfs_csum_root(fs_info, 0);
273+
if (!csum_root) {
274+
error("failed to get csum root");
275+
ret = -EIO;
276+
goto out_close;
277+
}
278+
ret = iterate_csum_root(fs_info, csum_root);
279+
if (ret) {
280+
errno = -ret;
281+
error("failed to iterate csum tree: %m");
282+
}
283+
report_corrupted_blocks();
284+
out_close:
285+
free_corrupted_blocks();
286+
close_ctree_fs_info(fs_info);
287+
return ret;
288+
}

cmds/rescue.c

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <errno.h>
2323
#include <stdbool.h>
2424
#include <unistd.h>
25+
#include <getopt.h>
2526
#include "kernel-lib/list.h"
2627
#include "kernel-shared/ctree.h"
2728
#include "kernel-shared/volumes.h"
@@ -275,6 +276,55 @@ static int cmd_rescue_fix_device_size(const struct cmd_struct *cmd,
275276
}
276277
static DEFINE_SIMPLE_COMMAND(rescue_fix_device_size, "fix-device-size");
277278

279+
/* Help text of "btrfs rescue fix-data-checksum", terminated by NULL. */
static const char * const cmd_rescue_fix_data_checksum_usage[] = {
	"btrfs rescue fix-data-checksum [options] <device>",
	"Fix data checksum mismatches.",
	"",
	OPTLINE("-r|--readonly", "readonly mode, only report errors without repair"),
	HELPINFO_INSERT_GLOBALS,
	HELPINFO_INSERT_VERBOSE,
	NULL
};
288+
289+
static int cmd_rescue_fix_data_checksum(const struct cmd_struct *cmd,
290+
int argc, char **argv)
291+
{
292+
enum btrfs_fix_data_checksum_mode mode = BTRFS_FIX_DATA_CSUMS_READONLY;
293+
int ret;
294+
295+
optind = 0;
296+
while (1) {
297+
int c;
298+
enum { GETOPT_VAL_DRYRUN = GETOPT_VAL_FIRST };
299+
static const struct option long_options [] = {
300+
{"readonly", no_argument, NULL, 'r'},
301+
{"NULL", 0, NULL, 0},
302+
};
303+
304+
c = getopt_long(argc, argv, "r", long_options, NULL);
305+
if (c < 0)
306+
break;
307+
switch (c) {
308+
case 'r':
309+
mode = BTRFS_FIX_DATA_CSUMS_READONLY;
310+
break;
311+
default:
312+
usage_unknown_option(cmd, argv);
313+
}
314+
}
315+
316+
if (check_argc_min(argc - optind, 1))
317+
return 1;
318+
319+
ret = btrfs_recover_fix_data_checksum(argv[optind], mode);
320+
if (ret < 0) {
321+
errno = -ret;
322+
error("failed to fix data checksums: %m");
323+
}
324+
return !!ret;
325+
}
326+
static DEFINE_SIMPLE_COMMAND(rescue_fix_data_checksum, "fix-data-checksum");
327+
278328
static const char * const cmd_rescue_create_control_device_usage[] = {
279329
"btrfs rescue create-control-device",
280330
"Create /dev/btrfs-control (see 'CONTROL DEVICE' in btrfs(5))",
@@ -527,6 +577,7 @@ static const struct cmd_group rescue_cmd_group = {
527577
&cmd_struct_rescue_super_recover,
528578
&cmd_struct_rescue_zero_log,
529579
&cmd_struct_rescue_fix_device_size,
580+
&cmd_struct_rescue_fix_data_checksum,
530581
&cmd_struct_rescue_create_control_device,
531582
&cmd_struct_rescue_clear_ino_cache,
532583
&cmd_struct_rescue_clear_space_cache,

0 commit comments

Comments
 (0)