diff --git a/extra/mariabackup/backup_copy.cc b/extra/mariabackup/backup_copy.cc index 22a40e5fb1042..885b493a174ba 100644 --- a/extra/mariabackup/backup_copy.cc +++ b/extra/mariabackup/backup_copy.cc @@ -1772,7 +1772,7 @@ copy_back() if it exists. */ ds_tmp = ds_create(dst_dir, DS_TYPE_LOCAL); - if (!(ret = copy_or_move_file(ds_tmp, LOG_FILE_NAME, LOG_FILE_NAME, + if (!(ret = copy_or_move_file(ds_tmp, "ib_logfile0", "ib_logfile0", dst_dir, 1))) { goto cleanup; } @@ -1869,7 +1869,7 @@ copy_back() } /* skip the redo log (it was already copied) */ - if (!strcmp(filename, LOG_FILE_NAME)) { + if (!strcmp(filename, "ib_logfile0")) { continue; } diff --git a/extra/mariabackup/xtrabackup.cc b/extra/mariabackup/xtrabackup.cc index 18fd030e644b4..33c07910f5536 100644 --- a/extra/mariabackup/xtrabackup.cc +++ b/extra/mariabackup/xtrabackup.cc @@ -2705,7 +2705,7 @@ static bool innodb_init() srv_log_group_home_dir= xtrabackup_target_dir; bool ret; - const std::string ib_logfile0{get_log_file_path()}; + const std::string ib_logfile0{log_sys.get_circular_path()}; os_file_delete_if_exists_func(ib_logfile0.c_str(), nullptr); os_file_t file= os_file_create_func(ib_logfile0.c_str(), OS_FILE_CREATE, @@ -5569,10 +5569,11 @@ static bool xtrabackup_backup_func() /* open the log file */ memset(&stat_info, 0, sizeof(MY_STAT)); - dst_log_file = ds_open(backup_datasinks.m_redo, LOG_FILE_NAME, &stat_info); + dst_log_file = + ds_open(backup_datasinks.m_redo, "ib_logfile0", &stat_info); if (dst_log_file == NULL) { - msg("Error: failed to open the target stream for '%s'.", - LOG_FILE_NAME); + msg("Error: failed to open the target stream" + " for 'ib_logfile0'."); goto fail; } diff --git a/mysql-test/include/innodb_encrypt_log.inc b/mysql-test/include/innodb_encrypt_log.inc index 5beebeae81f07..fd3e0ceea5758 100644 --- a/mysql-test/include/innodb_encrypt_log.inc +++ b/mysql-test/include/innodb_encrypt_log.inc @@ -2,3 +2,7 @@ # (see include/innodb_encrypt_log.combinations) --source 
include/have_innodb.inc +if ($MTR_COMBINATION_CRYPT) +{ +--source ../suite/encryption/include/skip_innodb_log_archive.inc +} diff --git a/mysql-test/mariadb-test-run.pl b/mysql-test/mariadb-test-run.pl index ee3412c9cd154..741ab3e49036a 100755 --- a/mysql-test/mariadb-test-run.pl +++ b/mysql-test/mariadb-test-run.pl @@ -326,7 +326,7 @@ END my $opt_debug_sync_timeout= 300; # Default timeout for WAIT_FOR actions. my $warn_seconds = 60; -my $rebootstrap_re= '--innodb[-_](?:page[-_]size|checksum[-_]algorithm|undo[-_]tablespaces|log[-_]group[-_]home[-_]dir|data[-_]home[-_]dir)|data[-_]file[-_]path|force_rebootstrap'; +my $rebootstrap_re= '--innodb[-_](?:page[-_]size|checksum[-_]algorithm|undo[-_]tablespaces|log[-_](group[-_]home[-_]dir|archive)|data[-_]home[-_]dir)|data[-_]file[-_]path|force_rebootstrap'; sub testcase_timeout ($) { return $opt_testcase_timeout * 60; } sub check_timeout ($) { return testcase_timeout($_[0]); } @@ -3145,7 +3145,7 @@ sub mysql_install_db { # need to be given to the bootstrap process as well as the # server process. 
foreach my $extra_opt ( @opt_extra_mysqld_opt ) { - if ($extra_opt =~ /--innodb/) { + if ($extra_opt =~ /--((loose|skip)[-_])*innodb/) { mtr_add_arg($args, $extra_opt); } } diff --git a/mysql-test/suite/encryption/include/skip_innodb_log_archive.inc b/mysql-test/suite/encryption/include/skip_innodb_log_archive.inc new file mode 100644 index 0000000000000..3fa44408a68a3 --- /dev/null +++ b/mysql-test/suite/encryption/include/skip_innodb_log_archive.inc @@ -0,0 +1,12 @@ +--disable_query_log +SET STATEMENT sql_log_bin=0 FOR +call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed\\."); +SET STATEMENT sql_log_bin=0 FOR +call mtr.add_suppression("InnoDB: Plugin initialization aborted"); +SET STATEMENT sql_log_bin=0 FOR +call mtr.add_suppression("InnoDB: ib_0.*\\.log does not match innodb_encrypt_log"); +--enable_query_log +if (`SELECT COUNT(*)=0 FROM information_schema.global_variables where variable_name='innodb_log_archive' and variable_value='OFF'`) +{ + --skip Test requires innodb_log_archive=OFF +} diff --git a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result index 3c3e4831d8a0f..b7bdee10daa90 100644 --- a/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result +++ b/mysql-test/suite/encryption/r/innodb_encrypt_log_corruption.result @@ -20,6 +20,13 @@ AND support IN ('YES', 'DEFAULT', 'ENABLED'); ENGINE SUPPORT COMMENT TRANSACTIONS XA SAVEPOINTS FOUND 1 /InnoDB: Upgrade after a crash is not supported. 
This redo log was created before MariaDB 10\.2\.2, and we did not find a valid checkpoint/ in mysqld.1.err # empty redo log from before MariaDB 10.2.2 +# restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-recovery-target=12345 +SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES +WHERE engine = 'innodb' +AND support IN ('YES', 'DEFAULT', 'ENABLED'); +COUNT(*) +0 +FOUND 1 /InnoDB: cannot fulfill innodb_log_recovery_target=12345!=/ in mysqld.1.err # restart: --innodb-data-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-log-group-home-dir=MYSQLTEST_VARDIR/tmp/log_corruption --innodb-force-recovery=5 --innodb-log-file-size=4m SELECT COUNT(*) FROM INFORMATION_SCHEMA.ENGINES WHERE engine = 'innodb' diff --git a/mysql-test/suite/encryption/t/bulk_insert.test b/mysql-test/suite/encryption/t/bulk_insert.test index ce7804cb4e067..94b40f0152232 100644 --- a/mysql-test/suite/encryption/t/bulk_insert.test +++ b/mysql-test/suite/encryption/t/bulk_insert.test @@ -1,4 +1,5 @@ --source include/have_innodb.inc +--source include/skip_innodb_log_archive.inc # innodb_encrypt_log --source include/have_sequence.inc --source include/have_file_key_management_plugin.inc diff --git a/mysql-test/suite/encryption/t/corrupted_during_recovery.test b/mysql-test/suite/encryption/t/corrupted_during_recovery.test index dabf06dd04789..e07b30c2935ec 100644 --- a/mysql-test/suite/encryption/t/corrupted_during_recovery.test +++ b/mysql-test/suite/encryption/t/corrupted_during_recovery.test @@ -1,4 +1,5 @@ --source include/have_innodb.inc +--source ../../suite/innodb/include/skip_innodb_log_archive.inc # FIXME --source include/have_file_key_management_plugin.inc --disable_query_log diff --git a/mysql-test/suite/encryption/t/debug_key_management.test b/mysql-test/suite/encryption/t/debug_key_management.test index 9638391e69058..3211687c0c294 100644 --- a/mysql-test/suite/encryption/t/debug_key_management.test 
+++ b/mysql-test/suite/encryption/t/debug_key_management.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_debug.inc -- source include/innodb_undo_tablespaces.inc -- source include/not_embedded.inc diff --git a/mysql-test/suite/encryption/t/doublewrite_debug.test b/mysql-test/suite/encryption/t/doublewrite_debug.test index 4f2215240441f..0f7e71b20abb6 100644 --- a/mysql-test/suite/encryption/t/doublewrite_debug.test +++ b/mysql-test/suite/encryption/t/doublewrite_debug.test @@ -1,4 +1,5 @@ --source include/have_innodb.inc +--source ../../suite/innodb/include/skip_innodb_log_archive.inc # FIXME --source include/have_debug.inc --source include/not_embedded.inc --source include/have_file_key_management_plugin.inc diff --git a/mysql-test/suite/encryption/t/encrypt_and_grep.test b/mysql-test/suite/encryption/t/encrypt_and_grep.test index 648ad80780c93..988604db58ecd 100644 --- a/mysql-test/suite/encryption/t/encrypt_and_grep.test +++ b/mysql-test/suite/encryption/t/encrypt_and_grep.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/innodb_undo_tablespaces.inc -- source include/have_file_key_management_plugin.inc diff --git a/mysql-test/suite/encryption/t/encryption_force.test b/mysql-test/suite/encryption/t/encryption_force.test index 3c6f039184b96..28b49c866b489 100644 --- a/mysql-test/suite/encryption/t/encryption_force.test +++ b/mysql-test/suite/encryption/t/encryption_force.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_partition.inc -- source include/have_example_key_management_plugin.inc diff --git a/mysql-test/suite/encryption/t/file_creation.test b/mysql-test/suite/encryption/t/file_creation.test index 6b0126831a4ca..63e10b7f8d39f 100644 --- 
a/mysql-test/suite/encryption/t/file_creation.test +++ b/mysql-test/suite/encryption/t/file_creation.test @@ -1,4 +1,5 @@ --source include/have_innodb.inc +--source ../../suite/innodb/include/skip_innodb_log_archive.inc # FIXME --source include/have_example_key_management_plugin.inc let $restart_noprint=2; # embedded does not support restart diff --git a/mysql-test/suite/encryption/t/innodb-first-page-read.opt b/mysql-test/suite/encryption/t/innodb-first-page-read.opt deleted file mode 100644 index 38d69691ed6aa..0000000000000 --- a/mysql-test/suite/encryption/t/innodb-first-page-read.opt +++ /dev/null @@ -1,5 +0,0 @@ ---innodb-encrypt-tables=ON ---innodb-encrypt-log=ON ---innodb-encryption-rotate-key-age=15 ---innodb-encryption-threads=4 ---innodb-tablespaces-encryption diff --git a/mysql-test/suite/encryption/t/innodb-key-rotation-disable.test b/mysql-test/suite/encryption/t/innodb-key-rotation-disable.test index 1bd69365f6892..33936e1704768 100644 --- a/mysql-test/suite/encryption/t/innodb-key-rotation-disable.test +++ b/mysql-test/suite/encryption/t/innodb-key-rotation-disable.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_file_key_management_plugin.inc # not embedded because of restarts -- source include/not_embedded.inc diff --git a/mysql-test/suite/encryption/t/innodb-page_encryption_log_encryption.test b/mysql-test/suite/encryption/t/innodb-page_encryption_log_encryption.test index a736c7292ad35..3c64c039f3a45 100644 --- a/mysql-test/suite/encryption/t/innodb-page_encryption_log_encryption.test +++ b/mysql-test/suite/encryption/t/innodb-page_encryption_log_encryption.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/not_embedded.inc -- source include/have_file_key_management_plugin.inc diff --git a/mysql-test/suite/encryption/t/innodb-redo-badkey.test 
b/mysql-test/suite/encryption/t/innodb-redo-badkey.test index bacc71dd2c86e..c7dad93170870 100644 --- a/mysql-test/suite/encryption/t/innodb-redo-badkey.test +++ b/mysql-test/suite/encryption/t/innodb-redo-badkey.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +--source ../../suite/innodb/include/skip_innodb_log_archive.inc # FIXME -- source include/have_file_key_management_plugin.inc # embedded does not support restart -- source include/not_embedded.inc diff --git a/mysql-test/suite/encryption/t/innodb-redo-nokeys.test b/mysql-test/suite/encryption/t/innodb-redo-nokeys.test index 87a9e7a146e1b..507bedbb68e3d 100644 --- a/mysql-test/suite/encryption/t/innodb-redo-nokeys.test +++ b/mysql-test/suite/encryption/t/innodb-redo-nokeys.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +--source ../../suite/innodb/include/skip_innodb_log_archive.inc # FIXME -- source include/have_file_key_management_plugin.inc # embedded does not support restart -- source include/not_embedded.inc diff --git a/mysql-test/suite/encryption/t/innodb_encrypt_freed.test b/mysql-test/suite/encryption/t/innodb_encrypt_freed.test index 408e874a3b242..b37ff06dac1f7 100644 --- a/mysql-test/suite/encryption/t/innodb_encrypt_freed.test +++ b/mysql-test/suite/encryption/t/innodb_encrypt_freed.test @@ -1,4 +1,5 @@ --source include/have_innodb.inc +--source include/skip_innodb_log_archive.inc # innodb_encrypt_log --source include/have_example_key_management_plugin.inc --source include/have_debug.inc --source include/not_embedded.inc diff --git a/mysql-test/suite/encryption/t/innodb_encrypt_log.test b/mysql-test/suite/encryption/t/innodb_encrypt_log.test index 5448a606ba807..cdb807c157780 100644 --- a/mysql-test/suite/encryption/t/innodb_encrypt_log.test +++ b/mysql-test/suite/encryption/t/innodb_encrypt_log.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/not_embedded.inc -- source filekeys_plugin.inc 
diff --git a/mysql-test/suite/encryption/t/innodb_encrypt_log_corruption.test b/mysql-test/suite/encryption/t/innodb_encrypt_log_corruption.test index f1642e83e32e0..19858188cf1ff 100644 --- a/mysql-test/suite/encryption/t/innodb_encrypt_log_corruption.test +++ b/mysql-test/suite/encryption/t/innodb_encrypt_log_corruption.test @@ -1,3 +1,4 @@ +--source include/skip_innodb_log_archive.inc # innodb_encrypt_log --let $no_cleanup=1 --source ../../innodb/t/log_corruption.test diff --git a/mysql-test/suite/encryption/t/innodb_encryption-page-compression.test b/mysql-test/suite/encryption/t/innodb_encryption-page-compression.test index 57c8721282f4a..cf40cfc3ac816 100644 --- a/mysql-test/suite/encryption/t/innodb_encryption-page-compression.test +++ b/mysql-test/suite/encryption/t/innodb_encryption-page-compression.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_example_key_management_plugin.inc -- source include/not_embedded.inc # This test is too slow for valgrind and causes innnodb semaphores to time out diff --git a/mysql-test/suite/encryption/t/innodb_encryption.test b/mysql-test/suite/encryption/t/innodb_encryption.test index 2b0b2b8d7fb5c..113bbf152605d 100644 --- a/mysql-test/suite/encryption/t/innodb_encryption.test +++ b/mysql-test/suite/encryption/t/innodb_encryption.test @@ -2,6 +2,7 @@ # # -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_example_key_management_plugin.inc -- source include/innodb_undo_tablespaces.inc diff --git a/mysql-test/suite/encryption/t/innodb_encryption_discard_import.test b/mysql-test/suite/encryption/t/innodb_encryption_discard_import.test index e33aaec3e21c0..c1f51be89883f 100644 --- a/mysql-test/suite/encryption/t/innodb_encryption_discard_import.test +++ b/mysql-test/suite/encryption/t/innodb_encryption_discard_import.test @@ -1,4 +1,5 @@ -- source 
include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_example_key_management_plugin.inc -- source include/not_valgrind.inc -- source include/not_embedded.inc diff --git a/mysql-test/suite/encryption/t/innodb_encryption_tables.test b/mysql-test/suite/encryption/t/innodb_encryption_tables.test index d03bc890ba4ed..3a8d244a1e3f4 100644 --- a/mysql-test/suite/encryption/t/innodb_encryption_tables.test +++ b/mysql-test/suite/encryption/t/innodb_encryption_tables.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_example_key_management_plugin.inc -- source include/not_embedded.inc # We can't run this test under valgrind as it 'takes forever' diff --git a/mysql-test/suite/encryption/t/innodb_first_page.test b/mysql-test/suite/encryption/t/innodb_first_page.test index db4d8eb3b16f5..838fc8396a9a8 100644 --- a/mysql-test/suite/encryption/t/innodb_first_page.test +++ b/mysql-test/suite/encryption/t/innodb_first_page.test @@ -3,6 +3,7 @@ # --source include/have_innodb.inc +--source include/skip_innodb_log_archive.inc # innodb_encrypt_log --source include/have_file_key_management_plugin.inc --source include/innodb_undo_tablespaces.inc diff --git a/mysql-test/suite/encryption/t/innodb_onlinealter_encryption.test b/mysql-test/suite/encryption/t/innodb_onlinealter_encryption.test index dc6d1e6f93c86..0b5c70339f704 100644 --- a/mysql-test/suite/encryption/t/innodb_onlinealter_encryption.test +++ b/mysql-test/suite/encryption/t/innodb_onlinealter_encryption.test @@ -1,4 +1,5 @@ -- source include/have_innodb.inc +-- source include/skip_innodb_log_archive.inc # innodb_encrypt_log -- source include/have_file_key_management_plugin.inc # test uses restart -- source include/not_embedded.inc diff --git a/mysql-test/suite/encryption/t/recovery_memory.test b/mysql-test/suite/encryption/t/recovery_memory.test index 
fc6f15f7ee27f..af539b3a36bec 100644 --- a/mysql-test/suite/encryption/t/recovery_memory.test +++ b/mysql-test/suite/encryption/t/recovery_memory.test @@ -1,5 +1,6 @@ --source include/have_debug.inc --source include/have_innodb.inc +--source include/skip_innodb_log_archive.inc # innodb_encrypt_log --source include/have_sequence.inc --source filekeys_plugin.inc diff --git a/mysql-test/suite/innodb/include/skip_innodb_log_archive.inc b/mysql-test/suite/innodb/include/skip_innodb_log_archive.inc new file mode 100644 index 0000000000000..984e5fefa369a --- /dev/null +++ b/mysql-test/suite/innodb/include/skip_innodb_log_archive.inc @@ -0,0 +1,4 @@ +if (`SELECT @@GLOBAL.innodb_log_archive`) +{ + --skip Test requires innodb_log_archive=OFF +} diff --git a/mysql-test/suite/innodb/r/corrupted_during_recovery.result b/mysql-test/suite/innodb/r/corrupted_during_recovery.result index 593943b4951ea..d063fcb0132cb 100644 --- a/mysql-test/suite/innodb/r/corrupted_during_recovery.result +++ b/mysql-test/suite/innodb/r/corrupted_during_recovery.result @@ -1,14 +1,19 @@ -CREATE TABLE t1(a BIGINT PRIMARY KEY) ENGINE=InnoDB; +CREATE TABLE t1(a BIGINT PRIMARY KEY) ENGINE=InnoDB STATS_PERSISTENT=0; INSERT INTO t1 VALUES(1); +SET GLOBAL innodb_max_purge_lag_wait=0, innodb_log_checkpoint_now=ON; CREATE TABLE t2(a BIGINT PRIMARY KEY) ENGINE=InnoDB; INSERT INTO t1 VALUES(2); SET GLOBAL innodb_flush_log_at_trx_commit=1; INSERT INTO t2 VALUES(1); # Kill the server +SELECT * FROM t2; +Got one of the listed errors +SELECT * FROM t2; +ERROR 42000: Unknown storage engine 'InnoDB' +FOUND 1 /InnoDB: impossible innodb_log_recovery_start=/ in mysqld.1.err # Corrupt the pages SELECT * FROM t1; ERROR 42000: Unknown storage engine 'InnoDB' -FOUND 1 /InnoDB: Page \[page id: space=[1-9][0-9]*, page number=3\] log sequence number 1311768467463790320 is in the future!/ in mysqld.1.err SELECT * FROM t1; a 1 @@ -18,6 +23,7 @@ a CHECK TABLE t2; Table Op Msg_type Msg_text test.t2 check status OK +FOUND 1 /InnoDB: 
Page \[page id: space=[1-9][0-9]*, page number=3\] log sequence number 1311768467463790320 is in the future!/ in mysqld.1.err DROP TABLE t1, t2; CREATE TABLE t1(pk SERIAL) ENGINE=InnoDB; INSERT INTO t1 VALUES (1),(2),(3); diff --git a/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff b/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff index 283bbe96aae97..e4c5128b0377f 100644 --- a/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff +++ b/mysql-test/suite/innodb/r/innodb-wl5522,strict_crc32.rdiff @@ -1,6 +1,27 @@ --- innodb-wl5522.result +++ innodb-wl5522,strict_crc32.result~ -@@ -131,8 +131,7 @@ +@@ -1,9 +1,6 @@ + call mtr.add_suppression("InnoDB: Unable to import tablespace .* because it already exists. Please DISCARD the tablespace before IMPORT\\."); + call mtr.add_suppression("Index for table 't2' is corrupt; try to repair it"); + call mtr.add_suppression("InnoDB: Cannot save statistics for table `test`\\.`t1` because the \\.ibd file is missing"); +-call mtr.add_suppression("InnoDB: cannot fulfill innodb_log_recovery_target=123456<"); +-call mtr.add_suppression("InnoDB: Plugin initialization aborted"); +-call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed\\."); + FLUSH TABLES; + CREATE TABLE t1 + (a INT AUTO_INCREMENT PRIMARY KEY, +@@ -37,10 +34,6 @@ + t1.ibd + t2.frm + t2.ibd +-# restart: --innodb-log-recovery-target=123456 +-FOUND 1 /InnoDB: cannot fulfill innodb_log_recovery_target=123456page.frame) == + byte *const fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + + trx_sys_block->page.frame; + + if (mach_read_from_4(fseg_header + FSEG_HEADER_SIZE) == TRX_SYS_DOUBLEWRITE_MAGIC_N) { /* The doublewrite buffer has already been created: just read in @@ -136,21 +138,21 @@ bool buf_dblwr_t::create() noexcept sql_print_information("InnoDB: Doublewrite buffer not found:" " creating new"); - /* FIXME: After this point, the doublewrite buffer creation - is not atomic. 
The doublewrite buffer should not exist in + /* FIXME: The doublewrite buffer should not exist in the InnoDB system tablespace file in the first place. It could be located in separate optional file(s) in a user-specified location. */ } - byte *fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + - trx_sys_block->page.frame; + mtr_t init_mtr{nullptr}; + init_mtr.start(); + for (uint32_t prev_page_no= 0, i= 0, extent_size= FSP_EXTENT_SIZE; i < 2 * size + extent_size / 2; i++) { buf_block_t *new_block= fseg_alloc_free_page_general(fseg_header, prev_page_no + 1, FSP_UP, - false, &mtr, &mtr, &err); + false, &mtr, &init_mtr, &err); if (!new_block) { sql_print_error("InnoDB: Cannot create doublewrite buffer: " @@ -165,75 +167,39 @@ bool buf_dblwr_t::create() noexcept return false; } - /* We read the allocated pages to the buffer pool; when they are - written to disk in a flush, the space id and page number fields - are also written to the pages. When we at database startup read - pages from the doublewrite buffer, we know that if the space id - and page number in them are the same as the page position in the - tablespace, then the page has not been written to in - doublewrite. */ - - ut_ad(new_block->page.lock.not_recursive()); const page_id_t id= new_block->page.id(); - /* We only do this in the debug build, to ensure that the check in - buf_flush_init_for_writing() will see a valid page type. The - flushes of new_block are actually unnecessary here. */ - ut_d(mtr.write<2>(*new_block, FIL_PAGE_TYPE + new_block->page.frame, - FIL_PAGE_TYPE_SYS)); + /* Normally, allocated pages will be modified further. However, + the pages of the doublewrite buffer are just dummy storage, not + covered by the write-ahead log. 
*/ + ut_ad(init_mtr.get_savepoint() == 1); + ut_ad(init_mtr.m_memo[0].object == new_block); + ut_ad(init_mtr.m_memo[0].type == MTR_MEMO_PAGE_X_MODIFY); + init_mtr.m_memo[0].type= MTR_MEMO_PAGE_X_FIX; + init_mtr.rollback_to_savepoint(0, 1); + init_mtr.m_log.erase(); if (i == size / 2) - { ut_a(id.page_no() == size); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK1 + - trx_sys_block->page.frame, id.page_no()); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + - TRX_SYS_DOUBLEWRITE_BLOCK1 + trx_sys_block->page.frame, - id.page_no()); - } else if (i == size / 2 + size) - { ut_a(id.page_no() == 2 * size); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_BLOCK2 + - trx_sys_block->page.frame, id.page_no()); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_REPEAT + - TRX_SYS_DOUBLEWRITE_BLOCK2 + trx_sys_block->page.frame, - id.page_no()); - } else if (i > size / 2) ut_a(id.page_no() == prev_page_no + 1); - - if (((i + 1) & 15) == 0) { - /* rw_locks can only be recursively x-locked 2048 times. (on 32 - bit platforms, (lint) 0 - (X_LOCK_DECR * 2049) is no longer a - negative number, and thus lock_word becomes like a shared lock). - For 4k page size this loop will lock the fseg header too many - times. Since this code is not done while any other threads are - active, restart the MTR occasionally. 
*/ - mtr.commit(); - mtr.start(); - trx_sys_block= buf_dblwr_trx_sys_get(&mtr); - fseg_header= TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG + - trx_sys_block->page.frame; - } - prev_page_no= id.page_no(); } - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + - trx_sys_block->page.frame, TRX_SYS_DOUBLEWRITE_MAGIC_N); - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_MAGIC + - TRX_SYS_DOUBLEWRITE_REPEAT + trx_sys_block->page.frame, - TRX_SYS_DOUBLEWRITE_MAGIC_N); - - mtr.write<4>(*trx_sys_block, - TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + - trx_sys_block->page.frame, + ut_ad(init_mtr.is_empty()); + byte *const doublewrite= fseg_header + + (TRX_SYS_DOUBLEWRITE_MAGIC - TRX_SYS_DOUBLEWRITE_FSEG); + mtr.write<4>(*trx_sys_block, doublewrite, TRX_SYS_DOUBLEWRITE_MAGIC_N); + static_assert(TRX_SYS_DOUBLEWRITE_BLOCK1==TRX_SYS_DOUBLEWRITE_MAGIC + 4, ""); + mtr.write<4>(*trx_sys_block, doublewrite + 4, size); + static_assert(TRX_SYS_DOUBLEWRITE_BLOCK2==TRX_SYS_DOUBLEWRITE_MAGIC + 8, ""); + mtr.write<4>(*trx_sys_block, doublewrite + 8, size * 2); + static_assert(TRX_SYS_DOUBLEWRITE_REPEAT == 12, ""); + mtr.memcpy(*trx_sys_block, doublewrite + 12, doublewrite, 12); + static_assert(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED == + 24 + TRX_SYS_DOUBLEWRITE_MAGIC, ""); + mtr.write<4>(*trx_sys_block, doublewrite + 24, TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N); mtr.commit(); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 0fe60203901b8..b01c8bb696983 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1815,77 +1815,178 @@ static ulint buf_flush_LRU(ulint max_n) noexcept inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept { ut_ad(!srv_read_only_mode); + ut_ad(archive || next_checkpoint_lsn >= first_lsn); ut_ad(end_lsn >= next_checkpoint_lsn); ut_d(const lsn_t current_lsn{get_lsn()}); ut_ad(end_lsn <= current_lsn); ut_ad(end_lsn + 
SIZE_OF_FILE_CHECKPOINT <= current_lsn || srv_shutdown_state > SRV_SHUTDOWN_INITIATED); + ut_ad(this->end_lsn <= end_lsn); DBUG_PRINT("ib_log", ("checkpoint at " LSN_PF " written", next_checkpoint_lsn)); - auto n= next_checkpoint_no; - const size_t offset{(n & 1) ? CHECKPOINT_2 : CHECKPOINT_1}; + const bool first_checkpoint_in_new_archive= + archive && resize_log.is_opened() && end_lsn >= first_lsn; + + if (first_checkpoint_in_new_archive) + { +#ifdef HAVE_PMEM + if (UNIV_LIKELY_NULL(resize_buf)) + archived_mmap_switch_complete(); +#endif + byte *c= checkpoint_buf; + if (c) + memset_aligned<512>(c, 0, write_size); + else + { + c= buf; + ut_ad(!memcmp_aligned<512>(c, field_ref_zero, START_OFFSET)); + } + + next_checkpoint_no= uint16_t(8 * is_encrypted() + 1); + + if (is_encrypted()) + log_crypt_write_header(c); + } + + const auto n= next_checkpoint_no; + size_t offset; static_assert(CPU_LEVEL1_DCACHE_LINESIZE >= 64, "efficiency"); static_assert(CPU_LEVEL1_DCACHE_LINESIZE <= 4096, "compatibility"); - byte* c= my_assume_aligned - (is_mmap() ? buf + offset : checkpoint_buf); - memset_aligned(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); - mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); - mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); - mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60)); - - lsn_t resizing; + lsn_t resizing{resize_lsn.load(std::memory_order_relaxed)}; + byte *c; -#ifdef HAVE_PMEM - if (is_mmap()) + if (archive) { - ut_ad(!is_opened()); - resizing= resize_lsn.load(std::memory_order_relaxed); - - if (resizing > 1 && resizing <= next_checkpoint_lsn) + ut_ad(!resizing); + offset= n * 4; + ut_a(offset); + ut_a(offset < START_OFFSET); // FIXME: better guard for this + const lsn_t d= end_lsn - + (resize_log.is_opened() ? first_lsn : this->end_lsn); + ut_a(d <= lsn_t{~uint32_t{0}}); + + if (!d) + { + ut_a(next_checkpoint_no == (is_encrypted() ? 
8 : 1)); + ut_a(end_lsn == first_lsn); + } +#ifdef HAVE_PMEM + else if (is_mmap()) { - memcpy_aligned<64>(resize_buf + CHECKPOINT_1, c, 64); - header_write(resize_buf, resizing, is_encrypted()); - pmem_persist(resize_buf, resize_target); + c= buf + offset; + ut_ad(!memcmp(c, field_ref_zero, 4)); + mach_write_to_4(my_assume_aligned<4>(c), uint32_t(d)); + c= reinterpret_cast(uintptr_t(c) & ~63); + goto persist_checkpoint; + } +#endif + else + { + c= checkpoint_buf; + const size_t o{offset & (write_size - 1)}; + offset&= ~size_t(write_size - 1); + if (!o) + memset_aligned<512>(c, 0, write_size); + ut_ad(!memcmp(c + o, field_ref_zero, 4)); + mach_write_to_4(my_assume_aligned<4>(c + o), uint32_t(d)); + goto write_checkpoint; } - pmem_persist(c, 64); } else -#endif { - ut_ad(!is_mmap()); - ut_ad(!checkpoint_pending); - checkpoint_pending= true; - latch.wr_unlock(); - log_write_and_flush_prepare(); - resizing= resize_lsn.load(std::memory_order_relaxed); - ut_ad(ut_is_2pow(write_size)); - ut_ad(write_size >= 512); - ut_ad(write_size <= 4096); - log.write(offset, {c, write_size}); - if (resizing > 1 && resizing <= next_checkpoint_lsn) + offset= (n & 1) ? CHECKPOINT_2 : CHECKPOINT_1; + c= is_mmap() ? 
buf + offset : checkpoint_buf; + memset_aligned(c, 0, CPU_LEVEL1_DCACHE_LINESIZE); + mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); + mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); + mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60)); + +#ifdef HAVE_PMEM + if (is_mmap()) + { + ut_ad(!is_opened()); + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + memcpy_aligned<64>(resize_buf + CHECKPOINT_1, c, 64); + header_write(resize_buf, resizing, is_encrypted()); + pmem_persist(resize_buf, resize_target); + } + persist_checkpoint: + pmem_persist(c, 64); + } + else +#endif { - resize_log.write(CHECKPOINT_1, {c, write_size}); - byte *buf= static_cast(aligned_malloc(4096, 4096)); - memset_aligned<4096>(buf, 0, 4096); - header_write(buf, resizing, is_encrypted()); - resize_log.write(0, {buf, 4096}); - aligned_free(buf); + write_checkpoint: + ut_ad(!is_mmap()); + ut_ad(!checkpoint_pending); + checkpoint_pending= true; + latch.wr_unlock(); + log_write_and_flush_prepare(); + resizing= resize_lsn.load(std::memory_order_relaxed); + ut_ad(!resizing || !archive); + ut_ad(ut_is_2pow(write_size)); + ut_ad(write_size >= 512); + ut_ad(write_size <= 4096); + log.write(offset, {c, write_size}); + if (resizing > 1 && resizing <= next_checkpoint_lsn) + { + resize_log.write(CHECKPOINT_1, {c, write_size}); + byte *buf= static_cast(aligned_malloc(4096, 4096)); + memset_aligned<4096>(buf, 0, 4096); + header_write(buf, resizing, is_encrypted()); + resize_log.write(0, {buf, 4096}); + aligned_free(buf); + } + + if (!log_write_through) + ut_a(log.flush()); + latch.wr_lock(SRW_LOCK_CALL); + ut_ad(checkpoint_pending); + checkpoint_pending= false; + resizing= resize_lsn.load(std::memory_order_relaxed); } - if (!log_write_through) - ut_a(log.flush()); - latch.wr_lock(SRW_LOCK_CALL); - ut_ad(checkpoint_pending); - checkpoint_pending= false; - resizing= resize_lsn.load(std::memory_order_relaxed); + next_checkpoint_no++; } + ut_ad(!resizing || !archive); 
ut_ad(!checkpoint_pending); - next_checkpoint_no++; + this->end_lsn= end_lsn; const lsn_t checkpoint_lsn{next_checkpoint_lsn}; last_checkpoint_lsn= checkpoint_lsn; + if (!archive) + archived_lsn= checkpoint_lsn; + else if (first_checkpoint_in_new_archive) + { + /* Make the previous archived log file read-only */ +#ifdef _WIN32 + resize_log.close(); + SetFileAttributesA(get_archive_path().c_str(), + FILE_ATTRIBUTE_READONLY | FILE_ATTRIBUTE_ARCHIVE); +#else + struct stat st; + if (!fstat(resize_log.m_file, &st)) + st.st_mode&= 0444; + else + st.st_mode= 0444; + fchmod(resize_log.m_file, st.st_mode); + resize_log.close(); +#endif +#ifdef HAVE_PMEM + if (!is_mmap()) +#endif + { + /* Mimic archived_mmap_switch_complete() */ + first_lsn+= capacity(); + file_size= resize_target; + } + + ut_ad(current_lsn >= first_lsn); + ut_ad(current_lsn < first_lsn + capacity()); + } DBUG_PRINT("ib_log", ("checkpoint ended at " LSN_PF ", flushed to " LSN_PF, checkpoint_lsn, get_flushed_lsn())); @@ -1902,6 +2003,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept if (resizing > 1 && resizing <= checkpoint_lsn) { + ut_ad(!archive); ut_ad(is_mmap() == !resize_flush_buf); ut_ad(is_mmap() == !resize_log.is_opened()); @@ -1931,7 +2033,7 @@ inline void log_t::write_checkpoint(lsn_t end_lsn) noexcept ut_ad(!log.is_opened()); bool success; log.m_file= - os_file_create_func(get_log_file_path().c_str(), OS_FILE_OPEN, + os_file_create_func(get_circular_path().c_str(), OS_FILE_OPEN, OS_LOG_FILE, false, &success); ut_a(success); ut_a(log.is_opened()); @@ -1997,12 +2099,12 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) noexcept ut_ad(oldest_lsn <= end_lsn); ut_ad(end_lsn == log_sys.get_lsn()); - if (oldest_lsn == log_sys.last_checkpoint_lsn || - (oldest_lsn == end_lsn && - !log_sys.resize_in_progress() && - oldest_lsn == log_sys.last_checkpoint_lsn + - (log_sys.is_encrypted() - ? 
SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT))) + if (oldest_lsn == end_lsn && oldest_lsn != log_sys.get_first_lsn() && + (oldest_lsn == log_sys.last_checkpoint_lsn || + (!log_sys.resize_in_progress() && + oldest_lsn == log_sys.last_checkpoint_lsn + + (log_sys.is_encrypted() + ? SIZE_OF_FILE_CHECKPOINT + 8 : SIZE_OF_FILE_CHECKPOINT)))) { /* Do nothing, because nothing was logged (other than a FILE_CHECKPOINT record) since the previous checkpoint. */ @@ -2012,7 +2114,7 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) noexcept } ut_ad(!recv_no_log_write); - ut_ad(oldest_lsn > log_sys.last_checkpoint_lsn); + ut_ad(oldest_lsn >= log_sys.last_checkpoint_lsn); /* Repeat the FILE_MODIFY records after the checkpoint, in case some log records between the checkpoint and log_sys.lsn need them. Finally, write a FILE_CHECKPOINT record. Redo log apply expects to @@ -2029,7 +2131,8 @@ static bool log_checkpoint_low(lsn_t oldest_lsn, lsn_t end_lsn) noexcept log_sys.latch.wr_unlock(); log_write_up_to(flush_lsn, true); log_sys.latch.wr_lock(SRW_LOCK_CALL); - if (log_sys.last_checkpoint_lsn >= oldest_lsn) + if (log_sys.last_checkpoint_lsn >= oldest_lsn && + log_sys.last_checkpoint_lsn != log_sys.get_first_lsn()) goto do_nothing; ut_ad(log_sys.get_flushed_lsn() >= flush_lsn); @@ -2184,7 +2287,8 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) noexcept if (recv_recovery_is_on()) recv_sys.apply(true); - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;); + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", + if (!log_sys.archive) return;); Atomic_relaxed &limit= furious ? 
buf_flush_sync_lsn : buf_flush_async_lsn; diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index f5c0132a39f44..54b564a7020c1 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -950,6 +950,7 @@ static SHOW_VAR innodb_status_variables[]= { {"lsn_flushed", &export_vars.innodb_lsn_flushed, SHOW_ULONGLONG}, {"lsn_last_checkpoint", &export_vars.innodb_lsn_last_checkpoint, SHOW_ULONGLONG}, + {"lsn_archived", &log_sys.archived_lsn, SHOW_ULONGLONG}, {"master_thread_active_loops", &srv_main_active_loops, SHOW_SIZE_T}, {"master_thread_idle_loops", &srv_main_idle_loops, SHOW_SIZE_T}, {"max_trx_id", &export_vars.innodb_max_trx_id, SHOW_ULONGLONG}, @@ -3694,6 +3695,9 @@ compression_algorithm_is_not_loaded(ulong compression_algorithm, myf flags) return 1; } +/** Initial value of innodb_lsn_archived */ +static uint64_t innodb_log_archive_start; + /** Initialize, validate and normalize the InnoDB startup parameters. 
@return failure code @retval 0 on success @@ -3982,6 +3986,25 @@ static int innodb_init_params() skip_buffering_tweak: #endif + log_sys.archived_lsn= innodb_log_archive_start; + + if (recv_sys.recovery_start && + log_sys.archived_lsn > recv_sys.recovery_start) + { + sql_print_error("InnoDB: innodb_log_archive_start=" LSN_PF + " is after innodb_log_recovery_start=" LSN_PF, + log_sys.archived_lsn, recv_sys.recovery_start); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } + + if (recv_sys.rpo && recv_sys.recovery_start > recv_sys.rpo) + { + sql_print_error("InnoDB: innodb_log_recovery_start=" LSN_PF + " is after innodb_log_recovery_target=" LSN_PF, + recv_sys.recovery_start, recv_sys.rpo); + DBUG_RETURN(HA_ERR_INITIALIZATION); + } + if (!tpool::supports_native_aio()) srv_use_native_aio= FALSE; @@ -19047,7 +19070,7 @@ static MYSQL_SYSVAR_ENUM(flush_method, innodb_flush_method, static MYSQL_SYSVAR_STR(log_group_home_dir, srv_log_group_home_dir, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, - "Path to ib_logfile0", NULL, NULL, NULL); + "Path to ib_logfile0 or ib_*.log", NULL, NULL, NULL); static MYSQL_SYSVAR_DOUBLE(max_dirty_pages_pct, srv_max_buf_pool_modified_pct, PLUGIN_VAR_RQCMDARG, @@ -19411,7 +19434,7 @@ static MYSQL_SYSVAR_UINT(log_buffer_size, log_sys.buf_size, NULL, NULL, 16U << 20, 2U << 20, log_sys.buf_size_max, 4096); static constexpr const char *innodb_log_file_mmap_description= - "Whether ib_logfile0" + "Whether the log" " resides in persistent memory (when supported) or" " should initially be memory-mapped"; static MYSQL_SYSVAR_BOOL(log_file_mmap, log_sys.log_mmap, @@ -19428,7 +19451,7 @@ static MYSQL_SYSVAR_BOOL(log_file_buffering, log_sys.log_buffered, static MYSQL_SYSVAR_BOOL(log_file_write_through, log_sys.log_write_through, PLUGIN_VAR_OPCMDARG, - "Whether each write to ib_logfile0 is write through", + "Whether each write to the log is write through", nullptr, innodb_log_file_write_through_update, FALSE); static MYSQL_SYSVAR_BOOL(data_file_buffering, 
fil_system.buffered, @@ -19441,9 +19464,35 @@ static MYSQL_SYSVAR_BOOL(data_file_write_through, fil_system.write_through, "Whether each write to data files writes through", nullptr, innodb_data_file_write_through_update, FALSE); +static void innodb_log_archive_update(THD *, st_mysql_sys_var*, + void *, const void *save) noexcept +{ + log_sys.set_archive(*static_cast(save)); +} + +static MYSQL_SYSVAR_BOOL(log_archive, log_sys.archive, + PLUGIN_VAR_OPCMDARG, + "Whether log archiving is desired", + nullptr, innodb_log_archive_update, FALSE); + +static MYSQL_SYSVAR_UINT64_T(log_archive_start, innodb_log_archive_start, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "initial value of innodb_lsn_archived; 0=auto-detect", + nullptr, nullptr, 0, 0, std::numeric_limits::max(), 0); + +static MYSQL_SYSVAR_UINT64_T(log_recovery_start, recv_sys.recovery_start, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "checkpoint LSN to start recovery from (0=automatic)", + nullptr, nullptr, 0, 0, std::numeric_limits::max(), 0); + +static MYSQL_SYSVAR_UINT64_T(log_recovery_target, recv_sys.rpo, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "recovery point objective (end LSN; 0=unlimited)", + nullptr, nullptr, 0, 0, std::numeric_limits::max(), 0); + static MYSQL_SYSVAR_ULONGLONG(log_file_size, srv_log_file_size, PLUGIN_VAR_RQCMDARG, - "Redo log size in bytes.", + "Desired log file size in bytes", nullptr, innodb_log_file_size_update, 96 << 20, 4 << 20, std::numeric_limits::max(), 4096); @@ -19875,6 +19924,10 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(log_file_write_through), MYSQL_SYSVAR(data_file_buffering), MYSQL_SYSVAR(data_file_write_through), + MYSQL_SYSVAR(log_archive), + MYSQL_SYSVAR(log_archive_start), + MYSQL_SYSVAR(log_recovery_start), + MYSQL_SYSVAR(log_recovery_target), MYSQL_SYSVAR(log_file_size), MYSQL_SYSVAR(log_write_ahead_size), MYSQL_SYSVAR(log_spin_wait_delay), diff --git a/storage/innobase/include/log0log.h 
b/storage/innobase/include/log0log.h index e80011a9c4c50..3c32d5dd03e61 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -35,22 +35,6 @@ Created 12/9/1995 Heikki Tuuri using st_::span; -static const char LOG_FILE_NAME_PREFIX[] = "ib_logfile"; -static const char LOG_FILE_NAME[] = "ib_logfile0"; - -/** Composes full path for a redo log file -@param[in] filename name of the redo log file -@return path with log file name*/ -std::string get_log_file_path(const char *filename= LOG_FILE_NAME); - -/** Delete log file. -@param[in] suffix suffix of the file name */ -static inline void delete_log_file(const char* suffix) -{ - auto path = get_log_file_path(LOG_FILE_NAME_PREFIX).append(suffix); - os_file_delete_if_exists_func(path.c_str(), nullptr); -} - struct completion_callback; /** Ensure that the log has been written to the log file up to a given @@ -127,9 +111,12 @@ class log_file_t bool flush() const noexcept { return os_file_flush(m_file); } }; +struct recv_warp; + /** Redo log buffer */ struct log_t { + friend recv_warp; /** The maximum buf_size */ static constexpr unsigned buf_size_max= os_file_request_size_max; @@ -236,7 +223,7 @@ struct log_t /** whether a checkpoint is pending; protected by latch.wr_lock() */ Atomic_relaxed checkpoint_pending; /** next checkpoint number (protected by latch.wr_lock()) */ - byte next_checkpoint_no; + uint16_t next_checkpoint_no; /** Log sequence number when a log file overwrite (broken crash recovery) was noticed. Protected by latch.wr_lock(). 
*/ lsn_t overwrite_warned; @@ -247,15 +234,23 @@ struct log_t lsn_t (*writer)() noexcept; /** next checkpoint LSN (protected by latch.wr_lock()) */ lsn_t next_checkpoint_lsn; + /** start of archived log, or 0 (protected by latch.wr_lock()) */ + lsn_t archived_lsn; /** Log file */ log_file_t log; private: - /** Log file being constructed during resizing; protected by latch */ + /** Log file being constructed during resizing, + or the previous archived log file; protected by latch */ log_file_t resize_log; - /** size of resize_log; protected by latch */ + /** size of resize_log, or the requested innodb_log_file_size + of the next file created if archive==TRUE; protected by latch */ lsn_t resize_target; - /** Buffer for writing to resize_log; @see buf */ + /** Buffer for writing to resize_log; @see buf + Also a spare buffer between archived_mmap_switch_prepare() + and archived_mmap_switch_complete(), + or archived_mmap_switch_recovery_prepare() + and archived_mmap_switch_recovery_complete().
*/ byte *resize_buf; /** Buffer for writing to resize_log; @see flush_buf */ byte *resize_flush_buf; @@ -263,13 +258,17 @@ struct log_t /** log sequence number when log resizing was initiated; 0 if the log is not being resized, 1 if resize_start() is in progress */ std::atomic resize_lsn; - /** the log sequence number at the start of the log file */ + /** the log sequence number at the start of the current log file */ lsn_t first_lsn; + /** the log sequence number when the latest checkpoint was initiated */ + lsn_t end_lsn; public: /** current innodb_log_write_ahead_size */ uint write_size; /** format of the redo log: e.g., FORMAT_10_8 */ uint32_t format; + /** the current value of innodb_log_archive; protected by latch.wr_lock() */ + my_bool archive; /** whether the memory-mapped interface is enabled for the log */ my_bool log_mmap; /** the default value of log_mmap */ @@ -333,7 +332,7 @@ struct log_t bool is_mmap() const noexcept { return !flush_buf; } /** @return whether a handle to the log is open; - is_mmap() && !is_opened() holds for PMEM */ + is_mmap() && (is_opened() == archive) holds for PMEM */ bool is_opened() const noexcept { return log.is_opened(); } /** @return LSN at which log resizing was started and is still in progress @@ -362,14 +361,33 @@ struct log_t { return thd == resize_initiator; } /** Replicate a write to the log. + @tparam mmap whether the memory-mapped interface is enabled @param lsn start LSN @param end end of the mini-transaction @param len length of the mini-transaction @param seq offset of the sequence bit from the end */ + template inline void resize_write(lsn_t lsn, const byte *end, - size_t len, size_t seq) noexcept; + size_t len, size_t seq) noexcept + { + if (UNIV_LIKELY_NULL(resize_buf)) + resize_write_low(lsn, end, len, seq); + } + + /** SET GLOBAL innodb_log_archive + @param archive the new value of innodb_log_archive */ + void set_archive(my_bool archive) noexcept; private: + /** Replicate a write to the log. 
+ @tparam mmap whether the memory-mapped interface is enabled + @param lsn start LSN + @param end end of the mini-transaction + @param len length of the mini-transaction + @param seq offset of the sequence bit from the end */ + template + ATTRIBUTE_COLD void resize_write_low(lsn_t lsn, const byte *end, + size_t len, size_t seq) noexcept; /** Write resize_buf to resize_log. @param b resize_buf or resize_flush_buf @param length the used length of b */ @@ -380,23 +398,19 @@ struct log_t @return whether an error occurred */ static bool resize_rename() noexcept; - /** @return pointer for writing to resize_buf - @retval nullptr if no is_mmap() based resizing is active */ - inline byte *resize_buf_begin(lsn_t lsn) const noexcept; - /** @return end of resize_buf */ - inline const byte *resize_buf_end() const noexcept - { return resize_buf + resize_target; } - /** Initialise the redo log subsystem. */ void create() noexcept; /** Attach a log file. @return whether the memory allocation succeeded */ - bool attach(log_file_t file, os_offset_t size) noexcept; + bool attach(log_file_t file, os_offset_t size, bool read_only) noexcept; /** Disable memory-mapped access (update log_mmap) */ void clear_mmap() noexcept; void close_file(bool really_close= true) noexcept; + /** Stash a log archive file in multi-file recovery */ + inline void stash_archive_file() noexcept; + #if defined __linux__ || defined _WIN32 /** Try to enable or disable file system caching (update log_buffered) */ void set_buffered(bool buffered) noexcept; @@ -413,6 +427,10 @@ struct log_t @param encrypted whether the log is encrypted */ static void header_write(byte *buf, lsn_t lsn, bool encrypted) noexcept; + /** Rewrite the log file header in set_archive() + @param archive the new value of innodb_log_archive */ + void header_rewrite(my_bool archive) noexcept; + /** @return an estimate of get_lsn(), using acquire-release ordering with write_buf() or persist(); an upper bound if said functions have updated only 
one of the fields, @@ -434,6 +452,13 @@ struct log_t (write_lsn_offset & (WRITE_BACKOFF - 1)); } + /** @return whether a back-off in a log write is in progress */ + bool is_backoff() const noexcept + { + ut_ad(latch_have_wr()); + return write_lsn_offset & WRITE_BACKOFF; + } + lsn_t get_flushed_lsn(std::memory_order order= std::memory_order_acquire) const noexcept { return flushed_to_disk_lsn.load(order); } @@ -455,7 +480,34 @@ struct log_t /** Persist the log. @param lsn desired new value of flushed_to_disk_lsn */ void persist(lsn_t lsn) noexcept; + /** @return the overflow buffer when ARCHIVED_MMAP is wrapping around */ + byte *get_archived_mmap_switch() const noexcept + { + ut_ad(archived_mmap_switch()); + return resize_buf + START_OFFSET; + } #endif + /** @return whether archived_mmap_switch_complete() needs to be called */ + bool archived_mmap_switch() const noexcept + { + ut_ad(latch_have_any()); + return UNIV_UNLIKELY(archive && resize_buf); + } + /** Create a new log file when the current one will fill up. + @param buf log records to append + @param length size of the log records, in bytes + @param offset log file offset */ + ATTRIBUTE_COLD void archive_new_write(const byte *buf, size_t length, + lsn_t offset) noexcept; + + /** Ensure that innodb_log_archive=ON will default to the current + innodb_log_file_size if no size has been specified. 
*/ + void archive_set_size() noexcept + { + ut_ad(!resize_in_progress()); + if (!resize_target) + resize_target= file_size; + } bool check_for_checkpoint() const { @@ -489,13 +541,65 @@ struct log_t @param late whether the WRITE_BACKOFF flag had already been set @param ex whether log_sys.latch is exclusively locked */ ATTRIBUTE_COLD void append_prepare_wait(bool late, bool ex) noexcept; +#ifdef HAVE_PMEM + /** Wait in append_prepare() for buffer to become available + @param late whether the WRITE_BACKOFF flag had already been set + @param ex whether log_sys.latch is exclusively locked */ + ATTRIBUTE_COLD void archived_mmap_switch_prepare(bool late, bool ex) + noexcept; +#endif public: + /** Attempt to finish archived_mmap_switch_prepare(). + @return the current LSN in the new file + @retval 0 if no switch took place */ + ATTRIBUTE_COLD lsn_t archived_mmap_switch_complete() noexcept; + + /** Prepare for multi-file memory-mapped log recovery. */ + ATTRIBUTE_COLD void archived_mmap_switch_recovery_prepare() noexcept; + /** Finish archived_mmap_switch_recovery_prepare(). */ + ATTRIBUTE_COLD void archived_mmap_switch_recovery_complete() noexcept; + + /** How to write log */ + enum write { + /** normal writing !log_sys.is_mmap() */ + WRITE_NORMAL, + /** circular memory-mapped writing when log_sys.is_mmap() */ + CIRCULAR_MMAP, + /** memory-mapped log for log_sys.archive */ + ARCHIVED_MMAP + }; + + /** Get a name of a circular log file. + @param i log file number (0 to 101) + @return the path name of the log file */ + ATTRIBUTE_COLD static std::string get_circular_path(size_t i= 0); + + /** @return the name of the current log file */ + ATTRIBUTE_COLD std::string get_path() const; + + /** Append the archive log file base name to a string. 
+ @param path directory name and separator + @param lsn first LSN stored in the file + @return path with the base file name appended */ + static ATTRIBUTE_COLD std::string &append_archive_name(std::string &path, + lsn_t lsn); + + /** Generate an archive log file name. + @param lsn first LSN stored in the file + @return archive log file name */ + ATTRIBUTE_COLD std::string get_archive_path(lsn_t lsn) const; + /** @return the current archive log file name */ + std::string get_archive_path() const { return get_archive_path(first_lsn); } + + /** @return the next archive log file name */ + ATTRIBUTE_COLD std::string get_next_archive_path() const; + /** Reserve space in the log buffer for appending data. - @tparam mmap log_sys.is_mmap() + @tparam mode how to write log @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ - template + template std::pair append_prepare(size_t size, bool ex) noexcept; /** Append a string of bytes to the redo log. @@ -527,6 +631,18 @@ struct log_t /** @return the first LSN of the log file */ lsn_t get_first_lsn() const noexcept { return first_lsn; } + /** Set the recovered checkpoint. + @param lsn log sequence number of the checkpoint + @param end_lsn LSN passed to write_checkpoint() + @param number checkpoint number */ + void set_recovered_checkpoint(lsn_t lsn, lsn_t end_lsn, uint16_t number) + noexcept + { + next_checkpoint_lsn= lsn; + this->end_lsn= end_lsn; + next_checkpoint_no= number; + } + /** Determine the sequence bit at a log sequence number */ byte get_sequence_bit(lsn_t lsn) const noexcept { @@ -547,6 +663,9 @@ struct log_t @param end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ inline void write_checkpoint(lsn_t end_lsn) noexcept; + /** Wait for write_checkpoint() if necessary. 
*/ + ATTRIBUTE_COLD void checkpoint_margin() noexcept; + /** Variations of write_buf() */ enum resizing_and_latch { /** skip latch.wr_unlock(); log resizing may or may not be in progress */ diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 457218656f439..08b56cba57e5b 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -44,11 +44,6 @@ ATTRIBUTE_COLD MY_ATTRIBUTE((nonnull, warn_unused_result)) @return whether the page was recovered correctly */ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage); -/** Read the latest checkpoint information from log file -and store it in log_sys.next_checkpoint and recv_sys.file_checkpoint -@return error code or DB_SUCCESS */ -dberr_t recv_recovery_read_checkpoint(); - /** Start recovering from a redo log checkpoint. of first system tablespace page @return error code or DB_SUCCESS */ @@ -249,6 +244,11 @@ struct recv_sys_t lsn_t scanned_lsn; /** log sequence number at the end of the FILE_CHECKPOINT record, or 0 */ lsn_t file_checkpoint; + /** recovery start checkpoint */ + lsn_t recovery_start; + /** recovery point objective (a limit for scanned_lsn) */ + lsn_t rpo; + /** the time when progress was last reported */ time_t progress_time; @@ -406,6 +406,15 @@ struct recv_sys_t @return error code or DB_SUCCESS */ dberr_t find_checkpoint(); +private: + /** Find a checkpoint in an innodb_log_archive=ON file. + @param first_lsn the first LSN of the file + @param silent whether to silence error reporting + @return error code + @retval DB_SUCCESS if a suitable checkpoint was found */ + dberr_t find_checkpoint_archived(lsn_t first_lsn, bool silent); +public: + /** Register a redo log snippet for a page. 
@param it page iterator @param l redo log snippet diff --git a/storage/innobase/include/mtr0mtr.h b/storage/innobase/include/mtr0mtr.h index 82756ee43cf0d..633d7e6cbda4a 100644 --- a/storage/innobase/include/mtr0mtr.h +++ b/storage/innobase/include/mtr0mtr.h @@ -63,8 +63,11 @@ struct mtr_memo_slot_t void release() const; }; +class buf_dblwr_t; + /** Mini-transaction handle and buffer */ struct mtr_t { + friend buf_dblwr_t; mtr_t(trx_t *trx/*= nullptr*/); ~mtr_t(); @@ -688,13 +691,15 @@ struct mtr_t { ATTRIBUTE_NOINLINE size_t crc32c() noexcept; /** Commit the mini-transaction log. - @tparam pmem log_sys.is_mmap() + @tparam mmap log_sys.is_mmap() @param mtr mini-transaction @param lsns {start_lsn,flush_ahead_lsn} */ - template + template static void commit_log(mtr_t *mtr, std::pair lsns) noexcept; - /** Release log_sys.latch. */ + /** Release log_sys.latch. + @tparam mmap log_sys.is_mmap() */ + template void commit_log_release() noexcept; /** Append the redo log records to the redo log buffer. @@ -702,11 +707,11 @@ struct mtr_t { std::pair do_write() noexcept; /** Append the redo log records to the redo log buffer. - @tparam mmap log_sys.is_mmap() + @tparam how how to write @param mtr mini-transaction @param len number of bytes to write @return {start_lsn,flush_ahead_lsn} */ - template static + template static std::pair finish_writer(mtr_t *mtr, size_t len); /** The applicable variant of commit_log() */ diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index eb5fc3fe721d5..b5feb8348fc03 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -109,6 +109,8 @@ void log_t::create() noexcept #endif last_checkpoint_lsn= FIRST_LSN; + first_lsn= FIRST_LSN; + end_lsn= FIRST_LSN; log_capacity= 0; max_modified_age_async= 0; max_checkpoint_age= 0; @@ -191,15 +193,17 @@ void log_file_t::write(os_offset_t offset, span buf) noexcept # endif /** Attempt to memory map a file. 
-@param file log file handle -@param size file size +@param file log file handle +@param size file size +@param read_only whether the file is read-only @return pointer to memory mapping @retval MAP_FAILED if the memory cannot be mapped */ static void *log_mmap(os_file_t file, # ifdef HAVE_PMEM bool &is_pmem, /*!< whether the file is on pmem */ # endif - os_offset_t size) + os_offset_t size, + bool read_only) { #if SIZEOF_SIZE_T < 8 if (size != os_offset_t(size_t(size))) @@ -228,8 +232,8 @@ static void *log_mmap(os_file_t file, The mapping will always be read-only if innodb_read_only=ON or if mariadb-backup is running in any other mode than --prepare --export. */ - const bool read_only= - srv_read_only_mode || srv_operation >= SRV_OPERATION_BACKUP; + ut_ad(read_only || + (!srv_read_only_mode && srv_operation < SRV_OPERATION_BACKUP)); # ifdef _WIN32 void *ptr= MAP_FAILED; @@ -320,29 +324,36 @@ ATTRIBUTE_COLD static void log_file_message() noexcept static inline void log_file_message() noexcept {} #endif -bool log_t::attach(log_file_t file, os_offset_t size) noexcept +bool log_t::attach(log_file_t file, os_offset_t size, bool read_only) noexcept { - log= file; + ut_ad(!log.is_opened()); + ut_ad(archive || !resize_log.is_opened()); ut_ad(!size || size >= START_OFFSET + SIZE_OF_FILE_CHECKPOINT); - file_size= size; - ut_ad(!buf); ut_ad(!flush_buf); ut_ad(!writer); + ut_ad(archive || !resize_buf); + ut_ad(!resize_flush_buf); + + file_size= size; + if (size) { # ifdef HAVE_PMEM bool is_pmem; - void *ptr= ::log_mmap(log.m_file, is_pmem, size); + void *ptr= ::log_mmap(file.m_file, is_pmem, size, read_only); # else - void *ptr= ::log_mmap(log.m_file, size); + void *ptr= ::log_mmap(file.m_file, size, read_only); # endif if (ptr != MAP_FAILED) { # ifdef HAVE_PMEM if (is_pmem) { - log.close(); + if (archive) + log= file; + else + file.close(); log_buffered= false; log_maybe_unbuffered= true; IF_WIN(,mprotect(ptr, size_t(size), PROT_READ)); @@ -357,6 +368,10 @@ bool 
log_t::attach(log_file_t file, os_offset_t size) noexcept goto func_exit; } } + else + ut_ad(!archive); + + log= file; log_mmap= false; buf= static_cast(ut_malloc_dontdump(buf_size, PSI_INSTRUMENT_ME)); if (!buf) @@ -431,25 +446,31 @@ void log_t::create(lsn_t lsn) noexcept ut_ad(is_latest()); ut_ad(this == &log_sys); + next_checkpoint_no= archive ? (is_encrypted() ? 8 : 1) : 0; write_lsn_offset= 0; base_lsn.store(lsn, std::memory_order_relaxed); flushed_to_disk_lsn.store(lsn, std::memory_order_relaxed); first_lsn= lsn; + end_lsn= lsn; write_lsn= lsn; + if (!archived_lsn) + archived_lsn= lsn; - last_checkpoint_lsn= 0; + last_checkpoint_lsn= lsn; DBUG_PRINT("ib_log", ("write header " LSN_PF, lsn)); #ifdef HAVE_PMEM if (is_mmap()) { - ut_ad(!is_opened()); + ut_ad(is_opened() == archive); mprotect(buf, size_t(file_size), PROT_READ | PROT_WRITE); + buf_size= unsigned(std::min(capacity(), buf_size_max)); + if (archive) + goto archive_header; memset_aligned<4096>(buf, 0, 4096); - log_sys.header_write(buf, lsn, is_encrypted()); + header_write(buf, lsn, is_encrypted()); pmem_persist(buf, 512); - buf_size= unsigned(std::min(capacity(), buf_size_max)); } else #endif @@ -457,9 +478,18 @@ void log_t::create(lsn_t lsn) noexcept ut_ad(!is_mmap()); memset_aligned<4096>(flush_buf, 0, buf_size); memset_aligned<4096>(buf, 0, buf_size); - log_sys.header_write(buf, lsn, is_encrypted()); - log.write(0, {buf, 4096}); - memset_aligned<512>(buf, 0, 512); + if (!archive) + { + header_write(buf, lsn, is_encrypted()); + log.write(0, {buf, 4096}); + memset_aligned<512>(buf, 0, 512); + } + else +#ifdef HAVE_PMEM + archive_header: +#endif + if (is_encrypted()) + log_crypt_write_header(buf); } } @@ -559,10 +589,9 @@ void log_t::set_buffered(bool buffered) noexcept { if (const dberr_t err= log.close()) log_close_failed(err); - std::string path{get_log_file_path()}; log_buffered= buffered; bool success; - log.m_file= os_file_create_func(path.c_str(), + log.m_file= 
os_file_create_func(get_path().c_str(), OS_FILE_OPEN, OS_LOG_FILE, false, &success); ut_a(log.m_file != OS_FILE_CLOSED); @@ -582,14 +611,13 @@ void log_t::set_write_through(bool write_through) bool(log_write_through) != write_through) { os_file_close_func(log.m_file); - log.m_file= OS_FILE_CLOSED; - std::string path{get_log_file_path()}; + log= OS_FILE_CLOSED; log_write_through= write_through; bool success; - log.m_file= os_file_create_func(path.c_str(), + log.m_file= os_file_create_func(get_path().c_str(), OS_FILE_OPEN, OS_LOG_FILE, false, &success); - ut_a(log.m_file != OS_FILE_CLOSED); + ut_a(log.is_opened()); sql_print_information(log_write_through ? "InnoDB: Log writes write through" : "InnoDB: Log writes may be cached"); @@ -597,6 +625,211 @@ void log_t::set_write_through(bool write_through) log_resize_release(); } +/** Rewrite the log file header in set_archive() +@param archive the new value of innodb_log_archive */ +void log_t::header_rewrite(my_bool archive) noexcept +{ + ut_ad(!resize_buf); + ut_ad(this->archive == !archive); + + /* We will rewrite the log file header while the file + name is not ib_logfile0. That is, the archived log file + recovery will accept both the circular and the archived + format for the last file. */ + + byte* c= checkpoint_buf; + ut_ad(end_lsn >= first_lsn); + ut_ad(!archive || end_lsn <= first_lsn + ~0U); + ut_ad(format == (is_encrypted() ? 
FORMAT_ENC_11 : FORMAT_10_8)); +#ifdef HAVE_PMEM + if (!c) + { + ut_ad(is_mmap()); + if (!archive) + { + memset_aligned<512>(buf + 512, 0, START_OFFSET - 512); + c= buf + CHECKPOINT_1; + mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); + mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); + mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60)); + pmem_persist(buf + 512, START_OFFSET - 512); + header_write(buf, first_lsn, is_encrypted()); + memset_aligned<512>(buf + 512, 0, CHECKPOINT_1 - 512); + pmem_persist(buf, CHECKPOINT_1); + } + else + { + next_checkpoint_no= uint16_t(8 * is_encrypted() + 2); + if (!is_encrypted()) + { + mach_write_to_8(buf, uint32_t(end_lsn - first_lsn)); + memset_aligned<8>(buf + 8, 0, 64 - 8); + } + else + { + log_crypt_write_header(buf); + mach_write_to_4(buf + 32, uint32_t(end_lsn - first_lsn)); + memset_aligned<4>(buf + 36, 0, 64 - 36); + } + pmem_persist(buf, 64); + memset_aligned<64>(buf + 64, 0, START_OFFSET - 64); + pmem_persist(buf, START_OFFSET); + } + return; + } +#endif + memset_aligned<512>(c, 0, write_size); + + if (!archive) + { + mach_write_to_8(my_assume_aligned<8>(c), next_checkpoint_lsn); + mach_write_to_8(my_assume_aligned<8>(c + 8), end_lsn); + mach_write_to_4(my_assume_aligned<4>(c + 60), my_crc32c(0, c, 60)); + log.write(CHECKPOINT_1, {c, write_size}); + os_file_flush(log.m_file); + memset_aligned<512>(c, 0, write_size); + for (size_t offset= CHECKPOINT_1; (offset+= write_size) < START_OFFSET;) + log.write(offset, {c, write_size}); + header_write(c, first_lsn, is_encrypted()); + if (write_size > 512) + memset_aligned<512>(c + 512, 0, write_size - 512); + log.write(0, {c, write_size}); + os_file_flush(log.m_file); + memset_aligned<512>(c, 0, write_size); + for (size_t offset= 0; (offset+= write_size) < CHECKPOINT_1;) + log.write(offset, {c, write_size}); + } + else + { + next_checkpoint_no= uint16_t(8 * is_encrypted() + 2); + if (!is_encrypted()) + mach_write_to_8(c, uint32_t(end_lsn - 
first_lsn)); + else + { + log_crypt_write_header(c); + mach_write_to_4(c + 32, uint32_t(end_lsn - first_lsn)); + } + log.write(0, {c, write_size}); + os_file_flush(log.m_file); + for (size_t offset= 0; (offset+= write_size) < START_OFFSET;) + log.write(offset, {field_ref_zero, write_size}); + } + + os_file_flush(log.m_file); +} + +/** SET GLOBAL innodb_log_archive +@param archive the new value of innodb_log_archive */ +void log_t::set_archive(my_bool archive) noexcept +{ + for (;;) + { + IF_WIN(log_resize_acquire(), latch.wr_lock(SRW_LOCK_CALL)); + if (resize_in_progress()) + { + my_printf_error(ER_WRONG_USAGE, + "SET GLOBAL innodb_log_file_size is in progress", + MYF(0)); + break; + } + if (archive == this->archive) + break; +#ifdef HAVE_PMEM + if (is_mmap()) + { + ut_ad(this->archive == log.is_opened()); + if (is_backoff()) + /* Prevent a race condition with append_prepare() */ + goto retry; + if (archive); + else if (resize_buf) + /* Wait for a call to archived_mmap_switch_complete() */ + goto retry; + else + log.close(); + } +#endif + else if (checkpoint_pending) + { + /* Prevent a race condition with write_checkpoint() */ +#ifdef HAVE_PMEM + retry: +#endif + IF_WIN(log_resize_release(), latch.wr_unlock()); + continue; + } + + ut_ad(!resize_buf); + ut_ad(!resize_log.is_opened()); // FIXME: wait for checkpoint? 
+ + const lsn_t old_first_lsn{first_lsn}; + if (archive) + first_lsn+= (end_lsn - old_first_lsn) / capacity() * capacity(); + std::string normal_name{get_circular_path()}; + std::string arch_name{get_archive_path()}; + + const char *old_name= normal_name.c_str(); + const char *new_name= arch_name.c_str(); + if (!archive) + { + std::swap(old_name, new_name); + header_rewrite(archive); + } +#if defined HAVE_PMEM && !defined _WIN32 + else if (is_mmap()) + { + /* Open the file so that write_checkpoint() + will be able to flag it read-only */ + bool success; + log.m_file= + os_file_create_func(old_name, OS_FILE_OPEN, OS_LOG_FILE, + false, &success); + if (!log.is_opened()) + { + my_error(ER_ERROR_ON_READ, MYF(0), old_name, errno); + break; + } + } +#endif + +#ifdef _WIN32 + /* On Microsoft Windows, there must be no open file handles to a + file that is being renamed. */ + if (const dberr_t err= log.close()) + log_close_failed(err); +#endif + int fail= my_rename(old_name, new_name, MY_SYNC_DIR); +#ifdef _WIN32 + { + bool success; + log.m_file= os_file_create_func(fail ? old_name : new_name, + OS_FILE_OPEN, OS_LOG_FILE, + false, &success); + ut_a(log.m_file != OS_FILE_CLOSED); + } +#endif + if (fail) + { + my_error(ER_ERROR_ON_RENAME, MYF(0), old_name, new_name, my_errno); + first_lsn= old_first_lsn; + break; + } + + if (archive) + { + header_rewrite(archive); + + archived_lsn= next_checkpoint_lsn; + archive_set_size(); + } + this->archive= archive; + mtr_t::finisher_update(); + break; + } + + IF_WIN(log_resize_release(), latch.wr_unlock()); +} + /** Start resizing the log and release the exclusive latch. 
@param size requested new file_size @param thd the current thread identifier @@ -617,6 +850,14 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size, void *thd) status= RESIZE_NO_CHANGE; else if (resize_in_progress()) status= RESIZE_IN_PROGRESS; + else if (archive) + { + status= RESIZE_NO_CHANGE; + /* When the current log becomes full and a new archivable log file + is being created, it will be of this size. At that point we will assign + file_size= resize_target, resize_target= 0; */ + resize_target= size; + } else { lsn_t start_lsn; @@ -625,7 +866,7 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size, void *thd) ut_ad(!resize_buf); ut_ad(!resize_flush_buf); ut_ad(!resize_initiator); - std::string path{get_log_file_path("ib_logfile101")}; + const std::string path{get_circular_path(101)}; bool success; resize_initiator= thd; resize_lsn.store(1, std::memory_order_relaxed); @@ -646,7 +887,7 @@ log_t::resize_start_status log_t::resize_start(os_offset_t size, void *thd) else if (is_mmap()) { bool is_pmem{false}; - ptr= ::log_mmap(resize_log.m_file, is_pmem, size); + ptr= ::log_mmap(resize_log.m_file, is_pmem, size, false); if (ptr == MAP_FAILED) goto alloc_fail; @@ -749,8 +990,8 @@ void log_t::resize_abort(void *thd) noexcept resize_target= 0; resize_lsn.store(0, std::memory_order_relaxed); resize_initiator= nullptr; - std::string path{get_log_file_path("ib_logfile101")}; - IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + IF_WIN(DeleteFile(get_circular_path(101).c_str()), + unlink(get_circular_path(101).c_str())); writer_update(false); } @@ -758,10 +999,13 @@ void log_t::resize_abort(void *thd) noexcept } /** Write an aligned buffer to ib_logfile0. 
-@param buf buffer to be written -@param length length of data to be written -@param offset log file offset */ -static void log_write_buf(const byte *buf, size_t length, lsn_t offset) +@param max_length the maximum length that can be written to the file +@param buf buffer to be written +@param length length of data to be written +@param offset log file offset */ +static void log_write_buf(lsn_t max_length, + const byte *buf, size_t length, lsn_t offset) + noexcept { ut_ad(write_lock.is_owner()); ut_ad(!recv_no_log_write); @@ -770,21 +1014,99 @@ static void log_write_buf(const byte *buf, size_t length, lsn_t offset) ut_ad(!(length & block_size_1)); ut_ad(!(size_t(buf) & block_size_1)); ut_ad(length); + ut_ad(max_length == log_sys.file_size - offset); - const lsn_t maximum_write_length{log_sys.file_size - offset}; - ut_ad(maximum_write_length <= log_sys.file_size - log_sys.START_OFFSET); - - if (UNIV_UNLIKELY(length > maximum_write_length)) + if (UNIV_UNLIKELY(length > max_length)) { - log_sys.log.write(offset, {buf, size_t(maximum_write_length)}); - length-= size_t(maximum_write_length); - buf+= size_t(maximum_write_length); + ut_ad(!log_sys.archive); + log_sys.log.write(offset, {buf, size_t(max_length)}); + length-= size_t(max_length); + buf+= size_t(max_length); ut_ad(log_sys.START_OFFSET + length < offset); offset= log_sys.START_OFFSET; } log_sys.log.write(offset, {buf, length}); } +ATTRIBUTE_COLD +std::string &log_t::append_archive_name(std::string &path, lsn_t lsn) +{ + path.append("ib_"); + for (int i= 16; i--; lsn<<= 4) + path.push_back("0123456789abcdef"[lsn >> 60]); + path.append(".log"); + return path; +} + +ATTRIBUTE_COLD std::string log_t::get_archive_path(lsn_t lsn) const +{ + size_t size= strlen(srv_log_group_home_dir); + retry: + switch (srv_log_group_home_dir[size - 1]) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + if (size <= 1) + break; + size--; + goto retry; + } + if (size == 1 && *srv_log_group_home_dir == '.') + size= 0; + std::string 
path; + path.reserve(size + sizeof "/ib_0000000000000000.log"); + path.assign(srv_log_group_home_dir, size); + if (size) + path.push_back('/'); + return append_archive_name(path, lsn); +} + +ATTRIBUTE_COLD std::string log_t::get_next_archive_path() const +{ return get_archive_path(first_lsn + capacity()); } + +ATTRIBUTE_COLD void log_t::archive_new_write(const byte *buf, size_t length, + lsn_t offset) noexcept +{ + ut_ad(latch_have_wr()); + ut_ad(write_lock.is_owner()); + ut_ad(archive); + ut_ad(length >= file_size - offset); + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_buf); + ut_ad(!resize_in_progress()); + ut_ad(resize_target >= 4U << 20); + ut_ad(is_latest()); + + const size_t first{size_t(file_size - offset)}; + log.write(offset, {buf, first}); + length-= first; + buf+= first; + + std::string path{get_next_archive_path()}; + bool success; + pfs_os_file_t file= + os_file_create_func(path.c_str(), OS_FILE_CREATE, OS_LOG_FILE, + false, &success); + ut_ad(success == (file != OS_FILE_CLOSED)); + if (file != OS_FILE_CLOSED) + { + if (os_file_set_size(path.c_str(), file, resize_target)) + { + resize_log= log; + log.m_file= file; + if (length) + log.write(START_OFFSET, {buf, length}); + return; + } + os_file_close(file); + IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + } + sql_print_error("[FATAL] InnoDB: Failed to create %s of %" PRIu64 + " bytes", path.c_str(), resize_target); + abort(); +} + /** Invoke commit_checkpoint_notify_ha() to notify that outstanding log writes have been completed. 
*/ void log_flush_notify(lsn_t flush_lsn); @@ -911,16 +1233,123 @@ static size_t log_pad(lsn_t lsn, size_t pad, byte *begin, byte *extra) #endif #ifdef HAVE_PMEM +ATTRIBUTE_COLD +void log_t::archived_mmap_switch_prepare(bool late, bool ex) noexcept +{ + ut_ad(archive); + ut_ad(is_mmap()); + ut_ad(log.is_opened()); + ut_ad(!resize_log.is_opened()); + ut_ad(!resize_buf); + ut_ad(!resize_in_progress()); + ut_ad(resize_target >= 4U << 20); + ut_ad(is_latest()); + + if (UNIV_LIKELY(!ex)) + { + latch.rd_unlock(); + if (!late) + { + /* Wait for all threads to back off. */ + latch.wr_lock(SRW_LOCK_CALL); + goto got_ex; + } + + const auto delay= my_cpu_relax_multiplier / 4 * srv_spin_wait_delay; + const auto rounds= srv_n_spin_wait_rounds; + + for (;;) + { + HMT_low(); + for (auto r= rounds + 1; r--; ) + { + if (write_lsn_offset.load(std::memory_order_relaxed) & WRITE_BACKOFF) + { + for (auto d= delay; d--; ) + MY_RELAX_CPU(); + } + else + { + HMT_medium(); + goto done; + } + } + HMT_medium(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + } + else + { + got_ex: + const uint64_t l= write_lsn_offset.load(std::memory_order_relaxed); + const lsn_t lsn= base_lsn.load(std::memory_order_relaxed) + + (l & (WRITE_BACKOFF - 1)); + waits++; + ut_ad(archive); + ut_ad(!resize_buf); + ut_ad(!resize_in_progress()); + ut_ad(resize_target >= 4U << 20); + ut_ad(is_latest()); + ut_ad(log.is_opened()); + ut_ad(!resize_log.is_opened()); + + do + { + std::string path{get_next_archive_path()}; + bool success; + os_file_t file= + os_file_create_func(path.c_str(), OS_FILE_CREATE, OS_LOG_FILE, + false, &success); + ut_ad(success == (file != OS_FILE_CLOSED)); + if (file != OS_FILE_CLOSED) + { + if (os_file_set_size(path.c_str(), file, resize_target)) + { + bool is_pmem{false}; + resize_buf= static_cast(::log_mmap(file, is_pmem, + resize_target, false)); + if (resize_buf != MAP_FAILED) + { + /* Will be closed in write_checkpoint() */ + resize_log= log; + log= file; + continue; + 
} + resize_buf= nullptr; + os_file_close(file); + } + } + + IF_WIN(DeleteFile(path.c_str()), unlink(path.c_str())); + sql_print_error("[FATAL] InnoDB: Failed to create and map %s of %" PRIu64 + " bytes", path.c_str(), resize_target); + abort(); + } + while (false); + + ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity()); + persist(lsn); + /* Above we cleared the WRITE_BACKOFF flag, + which our caller will recheck. */ + if (ex) + return; + latch.wr_unlock(); + } + +done: + latch.rd_lock(SRW_LOCK_CALL); +} + void log_t::persist(lsn_t lsn) noexcept { - ut_ad(!is_opened()); ut_ad(!write_lock.is_owner()); ut_ad(!flush_lock.is_owner()); ut_ad(latch_have_wr()); + ut_ad(is_opened() == archive); lsn_t old= flushed_to_disk_lsn.load(std::memory_order_relaxed); - if (old >= lsn) + if (old > lsn) return; const size_t start(calc_lsn_offset(old)); @@ -1092,18 +1521,31 @@ lsn_t log_t::write_buf() noexcept ut_ad(base + (write_lsn_offset & (WRITE_TO_BUF - 1)) == lsn); write_to_log++; + DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF, + write_lsn, lsn, offset)); + + const lsn_t max_length{file_size - offset}; + ut_ad(max_length <= capacity()); + if (UNIV_UNLIKELY(length >= max_length)) + { + if (resizing != RESIZING && archive) + { + archive_new_write(write_buf, length, offset); + if (resizing != RETAIN_LATCH) + latch.wr_unlock(); + goto written; + } + } if (resizing != RETAIN_LATCH) latch.wr_unlock(); - DBUG_PRINT("ib_log", ("write " LSN_PF " to " LSN_PF " at " LSN_PF, - write_lsn, lsn, offset)); - /* Do the write to the log file */ - log_write_buf(write_buf, length, offset); + log_write_buf(max_length, write_buf, length, offset); if (UNIV_LIKELY_NULL(re_write_buf)) resize_write_buf(re_write_buf, length); + written: write_lsn= lsn; if (UNIV_UNLIKELY(srv_shutdown_state > SRV_SHUTDOWN_INITIATED)) @@ -1249,38 +1691,33 @@ void log_t::clear_mmap() noexcept { if (!is_mmap() || high_level_read_only) return; -#ifdef HAVE_PMEM - if (!is_opened()) - { - 
ut_d(latch.wr_lock(SRW_LOCK_CALL)); - ut_ad(!resize_in_progress()); - ut_ad(get_lsn() == get_flushed_lsn(std::memory_order_relaxed)); - ut_d(latch.wr_unlock()); - return; - } -#endif log_resize_acquire(); ut_ad(!resize_in_progress()); - ut_ad(write_lsn == get_lsn()); - ut_ad(write_lsn == get_flushed_lsn(std::memory_order_relaxed)); - - if (buf) /* this may be invoked while creating a new database */ + ut_ad(get_lsn() == get_flushed_lsn(std::memory_order_relaxed)); +#ifdef HAVE_PMEM + if (is_opened() && !archive) +#endif { - alignas(16) byte log_block[4096]; - const size_t bs{write_size}; + ut_ad(write_lsn == get_lsn()); + + if (buf) /* this may be invoked while creating a new database */ { - const size_t bf= - size_t(write_lsn - base_lsn.load(std::memory_order_relaxed)); - memcpy_aligned<16>(log_block, buf + (bf & ~(bs - 1)), bs); - } + alignas(16) byte log_block[4096]; + const size_t bs{write_size}; + { + const size_t bf= + size_t(write_lsn - base_lsn.load(std::memory_order_relaxed)); + memcpy_aligned<16>(log_block, buf + (bf & ~(bs - 1)), bs); + } - close_file(false); - log_mmap= false; - ut_a(attach(log, file_size)); - ut_ad(!is_mmap()); + close_file(false); + log_mmap= false; + ut_a(attach(log, file_size, false)); + ut_ad(!is_mmap()); - memcpy_aligned<16>(buf, log_block, bs); + memcpy_aligned<16>(buf, log_block, bs); + } } log_resize_release(); } @@ -1301,45 +1738,46 @@ ATTRIBUTE_COLD void log_write_and_flush() noexcept } } -/****************************************************************//** -Tries to establish a big enough margin of free space in the log, such -that a new log entry can be catenated without an immediate need for a -checkpoint. NOTE: this function may only be called if the calling thread -owns no synchronization objects! 
*/ -ATTRIBUTE_COLD static void log_checkpoint_margin() noexcept +ATTRIBUTE_COLD void log_t::checkpoint_margin() noexcept { - while (log_sys.check_for_checkpoint()) + ut_ad(this == &log_sys); + ut_ad(!recv_no_log_write); + + while (check_for_checkpoint()) { - log_sys.latch.wr_lock(SRW_LOCK_CALL); + latch.wr_lock(SRW_LOCK_CALL); ut_ad(!recv_no_log_write); - if (!log_sys.check_for_checkpoint()) + if (!check_for_checkpoint()) { func_exit: - log_sys.latch.wr_unlock(); + latch.wr_unlock(); return; } - const lsn_t lsn= log_sys.get_lsn(); - const lsn_t max_age= log_sys.max_checkpoint_age; - const lsn_t age= lsn_t(lsn - log_sys.last_checkpoint_lsn); + const lsn_t last{last_checkpoint_lsn}, max_age{max_checkpoint_age}; + lsn_t lsn{get_lsn()}; - if (age <= max_age) + if (last < first_lsn) + lsn= first_lsn; + else { + if (lsn_t(lsn - last) <= max_age) + { #ifndef DBUG_OFF - skip_checkpoint: + skip_checkpoint: #endif - log_sys.set_check_for_checkpoint(false); - goto func_exit; + set_check_for_checkpoint(false); + goto func_exit; + } + DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto skip_checkpoint;); + lsn-= max_age; } - DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", goto skip_checkpoint;); - log_sys.latch.wr_unlock(); + latch.wr_unlock(); /* We must wait to prevent the tail of the log overwriting the head. 
*/ - buf_flush_wait_flushed(lsn - max_age); - /* Sleep to avoid a thundering herd */ - std::this_thread::sleep_for(std::chrono::milliseconds(10)); + buf_flush_wait_flushed(lsn); } } @@ -1350,10 +1788,7 @@ void log_free_check() noexcept { ut_ad(!lock_sys.is_holder()); if (log_sys.check_for_checkpoint()) - { - ut_ad(!recv_no_log_write); - log_checkpoint_margin(); - } + log_sys.checkpoint_margin(); } #ifdef __linux__ @@ -1581,10 +2016,10 @@ void log_t::close() recv_sys.close(); } -std::string get_log_file_path(const char *filename) +ATTRIBUTE_COLD std::string log_t::get_circular_path(size_t i) { - const size_t size= strlen(srv_log_group_home_dir) + /* path separator */ 1 + - strlen(filename) + /* longest suffix */ 3; + ut_ad(i <= 101); + const size_t size= strlen(srv_log_group_home_dir) + sizeof "/ib_logfile101"; std::string path; path.reserve(size); path.assign(srv_log_group_home_dir); @@ -1598,7 +2033,10 @@ std::string get_log_file_path(const char *filename) default: path.push_back('/'); } - path.append(filename); + return path.append("ib_logfile").append(std::to_string(i)); +} - return path; +ATTRIBUTE_COLD std::string log_t::get_path() const +{ + return archive ? get_archive_path() : get_circular_path(); } diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 9eb286b4d5516..d4ab569096c21 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -32,6 +32,8 @@ Created 9/20/1997 Heikki Tuuri #include "log0recv.h" +#define LOG_ARCHIVE_NAME "ib_%016" PRIx64 ".log" + #ifdef HAVE_MY_AES_H #include #endif @@ -57,6 +59,8 @@ Created 9/20/1997 Heikki Tuuri /** The recovery system */ recv_sys_t recv_sys; +/** 0 or the first LSN that would conflict with innodb_log_recovery_target */ +static lsn_t recv_sys_rpo_exceeded; /** TRUE when recv_init_crash_recovery() has been called. 
*/ bool recv_needed_recovery; #ifdef UNIV_DEBUG @@ -1688,10 +1692,41 @@ static dberr_t recv_log_recover_10_5(lsn_t lsn_offset) return DB_SUCCESS; } +/** @return if the specified innodb_log_recovery_target is being violated */ +static bool recv_sys_invalid_rpo(lsn_t lsn) noexcept +{ + if (!recv_sys.rpo || recv_sys.rpo >= lsn) + return false; + sql_print_error("InnoDB: cannot fulfill innodb_log_recovery_target=%" + PRIu64 "<%" PRIu64, recv_sys.rpo, lsn); + return true; +} + +inline void log_t::stash_archive_file() noexcept +{ + ut_ad(log.is_opened()); + if (resize_log.is_opened()) + { + ut_ad(!is_mmap() == !resize_buf); + if (resize_buf) + my_munmap(resize_buf, size_t(resize_target)); + resize_log.close(); + } + if (is_mmap()) + { + resize_buf= buf; + buf= nullptr; + } + std::swap(log, resize_log); + resize_target= file_size; + writer= nullptr; +} + dberr_t recv_sys_t::find_checkpoint() { - bool wrong_size= false; byte *buf; + lsn_t first_lsn= 0; + bool read_only{srv_read_only_mode || srv_operation >= SRV_OPERATION_BACKUP}; ut_ad(pages.empty()); pages_it= pages.end(); @@ -1699,14 +1734,179 @@ dberr_t recv_sys_t::find_checkpoint() if (files.empty()) { file_checkpoint= 0; - std::string path{get_log_file_path()}; + int archive= log_sys.archive; + retry: + std::string path{log_sys.get_circular_path()}; bool success; - os_file_t file{os_file_create_func(path.c_str(), - OS_FILE_OPEN, - OS_LOG_FILE, - srv_read_only_mode, &success)}; - if (file == OS_FILE_CLOSED) + os_file_t file{os_file_create_func(path.c_str(), archive < 0 + ? 
OS_FILE_OPEN : OS_FILE_OPEN_SILENT, + OS_LOG_FILE, read_only, &success)}; + if (file != OS_FILE_CLOSED) + { + if (archive > 0) + { + sql_print_error("InnoDB: innodb_log_archive=ON but %s exists", + path.c_str()); + return DB_ERROR; + } + } + else if (archive < 0) return DB_ERROR; + else + { + ut_ad(srv_operation == SRV_OPERATION_NORMAL); + path.reserve(strlen(srv_log_group_home_dir) + + sizeof "/ib_0000000000000000.log"); +#ifdef _WIN32 + WIN32_FIND_DATAA entry; + path.assign(srv_log_group_home_dir); + switch (path.back()) { + case '\\': case '/': + break; + default: + path.push_back('/'); + } + path.append("ib_????????????????.log"); + HANDLE d= FindFirstFileA(path.c_str(), &entry); + if (d != INVALID_HANDLE_VALUE) + goto readdir; +#else + DIR *d= opendir(srv_log_group_home_dir); + if (d) + goto readdir; +#endif + no_archive_found: + if (archive) + sql_print_error("InnoDB: innodb_log_archive files not found in '%s'", + srv_log_group_home_dir); + if (archive) + return DB_ERROR; + archive= -1; + goto retry; + + readdir: + struct log { lsn_t end; bool read_only; }; + std::map logs; +#ifdef _WIN32 + do + { + if (entry.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) + continue; + lsn_t lsn; + int n{0}; + const char *fn{entry.cFileName}; + if (1 != sscanf(fn, LOG_ARCHIVE_NAME "%n", &lsn, &n) || fn[n]) + continue; + LARGE_INTEGER filesize; + filesize.LowPart= entry.nFileSizeLow; + filesize.HighPart= entry.nFileSizeHigh; + logs.emplace(lsn, + log{lsn + filesize.QuadPart, + bool(entry.dwFileAttributes & + FILE_ATTRIBUTE_READONLY)}); + } + while (FindNextFile(d, &entry)); + FindClose(d); +#else + while (dirent *e= readdir(d)) + { + lsn_t lsn; + int n{0}; + const char *fn{e->d_name}; + if (1 != sscanf(fn, LOG_ARCHIVE_NAME "%n", &lsn, &n) || fn[n] || + lsn < log_t::FIRST_LSN) + continue; + path.assign(srv_log_group_home_dir); + path.push_back('/'); + struct stat st; + if (stat(log_sys.append_archive_name(path, lsn).c_str(), &st) || + st.st_size < static_cast + 
(log_t::START_OFFSET + SIZE_OF_FILE_CHECKPOINT)) + { + sql_print_warning("InnoDB: ignoring %s", path.c_str()); + continue; + } + logs.emplace(lsn, + log{lsn - log_t::START_OFFSET + st.st_size, + !(st.st_mode & 0400)}); + } + closedir(d); +#endif + + const auto end= logs.cend(); + auto i= logs.cbegin(), start= i, found_recovery_start= end; + int subsequent= 0; + if (i == end) + goto no_archive_found; + log_sys.format= srv_encrypt_log + ? log_t::FORMAT_ENC_11 : log_t::FORMAT_10_8; + log_sys.archive= true; + for (;;) + { + const lsn_t first{i->first}, last{i->second.end}; + if (log_sys.archived_lsn > first && log_sys.archived_lsn < last) + { + sql_print_error("InnoDB: Invalid innodb_log_archive_start=" LSN_PF); + return DB_ERROR; + } + if (recovery_start >= first && recovery_start < last) + found_recovery_start= i; + const auto prev= i; + if (++i == end) + { + if (!recovery_start) + subsequent= 0, start= prev; + break; + } + if (last == i->first) + subsequent++; + else + subsequent=0, start= i; + } + + if (recovery_start && found_recovery_start == end) + { + sql_print_error("InnoDB: No matching file found for" + " innodb_log_recovery_start=" LSN_PF, recovery_start); + return DB_ERROR; + } + + for (i= logs.cbegin();; start--, subsequent++) + { + if (recovery_start && start != found_recovery_start) + continue; + path.assign(srv_log_group_home_dir); + switch (path.back()) { +#ifdef _WIN32 + case '\\': +#endif + case '/': + break; + default: + path.push_back('/'); + } + read_only= subsequent || srv_read_only_mode || start->second.read_only; + file= + os_file_create_func(log_sys.append_archive_name(path, start->first). 
+ c_str(), OS_FILE_OPEN, OS_LOG_FILE, + read_only, &success); + if (file == OS_FILE_CLOSED) + return DB_ERROR; + if (!log_sys.attach(file, start->second.end - start->first + + log_t::START_OFFSET, read_only)) + { + os_file_close(file); + return DB_ERROR; + } + const dberr_t err= + find_checkpoint_archived(start->first, !read_only && i != start); + if (err == DB_SUCCESS || read_only || recovery_start || i == start) + return err; + log_sys.stash_archive_file(); + } + } + + ut_ad(!log_sys.archive); const os_offset_t size{os_file_get_size(file)}; if (!size) { @@ -1716,21 +1916,21 @@ dberr_t recv_sys_t::find_checkpoint() else if (size < log_t::START_OFFSET + SIZE_OF_FILE_CHECKPOINT) { too_small: - sql_print_error("InnoDB: File %.*s is too small", - int(path.size()), path.data()); + sql_print_error("InnoDB: File %s is too small", path.c_str()); err_exit: os_file_close(file); return DB_ERROR; } - else if (!log_sys.attach(file, size)) + else if (!log_sys.attach(file, size, read_only)) goto err_exit; else file= OS_FILE_CLOSED; - recv_sys.files.emplace_back(file); + files.emplace_back(file); + for (int i= 1; i < 101; i++) { - path= get_log_file_path(LOG_FILE_NAME_PREFIX).append(std::to_string(i)); + path= log_sys.get_circular_path(i); file= os_file_create_func(path.c_str(), OS_FILE_OPEN_SILENT, OS_LOG_FILE, true, &success); @@ -1742,14 +1942,14 @@ dberr_t recv_sys_t::find_checkpoint() sql_print_error("InnoDB: Log file %.*s is of different size " UINT64PF " bytes than other log files " UINT64PF " bytes!", int(path.size()), path.data(), sz, size); - wrong_size= true; + first_lsn= LSN_MAX; } - recv_sys.files.emplace_back(file); + files.emplace_back(file); } if (!size) { - if (wrong_size) + if (first_lsn == LSN_MAX) return DB_CORRUPTION; lsn= log_sys.next_checkpoint_lsn; log_sys.format= log_t::FORMAT_3_23; @@ -1758,18 +1958,21 @@ dberr_t recv_sys_t::find_checkpoint() } else ut_ad(srv_operation == SRV_OPERATION_BACKUP); + + ut_ad(!log_sys.archive); log_sys.next_checkpoint_lsn= 0; 
lsn= 0; buf= my_assume_aligned<4096>(log_sys.buf); if (!log_sys.is_mmap()) if (dberr_t err= log_sys.log.read(0, {buf, log_sys.START_OFFSET})) return err; + /* Check the header page checksum. There was no checksum in the first redo log format (version 0). */ log_sys.format= mach_read_from_4(buf + LOG_HEADER_FORMAT); if (log_sys.format == log_t::FORMAT_3_23) { - if (wrong_size) + if (first_lsn == LSN_MAX) return DB_CORRUPTION; if (dberr_t err= recv_log_recover_pre_10_2()) return err; @@ -1779,6 +1982,12 @@ dberr_t recv_sys_t::find_checkpoint() log_sys.last_checkpoint_lsn= log_sys.next_checkpoint_lsn; log_sys.set_recovered_lsn(log_sys.next_checkpoint_lsn); lsn= file_checkpoint= log_sys.next_checkpoint_lsn; + if (rpo && rpo != lsn) + { + sql_print_error("InnoDB: cannot fulfill innodb_log_recovery_target=%" + PRIu64 "!=%" PRIu64, rpo, lsn); + return DB_CORRUPTION; + } if (UNIV_LIKELY(lsn != 0)) scanned_lsn= lsn; log_sys.next_checkpoint_no= 0; @@ -1791,7 +2000,7 @@ dberr_t recv_sys_t::find_checkpoint() return DB_CORRUPTION; } - const lsn_t first_lsn{mach_read_from_8(buf + LOG_HEADER_START_LSN)}; + first_lsn= mach_read_from_8(buf + LOG_HEADER_START_LSN); log_sys.set_first_lsn(first_lsn); char creator[LOG_HEADER_CREATOR_END - LOG_HEADER_CREATOR + 1]; memcpy(creator, buf + LOG_HEADER_CREATOR, sizeof creator); @@ -1847,14 +2056,15 @@ dberr_t recv_sys_t::find_checkpoint() } if (checkpoint_lsn >= log_sys.next_checkpoint_lsn) - { - log_sys.next_checkpoint_lsn= checkpoint_lsn; - log_sys.next_checkpoint_no= field == log_t::CHECKPOINT_1; - lsn= end_lsn; - } + log_sys.set_recovered_checkpoint(checkpoint_lsn, lsn= end_lsn, + field == log_t::CHECKPOINT_1); } if (!log_sys.next_checkpoint_lsn) goto got_no_checkpoint; + else if (!log_sys.archived_lsn) + log_sys.archived_lsn= log_sys.next_checkpoint_lsn; + if (recv_sys_invalid_rpo(lsn)) + return DB_READ_ONLY; if (!memcmp(creator, "Backup ", 7)) srv_start_after_restore= true; @@ -1921,7 +2131,7 @@ dberr_t recv_sys_t::find_checkpoint() 
return DB_ERROR; } - if (wrong_size) + if (first_lsn == LSN_MAX) return DB_CORRUPTION; if (dberr_t err= recv_log_recover_10_5(lsn_offset)) @@ -2321,6 +2531,96 @@ struct recv_ring : public recv_buf } }; +/** Buffer wrapper for memory-mapped log_sys.archive, +with the capability to warp from log_sys.buf to log_sys.resize_buf */ +struct recv_warp : public recv_buf +{ + constexpr recv_warp(const byte *ptr) : recv_buf(ptr) {} + + static constexpr bool may_wrap() { return true; } + bool is_wrapped(const recv_warp &end) const { return end.ptr < ptr; } + constexpr static bool is_eof() { return false; } + constexpr static bool is_eof(size_t) { return false; } + + byte operator*() const noexcept + { + ut_ad((ptr >= &log_sys.buf[log_sys.START_OFFSET] && ptr < end()) || + (ptr >= &log_sys.resize_buf[log_sys.START_OFFSET] && + ptr < &log_sys.resize_buf[log_sys.resize_target])); + return *ptr; + } + recv_warp operator+(size_t len) const noexcept + { recv_warp r{*this}; return r+= len; } + recv_warp &operator++() noexcept { return *this+= 1; } + recv_warp &operator+=(size_t len) noexcept + { + ut_ad(len < recv_sys.MTR_SIZE_MAX * 2); + const byte *const e{end()}; + const bool first{ptr < e && ptr >= log_sys.buf}; + ut_ad(!first || ptr >= &log_sys.buf[log_sys.START_OFFSET]); + ptr+= len; + if (first) + { + if (ptr < e) + return *this; + ptr= &log_sys.resize_buf[log_sys.START_OFFSET + (ptr - e)]; + } + ut_ad(ptr > &log_sys.resize_buf[log_sys.START_OFFSET]); + ut_ad(ptr < &log_sys.resize_buf[log_sys.resize_target]); + return *this; + } + size_t operator-(const recv_warp &start) const noexcept + { + const int start_in_buf= size_t(start.ptr - log_sys.buf) < recv_sys.len, + buf_diff= (size_t(ptr - log_sys.buf) < recv_sys.len) - start_in_buf; + ut_ad(buf_diff <= 0); + ut_ad(buf_diff == 0 || start_in_buf); + return buf_diff + ? 
size_t((ptr - &log_sys.resize_buf[log_sys.START_OFFSET]) - + (end() - start.ptr)) + : size_t(ptr - start.ptr); + } + + uint32_t decode_varint() const noexcept + { + recv_warp log{*this}; + uint32_t i{*log}; + if (i < MIN_2BYTE) + return i; + uint32_t j{*++log}; + if (i < 0xc0) + return MIN_2BYTE + ((i & ~0xc0) << 8 | j); + j<<= 8; + j|= *++log; + if (i < 0xe0) + return MIN_3BYTE + ((i & ~0xe0) << 16 | j); + j<<= 8; + j|= *++log; + if (i < 0xf0) + return MIN_4BYTE + ((i & ~0xf0) << 24 | j); + if (i == 0xf0) + { + j<<= 8; + j|= *++log; + if (j <= ~MIN_5BYTE) + return MIN_5BYTE + j; + } + return MLOG_DECODE_ERROR; + } + + uint32_t crc32c(uint32_t crc, const recv_warp &start) const noexcept + { + int db= (size_t(ptr - log_sys.buf) < recv_sys.len) - + (size_t(start.ptr - log_sys.buf) < recv_sys.len); + ut_ad(db >= 0); + return db + ? my_crc32c(my_crc32c(crc, start.ptr, end() - start.ptr), + &log_sys.resize_buf[log_sys.START_OFFSET], + ptr - &log_sys.resize_buf[log_sys.START_OFFSET]) + : my_crc32c(crc, start.ptr, ptr - start.ptr); + } +}; + ATTRIBUTE_COLD void recv_sys_t::rewind(const byte *begin, const byte *end) noexcept { @@ -2427,9 +2727,17 @@ recv_sys_t::parse_mtr_result log_parse_start(source &l, unsigned nonce) return recv_sys_t::PREMATURE_EOF; eom_found: - if (*l != log_sys.get_sequence_bit((l - begin) + recv_sys.lsn)) + const lsn_t end_lsn{(l - begin) + recv_sys.lsn}; + + if (*l != log_sys.get_sequence_bit(end_lsn) && !log_sys.archive) return recv_sys_t::GOT_EOF; + if (recv_sys.rpo && recv_sys.rpo < end_lsn) + { + recv_sys_rpo_exceeded= end_lsn; + return recv_sys_t::GOT_EOF; + } + if (l.is_eof(5 + nonce)) return recv_sys_t::PREMATURE_EOF; @@ -2474,7 +2782,7 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse(source l, bool if_exists) (srv_operation == SRV_OPERATION_BACKUP || srv_operation == SRV_OPERATION_BACKUP_NO_DEFER)); mysql_mutex_assert_owner(&mutex); - ut_ad(log_sys.next_checkpoint_lsn); + ut_ad(log_sys.next_checkpoint_lsn || log_sys.archive); 
ut_ad(log_sys.is_recoverable()); ut_ad(log_sys.format == format); @@ -2737,14 +3045,21 @@ log_parse_file(const page_id_t id, bool if_exists, ? "ignored" : recv_sys.file_checkpoint ? "reread" : "read", recv_sys.lsn)); - if (c == log_sys.next_checkpoint_lsn) + if (c == log_sys.next_checkpoint_lsn || !log_sys.next_checkpoint_lsn) { /* There can be multiple FILE_CHECKPOINT for the same LSN. */ if (!recv_sys.file_checkpoint) { + ut_ad(log_sys.next_checkpoint_lsn || log_sys.archive); + ut_ad(log_sys.last_checkpoint_lsn || log_sys.archive); + log_sys.next_checkpoint_lsn= c; + if (!log_sys.last_checkpoint_lsn) + log_sys.last_checkpoint_lsn= c; recv_sys.file_checkpoint= recv_sys.lsn; return recv_sys_t::GOT_EOF; } + else + ut_ad(log_sys.next_checkpoint_lsn); } } break; @@ -2788,6 +3103,18 @@ log_parse_file(const page_id_t id, bool if_exists, break; } + if (!log_sys.next_checkpoint_lsn) + { + /* We are currently validating checkpoints in + recv_log_t::find_checkpoint_archived(). We must not open and + validate data files until we actually start recovery from a + checkpoint, because there could be lots of FILE_MODIFY and + FILE_CHECKPOINT log records to be parsed. */ + ut_ad(!recv_sys.file_checkpoint); + ut_ad(log_sys.archive); + return recv_sys_t::OK; + } + fil_name_process(reinterpret_cast(l), fnend - l, space_id, fn2 ? 
FILE_MODIFY : mfile_type_t(b & 0xf0), recv_sys.start_lsn, if_exists); @@ -3211,6 +3538,11 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_mtr(bool if_exists) return recv_sys.parse(s, if_exists); } +ATTRIBUTE_COLD void log_t::archived_mmap_switch_recovery_complete() noexcept +{ + ut_error; // TODO +} + template recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap(bool if_exists) { @@ -3220,6 +3552,16 @@ recv_sys_t::parse_mtr_result recv_sys_t::parse_mmap(bool if_exists) ut_ad(recv_sys.len == log_sys.file_size); ut_ad(recv_sys.offset >= log_sys.START_OFFSET); ut_ad(recv_sys.offset <= recv_sys.len); + if (log_sys.archive) + { + ut_ad(log_sys.archived_mmap_switch()); + recv_warp s{&log_sys.buf[recv_sys.offset]}; + auto r= recv_sys.parse(s,if_exists); + log_sys.archived_mmap_switch_recovery_complete(); + ut_ad(s.ptr > &log_sys.buf[log_sys.START_OFFSET]); + ut_ad(s.ptr < recv_buf::end()); + return r; + } recv_ring s {recv_sys.offset == recv_sys.len ? &log_sys.buf[log_sys.START_OFFSET] @@ -4294,7 +4636,7 @@ static bool recv_scan_log(bool last_phase, const recv_sys_t::parser *parser) ut_ad(!end || end == recv_sys.lsn); bool corrupt_fs= recv_sys.is_corrupt_fs(); - if (!end && !corrupt_fs) + if (!end && !corrupt_fs && !log_sys.archive) { recv_sys.set_corrupt_log(); sql_print_error("InnoDB: Missing FILE_CHECKPOINT(" LSN_PF @@ -4698,30 +5040,6 @@ static dberr_t recv_rename_files() return err; } -dberr_t recv_recovery_read_checkpoint() -{ - ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED || - srv_operation == SRV_OPERATION_RESTORE || - srv_operation == SRV_OPERATION_RESTORE_EXPORT); - ut_ad(!recv_sys.recovery_on); - ut_d(mysql_mutex_lock(&buf_pool.mutex)); - ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == 0); - ut_ad(UT_LIST_GET_LEN(buf_pool.unzip_LRU) == 0); - ut_d(mysql_mutex_unlock(&buf_pool.mutex)); - - if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) - { - sql_print_information("InnoDB: innodb_force_recovery=6" - " skips redo log apply"); - return DB_SUCCESS; - } - - 
log_sys.latch.wr_lock(SRW_LOCK_CALL); - dberr_t err= recv_sys.find_checkpoint(); - log_sys.latch.wr_unlock(); - return err; -} - inline void log_t::set_recovered() noexcept { ut_ad(get_flushed_lsn() == get_lsn()); @@ -4771,6 +5089,131 @@ static recv_sys_t::parser get_parse_mmap() noexcept ut_error; } +dberr_t recv_sys_t::find_checkpoint_archived(lsn_t first_lsn, bool silent) +{ + ut_ad(log_sys.archive); + ut_ad(!log_sys.checkpoint_buf == log_sys.is_mmap()); + const byte *buf; + if (byte *c= log_sys.checkpoint_buf) + { + buf= c; + if (dberr_t err= log_sys.log.read(0, {c, log_sys.write_size})) + return err; + } + else + buf= log_sys.buf; + uint16_t n_checkpoint; + { + const uint32_t format{mach_read_from_4(buf)}; + if (format != uint32_t(srv_encrypt_log)) + { + /* TODO: correct the file header later, if we can recover + from the previous file */ + if (!silent) + sql_print_error(format < 2 + ? "InnoDB: " LOG_ARCHIVE_NAME + " does not match innodb_encrypt_log" + : "InnoDB: " LOG_ARCHIVE_NAME + " is in unrecognized format", + first_lsn); + return DB_ERROR; + } + + if (!format) + buf+= 4, n_checkpoint= 1; + else if (!log_crypt_read_header(buf)) + return DB_ERROR; + else + { + buf+= 32/*log_crypt_read_header()*/, n_checkpoint= 32 / 4; + if (!tmp_buf) + { + tmp_buf= static_cast + (ut_malloc_dontdump(tmp_buf_size, PSI_INSTRUMENT_ME)); + if (!tmp_buf) + return DB_OUT_OF_MEMORY; + } + } + } + + log_sys.next_checkpoint_lsn= 0; + log_sys.set_first_lsn(first_lsn); + lsn= 0; + /* Validate the checkpoints */ + lsn_t end_lsn{first_lsn}, checkpoint{0}, recovery_start_end_lsn{0}; + const recv_sys_t::parser parser[2] { + get_parse_mmap(), get_parse_mmap() + }; + ut_ad(recv_spaces.empty()); + for (bool first= true; n_checkpoint < log_sys.START_OFFSET / 4; first= false) + { + const uint32_t d= mach_read_from_4(my_assume_aligned<4>(buf)); + if (!d && !first) + break; + lsn= end_lsn + d; + file_checkpoint= 0; + log_sys.next_checkpoint_lsn= 0; + ut_d(const bool rescan=) 
recv_scan_log(false, parser); + ut_ad(rescan); + ut_ad(recv_spaces.empty()); + if (!file_checkpoint) + { + found_corrupt_log= false; + if (!d && first) + goto next; + break; + } + ut_ad(file_checkpoint == lsn); + end_lsn+= d; + checkpoint= log_sys.next_checkpoint_lsn; + ut_ad(checkpoint); + ut_ad(checkpoint < lsn); + if (first && !log_sys.archived_lsn) + log_sys.archived_lsn= checkpoint; + if (checkpoint == recovery_start) + recovery_start_end_lsn= end_lsn; + next: + n_checkpoint++; + buf+= 4; + if (byte *c= log_sys.checkpoint_buf) + { + uint offset(n_checkpoint * 4); + if (offset & (log_sys.write_size - 1)) + continue; + buf= c; + if (dberr_t err= log_sys.log.read(offset, {c, log_sys.write_size})) + return err; + } + } + + if (!checkpoint) + { + if (!silent) + sql_print_error("InnoDB: Did not find any checkpoint after LSN=" LSN_PF, + first_lsn); + return DB_CORRUPTION; + } + + if (!recovery_start); + else if (recovery_start_end_lsn) + checkpoint= recovery_start, end_lsn= recovery_start_end_lsn; + else + { + ut_ad(!silent); + sql_print_error("InnoDB: Did not find innodb_log_recovery_start=" LSN_PF + " between " LSN_PF " and " LSN_PF " (" LSN_PF ")", + recovery_start, first_lsn, end_lsn, checkpoint); + return DB_CORRUPTION; + } + + if (recv_sys_invalid_rpo(recv_sys_rpo_exceeded)) + return DB_READ_ONLY; + + file_checkpoint= 0; + log_sys.set_recovered_checkpoint(checkpoint, lsn= end_lsn, n_checkpoint); + return DB_SUCCESS; +} + /** Start recovering from a redo log checkpoint. 
of first system tablespace page @return error code or DB_SUCCESS */ @@ -4794,6 +5237,7 @@ dberr_t recv_recovery_from_checkpoint_start() } recv_sys.recovery_on = true; + recv_sys_rpo_exceeded = 0; log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.set_capacity(); @@ -4811,9 +5255,21 @@ dberr_t recv_recovery_from_checkpoint_start() recv_sys_t::parser parser[2]; if (log_sys.is_recoverable()) { + if (recv_sys.recovery_start > log_sys.next_checkpoint_lsn) { + ut_ad(!log_sys.archive); // already checked + sql_print_error("InnoDB: impossible " + "innodb_log_recovery_start=%" PRIu64 + ">%" PRIu64, + recv_sys.recovery_start, + log_sys.next_checkpoint_lsn); + goto err_exit; // FIXME: remove this? + } else { + log_sys.last_checkpoint_lsn = recv_sys.recovery_start + ? recv_sys.recovery_start + : log_sys.next_checkpoint_lsn; + } const bool rewind = recv_sys.lsn - != log_sys.next_checkpoint_lsn; - log_sys.last_checkpoint_lsn = log_sys.next_checkpoint_lsn; + != log_sys.last_checkpoint_lsn; parser[false] = get_parse_mmap(); parser[true] = get_parse_mmap(); recv_scan_log(false, parser); @@ -4821,6 +5277,7 @@ dberr_t recv_recovery_from_checkpoint_start() read_only_recovery: sql_print_warning("InnoDB: innodb_read_only" " prevents crash recovery"); +read_only_reported: err = DB_READ_ONLY; goto func_exit; } @@ -4841,8 +5298,11 @@ dberr_t recv_recovery_from_checkpoint_start() } rescan = recv_scan_log(false, parser); - if (srv_read_only_mode && recv_needed_recovery) { + if (!recv_needed_recovery) { + } else if (srv_read_only_mode) { goto read_only_recovery; + } else if (recv_sys_invalid_rpo(recv_sys_rpo_exceeded)) { + goto read_only_reported; } if ((recv_sys.is_corrupt_log() && !srv_force_recovery) diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index 952850b3f499d..0554bda5c2c0b 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -52,12 +52,14 @@ void mtr_t::finisher_update() if (log_sys.is_mmap()) { commit_logger= 
mtr_t::commit_log; - finisher= mtr_t::finish_writer; + finisher= log_sys.archive + ? mtr_t::finish_writer + : mtr_t::finish_writer; return; } commit_logger= mtr_t::commit_log; #endif - finisher= mtr_t::finish_writer; + finisher= mtr_t::finish_writer; } void mtr_memo_slot_t::release() const @@ -336,7 +338,51 @@ void mtr_t::release() m_memo.clear(); } -ATTRIBUTE_NOINLINE void mtr_t::commit_log_release() noexcept +#ifdef HAVE_PMEM +ATTRIBUTE_COLD lsn_t log_t::archived_mmap_switch_complete() noexcept +{ + ut_ad(latch_have_wr()); + if (!archive || !resize_buf) + return 0; + const lsn_t lsn{get_lsn()}, end_lsn{first_lsn + capacity()}; + if (lsn < end_lsn) + return 0; + persist(lsn); + my_munmap(buf, file_size); + buf= resize_buf; + resize_buf= nullptr; + first_lsn= end_lsn; + file_size= resize_target; + return lsn; +} + +template<> +ATTRIBUTE_NOINLINE void mtr_t::commit_log_release() noexcept +{ + if (m_latch_ex) + { + completed: + const lsn_t lsn{log_sys.archived_mmap_switch_complete()}; + log_sys.latch.wr_unlock(); + m_latch_ex= false; + if (lsn) + buf_flush_ahead(lsn, true); + } + else + { + const bool retry{log_sys.archived_mmap_switch()}; + log_sys.latch.rd_unlock(); + if (retry) + { + log_sys.latch.wr_lock(SRW_LOCK_CALL); + goto completed; + } + } +} +#endif + +template<> +ATTRIBUTE_NOINLINE void mtr_t::commit_log_release() noexcept { if (m_latch_ex) { @@ -395,12 +441,12 @@ void mtr_t::commit_log(mtr_t *mtr, std::pair lsns) noexcept buf_pool.page_cleaner_wakeup(); mysql_mutex_unlock(&buf_pool.flush_list_mutex); - mtr->commit_log_release(); + mtr->commit_log_release(); mtr->release(); } else { - mtr->commit_log_release(); + mtr->commit_log_release(); for (auto it= mtr->m_memo.rbegin(); it != mtr->m_memo.rend(); ) { @@ -852,7 +898,9 @@ static time_t log_close_warn_time; making the server crash-unsafe. 
*/ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn) { - if (log_sys.overwrite_warned) + ut_ad(!log_sys.archive); /* we hope that this is unreachable */ + + if (log_sys.overwrite_warned || log_sys.archive) return; time_t t= time(nullptr); @@ -872,6 +920,38 @@ ATTRIBUTE_COLD static void log_overwrite_warning(lsn_t lsn) ? ". Shutdown is in progress" : ""); } + +#ifdef HAVE_PMEM +template<> +inline std::pair +log_t::append_prepare(size_t size, bool ex) noexcept +{ + ut_ad(ex ? latch_have_wr() : latch_have_rd()); + ut_ad(is_mmap()); + ut_ad(archive); + ut_ad(archived_lsn); + + uint64_t l, lsn; + static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, ""); + while (UNIV_UNLIKELY((l= write_lsn_offset.fetch_add(size + WRITE_TO_BUF) & + (WRITE_TO_BUF - 1)) + size + + (lsn= base_lsn.load(std::memory_order_relaxed)) >= + first_lsn + capacity()) && !resize_buf) + { + /* The following is inlined here instead of being part of + archive_mmap_switch_prepare() below, in order to increase the + locality of reference and to expedite setting the WRITE_BACKOFF flag. */ + bool late(write_lsn_offset.fetch_or(WRITE_BACKOFF) & WRITE_BACKOFF); + /* Subtract our LSN overshoot. */ + write_lsn_offset.fetch_sub(size); + archived_mmap_switch_prepare(late, ex); + } + + lsn+= l; + return {lsn, buf + FIRST_LSN + (lsn - first_lsn)}; +} +#endif + ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept { if (UNIV_LIKELY(!ex)) @@ -918,6 +998,7 @@ ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept const bool is_pmem{is_mmap()}; if (is_pmem) { + ut_ad(!archive); ut_ad(lsn - get_flushed_lsn(std::memory_order_relaxed) < capacity() || overwrite_warned); persist(lsn); @@ -942,17 +1023,20 @@ ATTRIBUTE_COLD void log_t::append_prepare_wait(bool late, bool ex) noexcept } /** Reserve space in the log buffer for appending data. 
-@tparam mmap log_sys.is_mmap() +@tparam mode how to write log @param size total length of the data to append(), in bytes @param ex whether log_sys.latch is exclusively locked @return the start LSN and the buffer position for append() */ -template +template inline std::pair log_t::append_prepare(size_t size, bool ex) noexcept { ut_ad(ex ? latch_have_wr() : latch_have_rd()); - ut_ad(mmap == is_mmap()); - ut_ad(!mmap || buf_size == std::min(capacity(), buf_size_max)); + static_assert(!bool(WRITE_NORMAL), ""); + static_assert(bool(CIRCULAR_MMAP), ""); + static_assert(mode == WRITE_NORMAL || mode == CIRCULAR_MMAP, ""); + ut_ad(bool(mode) == is_mmap()); + ut_ad(!mode || buf_size == std::min(capacity(), buf_size_max)); const size_t buf_size{this->buf_size - size}; uint64_t l; static_assert(WRITE_TO_BUF == WRITE_BACKOFF << 1, ""); @@ -968,14 +1052,13 @@ std::pair log_t::append_prepare(size_t size, bool ex) noexcept append_prepare_wait(late, ex); } - const lsn_t lsn{l + base_lsn.load(std::memory_order_relaxed)}, - end_lsn{lsn + size}; + const lsn_t lsn{l + base_lsn.load(std::memory_order_relaxed)}; - if (UNIV_UNLIKELY(end_lsn >= last_checkpoint_lsn + log_capacity)) + if (UNIV_UNLIKELY(lsn + size >= last_checkpoint_lsn + log_capacity)) set_check_for_checkpoint(true); return {lsn, - buf + size_t(mmap ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)}; + buf + size_t(mode ? FIRST_LSN + (lsn - first_lsn) % capacity() : l)}; } /** Finish appending data to the log. 
@@ -1111,12 +1194,14 @@ std::pair mtr_t::do_write() noexcept return finish_write(len); } -inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, - size_t seq) noexcept +template +ATTRIBUTE_COLD +void log_t::resize_write_low(lsn_t lsn, const byte *end, + size_t len, size_t seq) noexcept { ut_ad(latch_have_any()); + ut_ad(resize_buf); - if (UNIV_LIKELY_NULL(resize_buf)) { ut_ad(end >= buf); end-= len; @@ -1208,92 +1293,141 @@ inline void log_t::resize_write(lsn_t lsn, const byte *end, size_t len, inline void log_t::append(byte *&d, const void *s, size_t size) noexcept { ut_ad(log_sys.latch_have_any()); - ut_ad(d + size <= log_sys.buf + - (log_sys.is_mmap() ? log_sys.file_size : log_sys.buf_size)); + ut_ad(log_sys.is_mmap() + ? ((d >= log_sys.buf && d + size <= log_sys.buf + log_sys.file_size) || + (log_sys.archive && + d >= log_sys.resize_buf && + d + size <= log_sys.resize_buf + log_sys.resize_target)) + : (d >= log_sys.buf && d + size <= log_sys.buf + log_sys.buf_size)); memcpy(d, s, size); d+= size; } -template -std::pair mtr_t::finish_writer(mtr_t *mtr, size_t len) +template +std::pair +mtr_t::finish_writer(mtr_t *mtr, size_t len) { ut_ad(log_sys.is_latest()); ut_ad(!recv_no_log_write); ut_ad(mtr->is_logged()); ut_ad(mtr->m_latch_ex ? log_sys.latch_have_wr() : log_sys.latch_have_rd()); ut_ad(len < recv_sys.MTR_SIZE_MAX); + ut_ad(mode == log_t::WRITE_NORMAL || + log_sys.archive == (mode == log_t::ARCHIVED_MMAP)); const size_t size{mtr->m_commit_lsn ? 
5U + 8U : 5U}; std::pair start= - log_sys.append_prepare(len, mtr->m_latch_ex); + log_sys.append_prepare(len, mtr->m_latch_ex); - if (!mmap) - { + if (mode == log_t::WRITE_NORMAL) +#ifdef HAVE_PMEM + write_normal: +#endif for (const mtr_buf_t::block_t &b : mtr->m_log) log_sys.append(start.second, b.begin(), b.used()); - - write_trailer: - *start.second++= log_sys.get_sequence_bit(start.first + len - size); - if (mtr->m_commit_lsn) - { - mach_write_to_8(start.second, mtr->m_commit_lsn); - mtr->m_crc= my_crc32c(mtr->m_crc, start.second, 8); - start.second+= 8; - } - mach_write_to_4(start.second, mtr->m_crc); - start.second+= 4; - } +#ifdef HAVE_PMEM else { - if (UNIV_LIKELY(start.second + len <= &log_sys.buf[log_sys.file_size])) + const size_t file_size= log_sys.file_size; + byte *const buf{log_sys.buf}; + byte *end= &buf[file_size]; + if (UNIV_LIKELY(start.second + len <= end)) + goto write_normal; + byte *const begin= mode == log_t::ARCHIVED_MMAP + ? log_sys.get_archived_mmap_switch() + : buf + log_sys.START_OFFSET; + if (mode == log_t::ARCHIVED_MMAP && UNIV_UNLIKELY(start.second > end)) { - for (const mtr_buf_t::block_t &b : mtr->m_log) - log_sys.append(start.second, b.begin(), b.used()); - goto write_trailer; + /* Our mini-transaction will not span two log files. We are + somewhere between log_t::archived_mmap_switch_prepare() and + log_t::archived_mmap_switch_complete(), and our entire log must + be written to the new file. 
*/ + start.second= begin + (start.second - end); + goto write_normal; } + for (const mtr_buf_t::block_t &b : mtr->m_log) { size_t size{b.used()}; - const size_t size_left(&log_sys.buf[log_sys.file_size] - start.second); + const size_t size_left(end - start.second); const byte *src= b.begin(); if (size > size_left) { ::memcpy(start.second, src, size_left); - start.second= &log_sys.buf[log_sys.START_OFFSET]; + start.second= begin; + if (mode == log_t::ARCHIVED_MMAP) + /* An approximation; the minimum innodb_log_file_size + always exceeds the maximum mtr->get_log_size() */ + end= begin + file_size; src+= size_left; size-= size_left; } ::memcpy(start.second, src, size); start.second+= size; } - const size_t size_left(&log_sys.buf[log_sys.file_size] - start.second); - if (size_left > size) - goto write_trailer; + const size_t size_left(end - start.second); + if (size_left <= size) + { + byte tail[5 + 8]; + tail[0]= + (mode == log_t::WRITE_NORMAL + ? log_sys.archive : mode == log_t::ARCHIVED_MMAP) || + log_sys.get_sequence_bit(start.first + len - size); - byte tail[5 + 8]; - tail[0]= log_sys.get_sequence_bit(start.first + len - size); + if (mtr->m_commit_lsn) + { + mach_write_to_8(tail + 1, mtr->m_commit_lsn); + mtr->m_crc= my_crc32c(mtr->m_crc, tail + 1, 8); + mach_write_to_4(tail + 9, mtr->m_crc); + } + else + mach_write_to_4(tail + 1, mtr->m_crc); - if (mtr->m_commit_lsn) - { - mach_write_to_8(tail + 1, mtr->m_commit_lsn); - mtr->m_crc= my_crc32c(mtr->m_crc, tail + 1, 8); - mach_write_to_4(tail + 9, mtr->m_crc); + ::memcpy(start.second, tail, size_left); + ::memcpy(begin, tail + size_left, size - size_left); + start.second= ((size >= size_left) ? begin : end) + (size - size_left); + goto wrote_trailer; } - else - mach_write_to_4(tail + 1, mtr->m_crc); - - ::memcpy(start.second, tail, size_left); - ::memcpy(log_sys.buf + log_sys.START_OFFSET, tail + size_left, - size - size_left); - start.second= log_sys.buf + - ((size >= size_left) ? 
log_sys.START_OFFSET : log_sys.file_size) + - (size - size_left); } +#endif - log_sys.resize_write(start.first, start.second, len, size); + *start.second++= + (mode == log_t::WRITE_NORMAL + ? log_sys.archive : mode == log_t::ARCHIVED_MMAP) || + log_sys.get_sequence_bit(start.first + len - size); + + if (mtr->m_commit_lsn) + { + mach_write_to_8(start.second, mtr->m_commit_lsn); + mtr->m_crc= my_crc32c(mtr->m_crc, start.second, 8); + start.second+= 8; + } + mach_write_to_4(start.second, mtr->m_crc); + start.second+= 4; + +#ifdef HAVE_PMEM +wrote_trailer: +#else + static_assert(mode == log_t::WRITE_NORMAL, ""); +#endif mtr->m_commit_lsn= start.first + len; - return {start.first, log_close(mtr->m_commit_lsn)}; + + switch (mode) { + case log_t::ARCHIVED_MMAP: + ut_ad(!log_sys.resize_in_progress()); + return {start.first, (log_sys.get_first_lsn() > log_sys.last_checkpoint_lsn + ? log_sys.get_first_lsn() : 0)}; + case log_t::CIRCULAR_MMAP: + log_sys.resize_write(start.first, start.second, len, size); + return {start.first, log_close(mtr->m_commit_lsn)}; + case log_t::WRITE_NORMAL: + log_sys.resize_write(start.first, start.second, len, size); + } + return {start.first, log_sys.archive + ? (log_sys.get_first_lsn() > log_sys.last_checkpoint_lsn + ? log_sys.get_first_lsn() : 0) + : log_close(mtr->m_commit_lsn)}; } bool mtr_t::have_x_latch(const buf_block_t &block) const diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 318357432cfdf..40071694bf271 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -160,13 +160,12 @@ static PSI_stage_info* srv_stages[] = static void delete_log_files() { for (size_t i= 1; i < 102; i++) - delete_log_file(std::to_string(i).c_str()); + os_file_delete_if_exists_func(log_sys.get_circular_path(i).c_str(), nullptr); } /** Creates log file. 
@param create_new_db whether the database is being initialized @param lsn log sequence number -@param logfile0 name of the log file @return DB_SUCCESS or error code */ static dberr_t create_log_file(bool create_new_db, lsn_t lsn) { @@ -192,7 +191,7 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn) log_sys.latch.wr_lock(SRW_LOCK_CALL); log_sys.set_capacity(); - std::string logfile0{get_log_file_path("ib_logfile101")}; + const std::string logfile0{log_sys.get_circular_path(101)}; bool ret; os_file_t file{ os_file_create_func(logfile0.c_str(), @@ -218,7 +217,7 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn) } log_sys.set_latest_format(srv_encrypt_log); - if (!log_sys.attach(file, srv_log_file_size)) { + if (!log_sys.attach(file, srv_log_file_size, false)) { goto close_and_exit; } @@ -265,8 +264,8 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn) @return whether an error occurred */ bool log_t::resize_rename() noexcept { - std::string old_name{get_log_file_path("ib_logfile101")}; - std::string new_name{get_log_file_path()}; + const std::string old_name{get_circular_path(101)}; + const std::string new_name{log_sys.get_path()}; if (IF_WIN(MoveFileEx(old_name.c_str(), new_name.c_str(), MOVEFILE_REPLACE_EXISTING), @@ -803,7 +802,8 @@ srv_check_undo_redo_logs_exists() } /* Check if redo log file exists */ - auto logfilename = get_log_file_path(); + const std::string logfilename{log_sys.get_circular_path()}; + // FIXME: check for archived log as well fh = os_file_create_func(logfilename.c_str(), OS_FILE_OPEN_RETRY_SILENT, @@ -1210,6 +1210,8 @@ static dberr_t srv_log_rebuild_if_needed() if (srv_read_only_mode) /* Leave the redo log alone. */ return DB_SUCCESS; + if (log_sys.archive) + return DB_SUCCESS; /* Never rebuild archived log files. 
*/ if (log_sys.file_size == srv_log_file_size && log_sys.format == @@ -1458,9 +1460,25 @@ dberr_t srv_start(bool create_new_db) } recv_sys.debug_free(); } else { - err = recv_recovery_read_checkpoint(); - if (err != DB_SUCCESS) { - return srv_init_abort(err); + ut_ad(srv_operation <= SRV_OPERATION_EXPORT_RESTORED + || srv_operation == SRV_OPERATION_RESTORE + || srv_operation == SRV_OPERATION_RESTORE_EXPORT); + ut_ad(!recv_sys.recovery_on); + /* Suppress warnings in fil_space_t::create() for files + that are being read before dict_boot() has recovered + DICT_HDR_MAX_SPACE_ID. */ + fil_system.space_id_reuse_warned = true; + + if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) { + sql_print_information("InnoDB: innodb_force_recovery=6" + " skips redo log apply"); + } else { + log_sys.latch.wr_lock(SRW_LOCK_CALL); + err = recv_sys.find_checkpoint(); + log_sys.latch.wr_unlock(); + if (err != DB_SUCCESS) { + return srv_init_abort(err); + } } } @@ -1599,12 +1617,9 @@ dberr_t srv_start(bool create_new_db) if (log_sys.resize_rename()) { return(srv_init_abort(DB_ERROR)); } - } else { - /* Suppress warnings in fil_space_t::create() for files - that are being read before dict_boot() has recovered - DICT_HDR_MAX_SPACE_ID. */ - fil_system.space_id_reuse_warned = true; + if (log_sys.archive) log_sys.archive_set_size(); + } else { /* We always try to do a recovery, even if the database had been shut down normally: this is the normal startup path */ @@ -1720,6 +1735,8 @@ dberr_t srv_start(bool create_new_db) recv_sys.debug_free(); + if (log_sys.archive) log_sys.archive_set_size(); + if (!srv_read_only_mode) { const uint32_t flags = FSP_FLAGS_PAGE_SSIZE(); for (uint32_t id = srv_undo_space_id_start;