From 194800030253aa84762fabb8d38faec9e65ca97e Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Fri, 21 Feb 2025 14:13:43 +0300 Subject: [PATCH 1/6] MDEV-31956 SSD based InnoDB buffer pool extension In one of the practical cloud MariaDB setups, a server node accesses its datadir over the network, but also has a fast local SSD storage for temporary data. The content of such temporary storage is lost when the server container is destroyed. The commit uses this ephemeral fast local storage (SSD) as an extension of the portion of InnoDB buffer pool (DRAM) that caches persistent data pages. This cache is separated from the persistent storage of data files and ib_logfile0 and ignored during backup. The following system variables were introduced: innodb_extended_buffer_pool_size - the size of the external buffer pool file; if it equals 0, the external buffer pool will not be used; innodb_extended_buffer_pool_path - the path to the external buffer pool file. If innodb_extended_buffer_pool_size is not equal to 0, the external buffer pool file will be created on startup. Only clean pages will be flushed to the external buffer pool file. There is no need to flush dirty pages, as such pages will become clean after flushing, and then will be evicted when they reach the tail of the LRU list. The general idea of this commit is to flush clean pages to the external buffer pool file when they are evicted. A page can be evicted either by a transaction thread or by the background page cleaner thread. In some cases a transaction thread is waiting for the page cleaner thread to finish its job. We can't flush to the external buffer pool file while transaction threads are waiting for eviction, as that would hurt performance. That's why the only case for flushing is when the page cleaner thread evicts pages in the background and there are no waiters. For this purpose the buf_pool_t::done_flush_list_waiters_count variable was introduced; we flush evicted clean pages only if the variable is zero. 
Clean pages are evicted in buf_flush_LRU_list_batch() to keep some amount of pages in the buffer pool's free list. That's why we flush only every second page to the external buffer pool file; otherwise there might not be enough pages in the free list to let transaction threads allocate buffer pool pages without waiting for the page cleaner. This might not be a good solution, but it is enough for prototyping. An external buffer pool page is introduced to record in the buffer pool page hash that a certain page can be read from the external buffer pool file. The first several members of such a page must be the same as the members of an internal page. The external page frame must be equal to a certain value to distinguish an external page from an internal one. External buffer pages are preallocated on startup in the external pages array. We could get rid of the frame in the external page, and check if the page's address belongs to the array to distinguish external and internal pages. There are also external page free and LRU lists. When some internal page is chosen to be flushed to the external buffer pool file, a new external page is allocated either from the head of the external free list, or from the tail of the external LRU list. Both lists are protected with buf_pool.mutex. It makes sense, because a page is removed from the internal LRU list during eviction under buf_pool.mutex. Then the internal page is locked and the allocated external page is attached to the io request for the external buffer pool file, and when the write request is completed, the internal page is replaced with the external one in the page hash, the external page is pushed to the head of the external LRU list and the internal page is unlocked. After the external page is removed from the external free list, it is not placed in the external LRU list until the write completes, so the page can't be used by other threads until the write is completed. 
Page hash chain get element function has an additional template parameter, which tells the function whether external pages must be ignored or not. We don't ignore external pages in the page hash in two cases: when some page is initialized for read and when one is reinitialized for new page creation. When an internal page is initialized for read and an external page with the same page id is found in the page hash, the internal page is locked, the external page is replaced with the newly initialized internal page in the page hash chain, the external page is removed from the external LRU list and attached to the io request to the external buffer pool file. When the io request is completed, the external page is returned to the external free list and the internal page is unlocked. So during the read the external page is absent from both the external LRU and free lists and can't be reused. When an internal page is initialized for new page creation and an external page with the same page id is found in the page hash, we just remove the external page from the page hash chain and the external LRU list and push it to the head of the external free list. So the external page can be used for future flushing. The pages are flushed to and read from the external buffer pool file in the same manner as they are flushed to their spaces, i.e. compressed and encrypted pages stay compressed and encrypted in the external buffer pool file. 
--- mysql-test/suite/innodb/r/ext_buf_pool.result | 512 ++++++++++++++++++ .../r/innodb_information_schema_buffer.result | 4 +- .../innodb/r/innodb_status_variables.result | 2 + mysql-test/suite/innodb/t/ext_buf_pool.opt | 1 + mysql-test/suite/innodb/t/ext_buf_pool.test | 167 ++++++ .../innodb_buffer_pool_stats.result | 4 +- .../suite/sys_vars/r/sysvars_innodb.result | 27 +- .../suite/sys_vars/t/sysvars_innodb.test | 3 +- storage/innobase/buf/buf0buf.cc | 72 ++- storage/innobase/buf/buf0dblwr.cc | 40 +- storage/innobase/buf/buf0flu.cc | 265 ++++++--- storage/innobase/buf/buf0lru.cc | 34 +- storage/innobase/buf/buf0rea.cc | 197 +++++-- storage/innobase/fil/fil0fil.cc | 174 +++++- storage/innobase/handler/ha_innodb.cc | 65 +++ storage/innobase/handler/i_s.cc | 12 + storage/innobase/include/buf0buf.h | 217 ++++++-- storage/innobase/include/buf0lru.h | 6 +- storage/innobase/include/buf0types.h | 1 + storage/innobase/include/fil0fil.h | 48 ++ storage/innobase/include/os0file.h | 81 ++- storage/innobase/log/log0recv.cc | 31 +- storage/innobase/os/os0file.cc | 64 ++- storage/innobase/row/row0merge.cc | 70 +-- storage/innobase/srv/srv0start.cc | 10 + 25 files changed, 1786 insertions(+), 321 deletions(-) create mode 100644 mysql-test/suite/innodb/r/ext_buf_pool.result create mode 100644 mysql-test/suite/innodb/t/ext_buf_pool.opt create mode 100644 mysql-test/suite/innodb/t/ext_buf_pool.test diff --git a/mysql-test/suite/innodb/r/ext_buf_pool.result b/mysql-test/suite/innodb/r/ext_buf_pool.result new file mode 100644 index 0000000000000..1ecbab38815bc --- /dev/null +++ b/mysql-test/suite/innodb/r/ext_buf_pool.result @@ -0,0 +1,512 @@ +connect prevent_purge,localhost,root; +START TRANSACTION WITH CONSISTENT SNAPSHOT; +connection default; +SET GLOBAL innodb_limit_optimistic_insert_debug = 3; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_count_io_only_for_t'; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; 
+################################################################### +# Testing for encrypted ROW_FORMAT=COMPRESSED table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED encrypted=yes +encryption_key_id=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 6*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs 
+0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +601 +602 +603 +604 +605 +606 +607 +608 +609 +610 +611 +612 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for unencrypted ROW_FORMAT=COMPRESSED table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 5*100; +INSERT INTO t SET a = @start_val+1; +INSERT 
INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +501 +502 +503 +504 +505 +506 +507 +508 +509 +510 +511 +512 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; 
+NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for unencrypted uncompressed table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 4*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE 
variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +401 +402 +403 +404 +405 +406 +407 +408 +409 +410 +411 +412 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for encrypted uncompressed table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 encrypted=yes encryption_key_id=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; 
+SET @start_val = 3*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +301 +302 +303 +304 +305 +306 +307 +308 +309 +310 +311 +312 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM 
INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for unencrypted PAGE_COMPRESSED=1 table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 2*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT 
variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +201 +202 +203 +204 +205 +206 +207 +208 +209 +210 +211 +212 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; +################################################################### +# Testing for encrypted PAGE_COMPRESSED=1 table # +################################################################### +CREATE TABLE t ( +`a` INT NOT NULL, +PRIMARY KEY (`a`) +) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1 encrypted=yes +encryption_key_id=1; +SELECT variable_value INTO @prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SELECT variable_value INTO @prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +SELECT 
NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +SET @start_val = 1*100; +INSERT INTO t SET a = @start_val+1; +INSERT INTO t SET a = @start_val+2; +INSERT INTO t SET a = @start_val+3; +INSERT INTO t SET a = @start_val+4; +INSERT INTO t SET a = @start_val+5; +INSERT INTO t SET a = @start_val+6; +INSERT INTO t SET a = @start_val+7; +INSERT INTO t SET a = @start_val+8; +INSERT INTO t SET a = @start_val+9; +INSERT INTO t SET a = @start_val+10; +INSERT INTO t SET a = @start_val+11; +INSERT INTO t SET a = @start_val+12; +SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; +SET GLOBAL innodb_force_LRU_eviction = TRUE; +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +0 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +0 +SELECT * FROM t; +a +101 +102 +103 +104 +105 +106 +107 +108 +109 +110 +111 +112 +SELECT variable_value-@prev_flushed_gs +FROM information_schema.global_status +WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; +variable_value-@prev_flushed_gs +9 +SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps +9 +SELECT variable_value-@prev_reads_gs +FROM information_schema.global_status +WHERE variable_name LIKE 
'INNODB_EXT_BUFFER_POOL_READS'; +variable_value-@prev_reads_gs +7 +SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; +NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps +7 +DROP TABLE t; +disconnect prevent_purge; diff --git a/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result b/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result index e87b35383a70e..7684c96f82b87 100644 --- a/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result +++ b/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result @@ -1,6 +1,6 @@ SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; -POOL_ID POOL_SIZE FREE_BUFFERS DATABASE_PAGES OLD_DATABASE_PAGES MODIFIED_DATABASE_PAGES PENDING_DECOMPRESS PENDING_READS PENDING_FLUSH_LRU PENDING_FLUSH_LIST PAGES_MADE_YOUNG PAGES_NOT_MADE_YOUNG PAGES_MADE_YOUNG_RATE PAGES_MADE_NOT_YOUNG_RATE NUMBER_PAGES_READ NUMBER_PAGES_CREATED NUMBER_PAGES_WRITTEN PAGES_READ_RATE PAGES_CREATE_RATE PAGES_WRITTEN_RATE NUMBER_PAGES_GET HIT_RATE YOUNG_MAKE_PER_THOUSAND_GETS NOT_YOUNG_MAKE_PER_THOUSAND_GETS NUMBER_PAGES_READ_AHEAD NUMBER_READ_AHEAD_EVICTED READ_AHEAD_RATE READ_AHEAD_EVICTED_RATE LRU_IO_TOTAL LRU_IO_CURRENT UNCOMPRESS_TOTAL UNCOMPRESS_CURRENT -# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # +POOL_ID POOL_SIZE FREE_BUFFERS DATABASE_PAGES OLD_DATABASE_PAGES MODIFIED_DATABASE_PAGES PENDING_DECOMPRESS PENDING_READS PENDING_FLUSH_LRU PENDING_FLUSH_LIST PAGES_MADE_YOUNG PAGES_NOT_MADE_YOUNG PAGES_MADE_YOUNG_RATE PAGES_MADE_NOT_YOUNG_RATE NUMBER_PAGES_READ NUMBER_PAGES_CREATED NUMBER_PAGES_WRITTEN PAGES_READ_RATE PAGES_CREATE_RATE PAGES_WRITTEN_RATE NUMBER_PAGES_GET HIT_RATE YOUNG_MAKE_PER_THOUSAND_GETS NOT_YOUNG_MAKE_PER_THOUSAND_GETS NUMBER_PAGES_READ_AHEAD NUMBER_READ_AHEAD_EVICTED READ_AHEAD_RATE READ_AHEAD_EVICTED_RATE LRU_IO_TOTAL LRU_IO_CURRENT UNCOMPRESS_TOTAL UNCOMPRESS_CURRENT 
NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # CREATE TABLE infoschema_buffer_test (col1 INT) ENGINE = INNODB; INSERT INTO infoschema_buffer_test VALUES(9); SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE diff --git a/mysql-test/suite/innodb/r/innodb_status_variables.result b/mysql-test/suite/innodb/r/innodb_status_variables.result index c6f4d4f27c45a..cc3843f283aaa 100644 --- a/mysql-test/suite/innodb/r/innodb_status_variables.result +++ b/mysql-test/suite/innodb/r/innodb_status_variables.result @@ -41,8 +41,10 @@ INNODB_BUFFER_POOL_READ_AHEAD INNODB_BUFFER_POOL_READ_AHEAD_EVICTED INNODB_BUFFER_POOL_READ_REQUESTS INNODB_BUFFER_POOL_READS +INNODB_EXT_BUFFER_POOL_READS INNODB_BUFFER_POOL_WAIT_FREE INNODB_BUFFER_POOL_WRITE_REQUESTS +INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED INNODB_CHECKPOINT_AGE INNODB_CHECKPOINT_MAX_AGE INNODB_DATA_FSYNCS diff --git a/mysql-test/suite/innodb/t/ext_buf_pool.opt b/mysql-test/suite/innodb/t/ext_buf_pool.opt new file mode 100644 index 0000000000000..2178b845a2fb7 --- /dev/null +++ b/mysql-test/suite/innodb/t/ext_buf_pool.opt @@ -0,0 +1 @@ +--innodb-buffer-pool-size=21M --innodb-extended-buffer-pool-size=1M diff --git a/mysql-test/suite/innodb/t/ext_buf_pool.test b/mysql-test/suite/innodb/t/ext_buf_pool.test new file mode 100644 index 0000000000000..e172686cf179a --- /dev/null +++ b/mysql-test/suite/innodb/t/ext_buf_pool.test @@ -0,0 +1,167 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/have_debug_sync.inc +--source include/count_sessions.inc +--source ../encryption/include/have_file_key_management_plugin.inc +#--source include/innodb_page_size.inc + +--let $encrypted_row_compressed=6 +--let $unencrypted_row_compressed=5 +--let $unencrypted_uncompressed=4 +--let $encrypted_uncompressed=3 +--let $unencrypted_page_compressed=2 +--let $encrypted_page_compressed=1 +--let $i = 
$encrypted_row_compressed + +--let $page_size=`SELECT @@GLOBAL.innodb_page_size` +if ($page_size != 16384) { + --let $i=$unencrypted_uncompressed +} + +--connect (prevent_purge,localhost,root) +START TRANSACTION WITH CONSISTENT SNAPSHOT; + +--connection default + +--let $DATADIR = `select @@datadir` + +--disable_query_log +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET @old_innodb_limit_optimistic_insert_debug = @@innodb_limit_optimistic_insert_debug; +SET @old_debug_dbug = @@debug_dbug; +--enable_query_log + +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_limit_optimistic_insert_debug = 3; +SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_count_io_only_for_t'; + +while($i) { + + SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; + if ($i == $unencrypted_uncompressed) { + --echo ################################################################### + --echo # Testing for unencrypted uncompressed table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0; + } + if ($i == $encrypted_uncompressed) { + --echo ################################################################### + --echo # Testing for encrypted uncompressed table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 encrypted=yes encryption_key_id=1; + } + if ($i == $unencrypted_page_compressed) { + --echo ################################################################### + --echo # Testing for unencrypted PAGE_COMPRESSED=1 table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1; + } + if ($i == $unencrypted_row_compressed) { + --echo ################################################################### + --echo # Testing for 
unencrypted ROW_FORMAT=COMPRESSED table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED KEY_BLOCK_SIZE=1; + } + if ($i == $encrypted_page_compressed) { + --echo ################################################################### + --echo # Testing for encrypted PAGE_COMPRESSED=1 table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 PAGE_COMPRESSED=1 encrypted=yes + encryption_key_id=1; + } + if ($i == $encrypted_row_compressed) { + --echo ################################################################### + --echo # Testing for encrypted ROW_FORMAT=COMPRESSED table # + --echo ################################################################### + CREATE TABLE t ( + `a` INT NOT NULL, + PRIMARY KEY (`a`) + ) ENGINE=InnoDB STATS_PERSISTENT=0 ROW_FORMAT=COMPRESSED encrypted=yes + encryption_key_id=1; + } + + SELECT variable_value INTO @prev_flushed_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL INTO @prev_written_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + SELECT variable_value INTO @prev_reads_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; + SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL INTO @prev_reads_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + + --eval SET @start_val = $i*100 + INSERT INTO t SET a = @start_val+1; + INSERT INTO t SET a = @start_val+2; + INSERT INTO t SET a = @start_val+3; + INSERT INTO t SET a = @start_val+4; + INSERT INTO t SET a = @start_val+5; + INSERT INTO t SET a = @start_val+6; + INSERT INTO t SET a = @start_val+7; + INSERT INTO t SET a = @start_val+8; + INSERT INTO t SET a = 
@start_val+9; + INSERT INTO t SET a = @start_val+10; + INSERT INTO t SET a = @start_val+11; + INSERT INTO t SET a = @start_val+12; + + SET GLOBAL DEBUG_DBUG='-d,ib_ext_bp_disable_LRU_eviction_for_t'; + SET GLOBAL innodb_force_LRU_eviction = TRUE; + + let $wait_condition = + SELECT (variable_value-@prev_flushed_gs) >= 9 + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + --source include/wait_condition.inc + + SELECT variable_value-@prev_flushed_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + SELECT variable_value-@prev_reads_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; + SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + + SELECT * FROM t; + + SELECT variable_value-@prev_flushed_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_PAGES_FLUSHED'; + SELECT NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL-@prev_written_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + SELECT variable_value-@prev_reads_gs + FROM information_schema.global_status + WHERE variable_name LIKE 'INNODB_EXT_BUFFER_POOL_READS'; + SELECT NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL-@prev_reads_ps + FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; + + DROP TABLE t; + --dec $i +} + +--disable_query_log +SET GLOBAL DEBUG_DBUG=@old_debug_dbug; +--error 0,ER_UNKNOWN_SYSTEM_VARIABLE +SET GLOBAL innodb_limit_optimistic_insert_debug = @old_innodb_limit_optimistic_insert_debug; +--enable_query_log + +--disconnect prevent_purge +--source include/wait_until_count_sessions.inc diff --git a/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result 
b/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result index f7fdd38f63111..c60caf02fb6be 100644 --- a/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result +++ b/mysql-test/suite/innodb_i_s/innodb_buffer_pool_stats.result @@ -32,5 +32,7 @@ INNODB_BUFFER_POOL_STATS CREATE TEMPORARY TABLE `INNODB_BUFFER_POOL_STATS` ( `LRU_IO_TOTAL` bigint(21) unsigned NOT NULL, `LRU_IO_CURRENT` bigint(21) unsigned NOT NULL, `UNCOMPRESS_TOTAL` bigint(21) unsigned NOT NULL, - `UNCOMPRESS_CURRENT` bigint(21) unsigned NOT NULL + `UNCOMPRESS_CURRENT` bigint(21) unsigned NOT NULL, + `NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL` bigint(21) unsigned NOT NULL, + `NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL` bigint(21) unsigned NOT NULL ) ENGINE=MEMORY DEFAULT CHARSET=utf8mb3 COLLATE=utf8mb3_general_ci diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb.result b/mysql-test/suite/sys_vars/r/sysvars_innodb.result index b9f1e8b68f60c..6cd87097f71bf 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result @@ -6,7 +6,8 @@ variable_name not in ( 'innodb_use_native_aio', # default value depends on OS 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_linux_aio', # existence depends on OS -'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing +'innodb_buffer_pool_load_pages_abort', # debug build only, and is only for testing +'innodb_force_lru_eviction') # debug build only, and is only for testing order by variable_name; VARIABLE_NAME INNODB_ADAPTIVE_FLUSHING SESSION_VALUE NULL @@ -572,6 +573,30 @@ NUMERIC_BLOCK_SIZE NULL ENUM_VALUE_LIST OFF,ON READ_ONLY YES COMMAND_LINE_ARGUMENT OPTIONAL +VARIABLE_NAME INNODB_EXTENDED_BUFFER_POOL_PATH +SESSION_VALUE NULL +DEFAULT_VALUE +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE VARCHAR +VARIABLE_COMMENT Path to extended buffer pool file +NUMERIC_MIN_VALUE NULL +NUMERIC_MAX_VALUE NULL +NUMERIC_BLOCK_SIZE NULL +ENUM_VALUE_LIST NULL +READ_ONLY 
YES +COMMAND_LINE_ARGUMENT REQUIRED +VARIABLE_NAME INNODB_EXTENDED_BUFFER_POOL_SIZE +SESSION_VALUE NULL +DEFAULT_VALUE 0 +VARIABLE_SCOPE GLOBAL +VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_COMMENT The extended buffer pool file size +NUMERIC_MIN_VALUE 0 +NUMERIC_MAX_VALUE 18446744073709551615 +NUMERIC_BLOCK_SIZE 0 +ENUM_VALUE_LIST NULL +READ_ONLY NO +COMMAND_LINE_ARGUMENT REQUIRED VARIABLE_NAME INNODB_FAST_SHUTDOWN SESSION_VALUE NULL DEFAULT_VALUE 1 diff --git a/mysql-test/suite/sys_vars/t/sysvars_innodb.test b/mysql-test/suite/sys_vars/t/sysvars_innodb.test index 250eb8b5c8f1b..1d38d8244d66f 100644 --- a/mysql-test/suite/sys_vars/t/sysvars_innodb.test +++ b/mysql-test/suite/sys_vars/t/sysvars_innodb.test @@ -17,5 +17,6 @@ select VARIABLE_NAME, SESSION_VALUE, DEFAULT_VALUE, VARIABLE_SCOPE, VARIABLE_TYP 'innodb_use_native_aio', # default value depends on OS 'innodb_log_file_buffering', # only available on Linux and Windows 'innodb_linux_aio', # existence depends on OS - 'innodb_buffer_pool_load_pages_abort') # debug build only, and is only for testing + 'innodb_buffer_pool_load_pages_abort', # debug build only, and is only for testing + 'innodb_force_lru_eviction') # debug build only, and is only for testing order by variable_name; diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index a22af4a07dfaa..c8499494dd6ff 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -1067,8 +1067,10 @@ inline void buf_pool_t::garbage_collect() noexcept size_in_bytes_requested= size; mysql_mutex_unlock(&mutex); mysql_mutex_lock(&flush_list_mutex); + ++done_flush_list_waiters_count; page_cleaner_wakeup(true); my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + --done_flush_list_waiters_count; mysql_mutex_unlock(&flush_list_mutex); # ifdef BTR_CUR_HASH_ADAPT bool ahi_disabled= btr_search.disable(); @@ -1285,6 +1287,34 @@ buf_block_t *buf_pool_t::allocate() noexcept return nullptr; } +ext_buf_page_t 
*buf_pool_t::alloc_ext_page(page_id_t page_id) noexcept +{ + mysql_mutex_assert_owner(&mutex); + ext_buf_page_t *p; + if ((p= UT_LIST_GET_FIRST(ext_free))) + UT_LIST_REMOVE(ext_free, p); + else if ((p= UT_LIST_GET_LAST(ext_LRU))) { + for (; p; p= UT_LIST_GET_PREV(ext_LRU_list, p)) { + hash_chain &hash_chain= page_hash.cell_get(p->id_.fold()); + page_hash_latch &hash_lock= page_hash.lock_get(hash_chain); + if (!hash_lock.try_lock()) + continue; + UT_LIST_REMOVE(ext_LRU, p); + page_hash.remove(hash_chain, reinterpret_cast(p)); + hash_lock.unlock(); + break; + } + if (!p) + return nullptr; + } + else + return nullptr; + p->id_= page_id; + ut_d(p->in_LRU_list= p->in_free_list= false); + ut_d(p->in_page_hash= true); + return p; +} + /** Create the hash table. @param n the lower bound of n_cells */ void buf_pool_t::page_hash_table::create(ulint n) noexcept @@ -1436,6 +1466,9 @@ bool buf_pool_t::create() noexcept n_blocks= get_n_blocks(actual_size); n_blocks_to_withdraw= 0; UT_LIST_INIT(free, &buf_page_t::list); + UT_LIST_INIT(ext_free, &ext_buf_page_t::free_list); + ut_d(force_LRU_eviction_to_ebp= 0); + const size_t ssize= srv_page_size_shift - UNIV_PAGE_SIZE_SHIFT_MIN; for (char *extent= memory, @@ -1459,6 +1492,19 @@ bool buf_pool_t::create() noexcept } } + size_t ext_buf_pages_array_size= extended_pages * sizeof(ext_buf_page_t); + ext_buf_pages_array= static_cast( + my_malloc(PSI_NOT_INSTRUMENTED, ext_buf_pages_array_size, MYF(0))); + UT_LIST_INIT(ext_free, &ext_buf_page_t::free_list); + for (ext_buf_page_t *page= ext_buf_pages_array, + *end= ext_buf_pages_array + extended_pages; + page != end; ++page) { + page->frame= reinterpret_cast(ext_buf_page_t::EXT_BUF_FRAME); + ut_d(page->in_free_list= true); /* the page is put on ext_free below */ + ut_d(page->in_LRU_list= false); + UT_LIST_ADD_LAST(ext_free, page); + } + #if defined(__aarch64__) mysql_mutex_init(buf_pool_mutex_key, &mutex, MY_MUTEX_INIT_FAST); #else @@ -1467,6 +1513,7 @@ bool buf_pool_t::create() noexcept 
UT_LIST_INIT(withdrawn, &buf_page_t::list); UT_LIST_INIT(LRU, &buf_page_t::LRU); + UT_LIST_INIT(ext_LRU, &ext_buf_page_t::ext_LRU_list); UT_LIST_INIT(flush_list, &buf_page_t::list); UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); @@ -1594,6 +1641,8 @@ void buf_pool_t::close() noexcept memory_unaligned= nullptr; } + my_free(ext_buf_pages_array); + pthread_cond_destroy(&done_flush_LRU); pthread_cond_destroy(&done_flush_list); pthread_cond_destroy(&do_flush_list); @@ -1877,8 +1926,10 @@ ATTRIBUTE_COLD buf_pool_t::shrink_status buf_pool_t::shrink(size_t size) try_LRU_scan= false; mysql_mutex_unlock(&mutex); + ++done_flush_list_waiters_count; page_cleaner_wakeup(true); my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + --done_flush_list_waiters_count; mysql_mutex_unlock(&flush_list_mutex); mysql_mutex_lock(&mutex); @@ -2095,8 +2146,10 @@ ATTRIBUTE_COLD void buf_pool_t::resize(size_t size, THD *thd) noexcept mysql_mutex_unlock(&mutex); DEBUG_SYNC_C("buf_pool_shrink_before_wakeup"); mysql_mutex_lock(&flush_list_mutex); + ++done_flush_list_waiters_count; page_cleaner_wakeup(true); my_cond_wait(&done_flush_list, &flush_list_mutex.m_mutex); + --done_flush_list_waiters_count; mysql_mutex_unlock(&flush_list_mutex); #ifdef BTR_CUR_HASH_ADAPT ahi_disabled= btr_search.disable(); @@ -3126,7 +3179,7 @@ buf_pool_t::page_hash_table::append(buf_pool_t::hash_chain &chain, *prev= bpage; } -inline void +void buf_pool_t::page_hash_table::replace(buf_pool_t::hash_chain &chain, buf_page_t *old, buf_page_t *bpage) noexcept @@ -3158,7 +3211,20 @@ static buf_block_t *buf_page_create_low(page_id_t page_id, ulint zip_size, retry: mysql_mutex_lock(&buf_pool.mutex); - buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain); + buf_page_t *bpage= buf_pool.page_hash.get(page_id, chain); + + if (bpage && bpage->external()) { + page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); + hash_lock.lock(); + buf_pool.page_hash.remove(chain, bpage); + hash_lock.unlock(); + 
ut_ad(!bpage->in_page_hash); + ext_buf_page_t *ext_buf_page= + reinterpret_cast(bpage); + buf_pool.remove_ext_page_from_LRU(*ext_buf_page); + buf_pool.free_ext_page(*ext_buf_page); + bpage= nullptr; + } if (bpage) { @@ -4046,6 +4112,8 @@ void buf_pool_t::get_info(buf_pool_info_t *pool_info) noexcept double(stat.n_pages_read - old_stat.n_pages_read) / elapsed; pool_info->pages_created_rate= double(stat.n_pages_created - old_stat.n_pages_created) / elapsed; + pool_info->n_pages_read_from_ebp= stat.n_pages_read_from_ebp; + pool_info->n_pages_written_to_ebp= stat.n_pages_written_to_ebp; pool_info->pages_written_rate= double(stat.n_pages_written - old_stat.n_pages_written) / elapsed; pool_info->n_page_get_delta= pool_info->n_page_gets - diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index d8b08b4254a87..0c862d9f81467 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -34,6 +34,7 @@ Created 2011/12/19 #include "fil0crypt.h" #include "fil0pagecompress.h" #include "log.h" +#include "scope.h" using st_::span; @@ -86,7 +87,10 @@ bool buf_dblwr_t::create() noexcept { if (is_created()) return true; - + /* Disable external buffer pool flushing for the duration of double write + buffer creating, as double write pages will be removed from LRU */ + ++buf_pool.done_flush_list_waiters_count; + SCOPE_EXIT([]() { --buf_pool.done_flush_list_waiters_count; }); mtr_t mtr{nullptr}; const ulint size= block_size; @@ -680,7 +684,7 @@ bool buf_dblwr_t::flush_buffered_writes(const ulint size) noexcept #ifdef UNIV_DEBUG for (ulint len2= 0, i= 0; i < old_first_free; len2 += srv_page_size, i++) { - buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage; + buf_page_t *bpage= flush_slot->buf_block_arr[i].request.bpage(); if (bpage->zip.data) /* No simple validate for ROW_FORMAT=COMPRESSED pages exists. 
*/ @@ -716,7 +720,7 @@ static void *get_frame(const IORequest &request) noexcept { if (request.slot) return request.slot->out_buf; - const buf_page_t *bpage= request.bpage; + const buf_page_t *bpage= request.bpage(); return bpage->zip.data ? bpage->zip.data : bpage->frame; } @@ -726,8 +730,8 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) ut_ad(this == &buf_dblwr); ut_ad(is_created()); ut_ad(!srv_read_only_mode); - ut_ad(!request.bpage); - ut_ad(request.node == fil_system.sys_space->chain.start); + ut_ad(!request.bpage()); + ut_ad(request.node() == fil_system.sys_space->chain.start); ut_ad(request.type == IORequest::DBLWR_BATCH); mysql_mutex_lock(&mutex); ut_ad(batch_running); @@ -753,14 +757,14 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) log_checkpoint(). Writes to the system tablespace should be rare, except when executing DDL or using the non-default settings innodb_file_per_table=OFF or innodb_undo_tablespaces=0. */ - os_file_flush(request.node->handle); + os_file_flush(request.node()->handle); /* The writes have been flushed to disk now and in recovery we will find them in the doublewrite buffer blocks. Next, write the data pages. 
*/ for (ulint i= 0, first_free= flush_slot->first_free; i < first_free; i++) { auto e= flush_slot->buf_block_arr[i]; - buf_page_t* bpage= e.request.bpage; + buf_page_t* bpage= e.request.bpage(); ut_ad(bpage->in_file()); void *frame= get_frame(e.request); @@ -785,10 +789,10 @@ void buf_dblwr_t::flush_buffered_writes_completed(const IORequest &request) ut_ad(lsn); ut_ad(lsn >= bpage->oldest_modification()); log_write_up_to(lsn, true); - ut_ad(!e.request.node->space->full_crc32() || + ut_ad(!e.request.node()->space->full_crc32() || !buf_page_is_corrupted(true, static_cast(frame), - e.request.node->space->flags)); - e.request.node->space->io(e.request, bpage->physical_offset(), e_size, + e.request.node()->space->flags)); + e.request.node()->space->io(e.request, bpage->physical_offset(), e_size, frame, bpage); } } @@ -820,13 +824,13 @@ flush_buffered_writes() will be invoked to make space. @param size payload size in bytes */ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) noexcept { - ut_ad(request.bpage); - ut_ad(request.bpage->in_file()); - ut_ad(request.node); - ut_ad(!request.node->space->is_temporary()); - ut_ad(!request.node->space->is_being_imported()); - ut_ad(request.node->space->id == request.bpage->id().space()); - ut_ad(request.node->space->referenced()); + ut_ad(request.bpage()); + ut_ad(request.bpage()->in_file()); + ut_ad(request.node()); + ut_ad(!request.node()->space->is_temporary()); + ut_ad(!request.node()->space->is_being_imported()); + ut_ad(request.node()->space->id == request.bpage()->id().space()); + ut_ad(request.node()->space->referenced()); ut_ad(!srv_read_only_mode); const ulint buf_size= 2 * block_size; @@ -854,7 +858,7 @@ void buf_dblwr_t::add_to_batch(const IORequest &request, size_t size) noexcept are integer multiples of 256, so the above can translate into simple SIMD instructions. Currently, we make no such assumptions about the non-pointer parameters that are passed to the _aligned templates. 
*/ - ut_ad(!request.bpage->zip_size() || request.bpage->zip_size() == size); + ut_ad(!request.bpage()->zip_size() || request.bpage()->zip_size() == size); ut_ad(active_slot->reserved == active_slot->first_free); ut_ad(active_slot->reserved < buf_size); new (active_slot->buf_block_arr + active_slot->first_free++) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 2f10c3ded1215..4a4d2a16693bb 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -43,6 +43,7 @@ Created 11/11/1995 Heikki Tuuri #include "fil0pagecompress.h" #include "lzo/lzo1x.h" #include "snappy-c.h" +#include "scope.h" /** Number of pages flushed via LRU. Protected by buf_pool.mutex. Also included in buf_pool.stat.n_pages_written. */ @@ -276,23 +277,27 @@ buf_flush_relocate_on_flush_list( ut_d(buf_flush_validate_low()); } -void buf_page_t::write_complete(bool persistent, bool error, uint32_t state) - noexcept +void buf_page_t::write_complete(space_type type, bool error, + uint32_t state) noexcept { - ut_ad(!persistent == fsp_is_system_temporary(id().space())); + ut_ad(type == EXT_BUF || + (type == TEMPORARY) == fsp_is_system_temporary(id().space())); ut_ad(state >= WRITE_FIX); ut_ad(!frame || frame == reinterpret_cast(this)->frame_address()); if (UNIV_LIKELY(!error)) { + bool persistent = (type == PERSISTENT); ut_d(lsn_t om= oldest_modification()); - ut_ad(om >= 2); + ut_ad(type == EXT_BUF || om >= 2); ut_ad(persistent == (om > 2)); + ut_ad(type != EXT_BUF || !oldest_modification()); /* We use release memory order to guarantee that callers of oldest_modification_acquire() will observe the block as being detached from buf_pool.flush_list, after reading the value 0. */ - oldest_modification_.store(persistent, std::memory_order_release); + if (type != EXT_BUF) + oldest_modification_.store(persistent, std::memory_order_release); } zip.fix.fetch_sub((state >= WRITE_FIX_REINIT) ? 
(WRITE_FIX_REINIT - UNFIXED) @@ -321,14 +326,14 @@ void buf_page_write_complete(const IORequest &request, bool error) noexcept { ut_ad(request.is_write()); ut_ad(!srv_read_only_mode); - buf_page_t *bpage= request.bpage; + buf_page_t *bpage= request.bpage(); ut_ad(bpage); const auto state= bpage->state(); /* io-fix can only be cleared by buf_page_t::write_complete() and buf_page_t::read_complete() */ ut_ad(state >= buf_page_t::WRITE_FIX); ut_ad(!buf_dblwr.is_inside(bpage->id())); - ut_ad(request.node->space->id == bpage->id().space()); + ut_ad(request.ext_buf() || request.node()->space->id == bpage->id().space()); if (request.slot) request.slot->release(); @@ -341,24 +346,53 @@ void buf_page_write_complete(const IORequest &request, bool error) noexcept mysql_mutex_assert_not_owner(&buf_pool.mutex); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); - const bool persistent= bpage->oldest_modification() != 2; + buf_page_t::space_type type= request.ext_buf() + ? buf_page_t::EXT_BUF + : static_cast( + bpage->oldest_modification() == 2); - if (UNIV_UNLIKELY(!persistent) && UNIV_LIKELY(!error)) + if (UNIV_UNLIKELY(type != buf_page_t::PERSISTENT) && UNIV_LIKELY(!error)) { + if (type == buf_page_t::EXT_BUF) + { + ut_d(if (DBUG_IF("ib_ext_bp_count_io_only_for_t")) { + if (fil_space_t *space= fil_space_t::get(bpage->id_.space())) + { + auto space_name= space->name(); + if (fil_page_get_type(bpage->frame) == FIL_PAGE_INDEX && + space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + { + ++buf_pool.stat.n_pages_written_to_ebp; + } + space->release(); + } + } else) + ++buf_pool.stat.n_pages_written_to_ebp; + } /* We must hold buf_pool.mutex while releasing the block, so that no other thread can access it before we have freed it. 
*/ mysql_mutex_lock(&buf_pool.mutex); - bpage->write_complete(persistent, error, state); - buf_LRU_free_page(bpage, true); + bpage->write_complete(type, error, state); + buf_LRU_free_page(bpage, true, + request.ext_buf() ? request.ext_buf_page() : nullptr); mysql_mutex_unlock(&buf_pool.mutex); } else { - bpage->write_complete(persistent, error, state); + if (error && type == buf_page_t::EXT_BUF && + fil_system.ext_buf_pool_enabled()) + { + sql_print_warning("InnoDB: There was IO error during writing to " + "external buffer pool file, external buffer pool is " + "disabled."); + fil_system.ext_buf_pool_disable(); + } + bpage->write_complete(type, error, state); if (request.is_doublewritten()) { ut_ad(state < buf_page_t::WRITE_FIX_REINIT); - ut_ad(persistent); + ut_ad(type == buf_page_t::PERSISTENT); buf_dblwr.write_completed(); } } @@ -728,8 +762,9 @@ ATTRIBUTE_COLD void buf_pool_t::release_freed_page(buf_page_t *bpage) noexcept /** Write a flushable page to a file or free a freeable block. @param space tablespace +@param to_ext_buf whether to write the page to the external buffer pool file @return whether a page write was initiated and buf_pool.mutex released */ -bool buf_page_t::flush(fil_space_t *space) noexcept +bool buf_page_t::flush(fil_space_t *space, bool to_ext_buf) noexcept { mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); ut_ad(in_file()); @@ -742,13 +777,15 @@ bool buf_page_t::flush(fil_space_t *space) noexcept const lsn_t lsn= mach_read_from_8(my_assume_aligned<8> (FIL_PAGE_LSN + (zip.data ? zip.data : frame))); - ut_ad(lsn - ? lsn >= oldest_modification() || oldest_modification() == 2 - : (space->is_temporary() || space->is_being_imported())); + ut_ad(to_ext_buf || + (lsn ? 
lsn >= oldest_modification() || oldest_modification() == 2 + : (space->is_temporary() || space->is_being_imported()))); if (s < UNFIXED) { ut_a(s >= FREED); + if (to_ext_buf) + return false; if (!space->is_temporary() && !space->is_being_imported()) { freed: @@ -767,15 +804,21 @@ bool buf_page_t::flush(fil_space_t *space) noexcept { ut_ad(!space->is_temporary()); ut_ad(!space->is_being_imported()); + if (to_ext_buf) + return false; goto freed; } + ext_buf_page_t *ext_page= nullptr; + if (to_ext_buf && !(ext_page= buf_pool.alloc_ext_page(id()))) + return false; + ut_d(const auto f=) zip.fix.fetch_add(WRITE_FIX - UNFIXED); ut_ad(f >= UNFIXED); ut_ad(f < READ_FIX); - ut_ad((space == fil_system.temp_space) + ut_ad(to_ext_buf || ((space == fil_system.temp_space) ? oldest_modification() == 2 - : oldest_modification() > 2); + : oldest_modification() > 2)); /* Increment the I/O operation count used for selecting LRU policy. */ buf_LRU_stat_inc_io(); @@ -790,7 +833,8 @@ bool buf_page_t::flush(fil_space_t *space) noexcept buf_block_t *block= reinterpret_cast(this); page_t *write_frame= zip.data; - space->reacquire(); + if (!to_ext_buf) + space->reacquire(); size_t size; #if defined HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE || defined _WIN32 size_t orig_size; @@ -848,7 +892,12 @@ bool buf_page_t::flush(fil_space_t *space) noexcept write_frame= page; } - if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) + if (to_ext_buf) { + ut_ad(ext_page); + fil_system.ext_bp_io(*this, *ext_page, IORequest::WRITE_ASYNC, slot, size, + write_frame); + } + else if ((s & LRU_MASK) == REINIT || !space->use_doublewrite()) { if (!space->is_temporary() && !space->is_being_imported() && lsn > log_sys.get_flushed_lsn()) @@ -1260,9 +1309,11 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, const size_t buf_lru_min_len= std::min((buf_pool.usable_size()) / 20 - 1, size_t{BUF_LRU_MIN_LEN}); + ulint free_or_flush= 0; for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); bpage && 
- ((UT_LIST_GET_LEN(buf_pool.LRU) > buf_lru_min_len && + (ut_d(buf_pool.force_LRU_eviction_to_ebp ||) + (UT_LIST_GET_LEN(buf_pool.LRU) > buf_lru_min_len && UT_LIST_GET_LEN(buf_pool.free) < free_limit) || to_withdraw || recv_recovery_is_on()); @@ -1274,6 +1325,7 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, ut_ad(state >= buf_page_t::FREED); ut_ad(bpage->in_LRU_list); + bool flush_to_ebp= false; if (!bpage->oldest_modification()) { evict: @@ -1282,8 +1334,39 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, continue; if (UNIV_UNLIKELY(to_withdraw != 0)) to_withdraw= buf_flush_LRU_to_withdraw(to_withdraw, *bpage); - buf_LRU_free_page(bpage, true); - ++n->evicted; + DBUG_EXECUTE_IF( + "ib_ext_bp_disable_LRU_eviction_for_t", + if (fil_space_t *space= fil_space_t::get(bpage->id_.space())) { + SCOPE_EXIT([space]() { space->release(); }); + auto space_name= space->name(); + if (space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + continue; + }); + DBUG_EXECUTE_IF( + "ib_ext_bp_count_io_only_for_t", + if (fil_space_t *space= fil_space_t::get(bpage->id_.space())) { + SCOPE_EXIT([space]() { space->release(); }); + auto space_name= space->name(); + if (!space_name.data() || + strncmp(space_name.data(), "test/t.ibd", space_name.size())) + goto free_page;; + }); + // FIXME: currently every second page is flushed, consider more + // suitable algorithm there + if (!recv_recovery_is_on() && state != buf_page_t::FREED && + fil_system.ext_bp_size && !buf_pool.done_flush_list_waiters_count && + (ut_d(buf_pool.force_LRU_eviction_to_ebp ||)((++free_or_flush) & 1))) + { + flush_to_ebp= true; + goto flush_to_ebp; + } + else + { + free_page: + buf_LRU_free_page(bpage, true); + ++n->evicted; + } if (UNIV_LIKELY(scanned & 31)) continue; mysql_mutex_unlock(&buf_pool.mutex); @@ -1292,68 +1375,79 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, continue; } - if (state < buf_page_t::READ_FIX 
&& bpage->lock.u_lock_try(true)) +flush_to_ebp: + if ((flush_to_ebp || state < buf_page_t::READ_FIX) && + bpage->lock.u_lock_try(true)) + { + ut_ad(!bpage->is_io_fixed()); + switch (bpage->oldest_modification()) { - ut_ad(!bpage->is_io_fixed()); - switch (bpage->oldest_modification()) { - case 2: - /* LRU flushing will always evict pages of the temporary tablespace, - in buf_page_write_complete(). */ - ++n->evicted; - break; - case 1: - mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (ut_d(lsn_t lsn=) bpage->oldest_modification()) - { - ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */ - buf_pool.delete_from_flush_list(bpage); - } - mysql_mutex_unlock(&buf_pool.flush_list_mutex); - /* fall through */ - case 0: + case 2: + /* LRU flushing will always evict pages of the temporary tablespace, + in buf_page_write_complete(). */ + ++n->evicted; + break; + case 1: + mysql_mutex_lock(&buf_pool.flush_list_mutex); + if (ut_d(lsn_t lsn=) bpage->oldest_modification()) + { + ut_ad(lsn == 1); /* It must be clean while we hold bpage->lock */ + buf_pool.delete_from_flush_list(bpage); + } + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + /* fall through */ + case 0: + if (!flush_to_ebp) + { bpage->lock.u_unlock(true); goto evict; } - /* Block is ready for flush. Dispatch an IO request. */ - const page_id_t page_id(bpage->id()); - const uint32_t space_id= page_id.space(); - if (!space || space->id != space_id) + } + /* Block is ready for flush. Dispatch an IO request. 
*/ + const page_id_t page_id(bpage->id()); + const uint32_t space_id= page_id.space(); + if (!space || space->id != space_id) + { + if (last_space_id != space_id) { - if (last_space_id != space_id) + buf_pool.lru_hp.set(bpage); + mysql_mutex_unlock(&buf_pool.mutex); + if (space) + space->release(); + auto p= buf_flush_space(space_id); + space= p.first; + last_space_id= space_id; + if (!space) { - buf_pool.lru_hp.set(bpage); - mysql_mutex_unlock(&buf_pool.mutex); - if (space) - space->release(); - auto p= buf_flush_space(space_id); - space= p.first; - last_space_id= space_id; - if (!space) - { - mysql_mutex_lock(&buf_pool.mutex); - goto no_space; - } mysql_mutex_lock(&buf_pool.mutex); - buf_pool.stat.n_pages_written+= p.second; - } - else - { - ut_ad(!space); goto no_space; } + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.stat.n_pages_written+= p.second; } - else if (space->is_stopping_writes()) + else { - space->release(); - space= nullptr; - no_space: + ut_ad(!space); + goto no_space; + } + } + else if (space->is_stopping_writes()) + { + space->release(); + space= nullptr; + no_space: + if (flush_to_ebp && !bpage->oldest_modification()) { + bpage->lock.u_unlock(true); + buf_LRU_free_page(bpage, true); + } else { mysql_mutex_lock(&buf_pool.flush_list_mutex); buf_flush_discard_page(bpage); - ++n->evicted; - continue; } + ++n->evicted; + continue; + } - if (state < buf_page_t::UNFIXED) + if (!flush_to_ebp && state < buf_page_t::UNFIXED) goto flush; if (n->flushed >= max && !recv_recovery_is_on()) @@ -1362,13 +1456,24 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, break; } - if (neighbors && space->is_rotational() && UNIV_LIKELY(!to_withdraw) && - /* Skip neighbourhood flush from LRU list if we haven't yet reached - half of the free page target. 
*/ - UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit) + if (flush_to_ebp) + { + /* The page latch will be released in io callback */ + if (!bpage->flush(space, true)) + { + bpage->lock.u_unlock(true); /* unlatch first, as in no_space */ + buf_LRU_free_page(bpage, true); + ++n->evicted; + continue; + } + } + else if (neighbors && space->is_rotational() && + UNIV_LIKELY(!to_withdraw) && + /* Skip neighbourhood flush from LRU list if we haven't yet + reached half of the free page target. */ + UT_LIST_GET_LEN(buf_pool.free) * 2 >= free_limit) n->flushed+= buf_flush_try_neighbors(space, page_id, bpage, - neighbors == 1, - n->flushed, max); + neighbors == 1, n->flushed, max); else { flush: @@ -2095,9 +2200,11 @@ static void buf_flush_wait(lsn_t lsn) noexcept { buf_flush_sync_lsn= lsn; buf_pool.page_cleaner_set_idle(false); + ++buf_pool.done_flush_list_waiters_count; pthread_cond_signal(&buf_pool.do_flush_list); my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; oldest_lsn= buf_pool.get_oldest_modification(lsn); if (oldest_lsn >= lsn) break; @@ -2509,7 +2616,7 @@ bool buf_pool_t::need_LRU_eviction() const noexcept { /* try_LRU_scan==false means that buf_LRU_get_free_block() is waiting for buf_flush_page_cleaner() to evict some blocks */ - return UNIV_UNLIKELY(!try_LRU_scan || + return UNIV_UNLIKELY(ut_d(force_LRU_eviction_to_ebp ||) !try_LRU_scan || (UT_LIST_GET_LEN(LRU) > BUF_LRU_MIN_LEN && UT_LIST_GET_LEN(free) < LRU_scan_depth / 2)); } @@ -2874,11 +2981,13 @@ void buf_flush_sync() noexcept { log_sys.latch.wr_unlock(); mysql_mutex_lock(&buf_pool.flush_list_mutex); + ++buf_pool.done_flush_list_waiters_count; buf_flush_wait(lsn); /* Wait for the page cleaner to be idle (for log resizing at startup) */ while (buf_flush_sync_lsn) my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; mysql_mutex_unlock(&buf_pool.flush_list_mutex); log_sys.latch.wr_lock(SRW_LOCK_CALL); lsn_t 
new_lsn= log_sys.get_lsn(); diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 95e2a6cbe0972..200f3a72d0c2d 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -105,10 +105,11 @@ uint buf_LRU_old_threshold_ms; If !bpage->frame && bpage->oldest_modification() <= 1, the object will be freed. -@param bpage buffer block -@param id page identifier -@param chain locked buf_pool.page_hash chain (will be released here) -@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed +@param bpage buffer block +@param id page identifier +@param chain locked buf_pool.page_hash chain (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed +@param ext_buf_page external buffer page to replace bpage in page hash If a compressed page is freed other compressed pages may be relocated. @retval true if bpage with bpage->frame was removed from page_hash. The @@ -117,7 +118,8 @@ caller needs to free the page to the free list this case the block is already returned to the buddy allocator. */ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, buf_pool_t::hash_chain &chain, - bool zip); + bool zip, + ext_buf_page_t *ext_buf_page= nullptr); /** Free a block to buf_pool */ static void buf_LRU_block_free_hashed_page(buf_block_t *block) @@ -736,7 +738,8 @@ The caller must hold buf_pool.mutex. 
@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page @retval true if freed and buf_pool.mutex may have been temporarily released @retval false if the page was not freed */ -bool buf_LRU_free_page(buf_page_t *bpage, bool zip) +bool buf_LRU_free_page(buf_page_t *bpage, bool zip, + ext_buf_page_t *ext_buf_page) { const page_id_t id{bpage->id()}; buf_page_t* b = nullptr; @@ -820,7 +823,7 @@ bool buf_LRU_free_page(buf_page_t *bpage, bool zip) ut_ad(bpage->can_relocate()); - if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip)) { + if (!buf_LRU_block_remove_hashed(bpage, id, chain, zip, ext_buf_page)) { ut_ad(!b); mysql_mutex_assert_not_owner(&buf_pool.flush_list_mutex); return(true); @@ -993,7 +996,7 @@ ATTRIBUTE_COLD void buf_pool_t::free_block(buf_block_t *block) noexcept mysql_mutex_unlock(&mutex); } -inline void +void buf_pool_t::page_hash_table::remove(buf_pool_t::hash_chain &chain, buf_page_t *bpage) noexcept { @@ -1019,6 +1022,7 @@ If !bpage->frame && !bpage->oldest_modification(), the object will be freed. @param id page identifier @param chain locked buf_pool.page_hash chain (will be released here) @param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed +@param ext_buf_page external buffer page to replace bpage in page hash If a compressed page is freed other compressed pages may be relocated. @retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The @@ -1027,7 +1031,8 @@ caller needs to free the page to the free list this case the block is already returned to the buddy allocator. 
*/ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, buf_pool_t::hash_chain &chain, - bool zip) + bool zip, + ext_buf_page_t *ext_buf_page) { ut_a(bpage->can_relocate()); ut_ad(buf_pool.page_hash.lock_get(chain).is_write_locked()); @@ -1091,7 +1096,16 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, MEM_CHECK_ADDRESSABLE(bpage->zip.data, bpage->zip_size()); } - buf_pool.page_hash.remove(chain, bpage); + if (ext_buf_page) { + buf_pool.push_ext_page_to_LRU(*ext_buf_page); + ut_ad(ext_buf_page->id_ == bpage->id()); + ext_buf_page->hash= bpage->hash; + buf_pool.page_hash.replace(chain, bpage, + reinterpret_cast<buf_page_t *>(ext_buf_page)); + } + else + buf_pool.page_hash.remove(chain, bpage); + page_hash_latch& hash_lock = buf_pool.page_hash.lock_get(chain); if (UNIV_UNLIKELY(!bpage->frame)) { diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 59bb3c5c2327e..b59a2eab817f1 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -57,6 +57,15 @@ read-ahead is not done: this is to prevent flooding the buffer pool with i/o-fixed buffer blocks */ #define BUF_READ_AHEAD_PEND_LIMIT 2 +/** The result, returned by buf_page_init_for_read() */ +struct page_init_result { +/* page_init_result() : bpage(nullptr), ext_buf_page(nullptr) {} */ + bool in_ext_buffer_pool() const noexcept { return ext_buf_page; } + buf_page_t* bpage; /* Initialized page */ + ext_buf_page_t *ext_buf_page; /* External buffer pool page if bpage can be + read from the external buffer pool file */ +}; + /** Initialize a page for read to the buffer buf_pool. If the page is (1) already in buf_pool, or (2) if the tablespace has been or is being deleted, @@ -69,39 +78,59 @@ and the lock released later.
bitwise-ORed with 1 in recovery @param chain buf_pool.page_hash cell for page_id @param block preallocated buffer block (set to nullptr if consumed) -@return pointer to the block -@retval nullptr in case of an error -@retval pointer to block | 1 if the page already exists in buf_pool */ -static buf_page_t *buf_page_init_for_read(const page_id_t page_id, - ulint zip_size, - buf_pool_t::hash_chain &chain, - buf_block_t *&block) noexcept +@retval page_init_result::bpage == nullptr in case of an error, + otherwise + page_init_result::bpage points to initialized page and + the first bit of page_init_result::bpage is set only if the + page already exists in buf_pool, + or + page_init_result::ext_buf_page point to external buffer pool + page if page_init_result::bpage can be read from external + buffer pool file */ +static page_init_result buf_page_init_for_read(const page_id_t page_id, + ulint zip_size, + buf_pool_t::hash_chain &chain, + buf_block_t *&block) noexcept { buf_page_t *bpage= !zip_size || (zip_size & 1) ? &block->page : nullptr; + ext_buf_page_t *ext_buf_page= nullptr; constexpr uint32_t READ_BUF_FIX{buf_page_t::READ_FIX + 1}; page_hash_latch &hash_lock= buf_pool.page_hash.lock_get(chain); hash_lock.lock(); - buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain); + buf_page_t *hash_page= buf_pool.page_hash.get(page_id, chain); if (hash_page) { - page_exists: - /* The page is already in the buffer pool. */ - ut_d(const uint32_t state=) hash_page->fix(); - ut_ad(state >= buf_page_t::FREED); - hash_lock.unlock(); - return reinterpret_cast(uintptr_t(hash_page) | 1); + if (hash_page->external()) + ext_buf_page= reinterpret_cast(hash_page); + else + { + page_exists: + /* The page is already in the buffer pool. 
*/ + ut_d(const uint32_t state=) hash_page->fix(); + ut_ad(state >= buf_page_t::FREED); + hash_lock.unlock(); + return page_init_result{ + reinterpret_cast(uintptr_t(hash_page) | 1), nullptr}; + } } if (UNIV_UNLIKELY(mysql_mutex_trylock(&buf_pool.mutex))) { hash_lock.unlock(); + /* ext_buf_page can be set previously, and should be zeroed out to prevent + wrong value usage afterwards */ + ext_buf_page= nullptr; mysql_mutex_lock(&buf_pool.mutex); hash_lock.lock(); - hash_page= buf_pool.page_hash.get(page_id, chain); + hash_page= buf_pool.page_hash.get(page_id, chain); if (hash_page) { - mysql_mutex_unlock(&buf_pool.mutex); - goto page_exists; + if (hash_page->external()) + ext_buf_page= reinterpret_cast(hash_page); + else { + mysql_mutex_unlock(&buf_pool.mutex); + goto page_exists; + } } } @@ -116,7 +145,18 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, in buf_page_t::read_complete() by the io-handler thread. */ bpage->lock.x_lock(true); /* Insert into the hash table of file pages */ - buf_pool.page_hash.append(chain, bpage); + if (ext_buf_page) + { + bpage->hash= ext_buf_page->hash; + ut_d(bpage->in_page_hash= true); + buf_pool.page_hash.replace( + chain, reinterpret_cast(ext_buf_page), + bpage); + ut_ad(!ext_buf_page->in_page_hash); + buf_pool.remove_ext_page_from_LRU(*ext_buf_page); + } + else + buf_pool.page_hash.append(chain, bpage); hash_lock.unlock(); /* The block must be put to the LRU list, to the old blocks */ @@ -135,7 +175,8 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, we have to add this block to unzip_LRU after block->page.zip.data is set. */ ut_ad(bpage->belongs_to_unzip_LRU()); - buf_unzip_LRU_add_block(reinterpret_cast(bpage), TRUE); + buf_unzip_LRU_add_block(reinterpret_cast(bpage), + TRUE); } } else @@ -153,20 +194,29 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, check the page_hash again, as it may have been modified. 
*/ if (UNIV_UNLIKELY(lru)) { - hash_page= buf_pool.page_hash.get(page_id, chain); + hash_page= buf_pool.page_hash.get(page_id, chain); if (UNIV_LIKELY_NULL(hash_page)) { - /* The block was added by some other thread. */ - ut_d(const uint32_t state=) hash_page->fix(); - ut_ad(state >= buf_page_t::FREED); - buf_buddy_free(data, zip_size); - mysql_mutex_unlock(&buf_pool.mutex); - return reinterpret_cast(uintptr_t(hash_page) | 1); + if (UNIV_UNLIKELY(hash_page->external())) + ext_buf_page= reinterpret_cast(hash_page); + else + { + /* The block was added by some other thread. */ + ut_d(const uint32_t state=) hash_page->fix(); + ut_ad(state >= buf_page_t::FREED); + buf_buddy_free(data, zip_size); + mysql_mutex_unlock(&buf_pool.mutex); + return page_init_result{ + reinterpret_cast(uintptr_t(hash_page) | 1), + nullptr}; + } } } - bpage= static_cast(ut_zalloc_nokey(sizeof *bpage)); + bpage= + static_cast(ut_zalloc_nokey(sizeof *bpage)); + // TODO: do we need to init it for compressed pages? I think no. page_zip_des_init(&bpage->zip); page_zip_set_size(&bpage->zip, zip_size); bpage->zip.data = (page_zip_t*) data; @@ -179,19 +229,39 @@ static buf_page_t *buf_page_init_for_read(const page_id_t page_id, bpage->init(READ_BUF_FIX, page_id); bpage->lock.x_lock(true); - hash_lock.lock(); - buf_pool.page_hash.append(chain, bpage); - hash_lock.unlock(); + + if (ext_buf_page) { + bpage->hash= ext_buf_page->hash; + ut_d(bpage->in_page_hash= true); + hash_lock.lock(); + buf_pool.page_hash.replace( + chain, reinterpret_cast(ext_buf_page), bpage); + hash_lock.unlock(); + ut_ad(!ext_buf_page->in_page_hash); + buf_pool.remove_ext_page_from_LRU(*ext_buf_page); + } + else + { + hash_lock.lock(); + buf_pool.page_hash.append(chain, bpage); + hash_lock.unlock(); + } /* The block must be put to the LRU list, to the old blocks. 
The zip size is already set into the page zip */ buf_LRU_add_block(bpage, true/* to old blocks */); } - buf_pool.stat.n_pages_read++; + if (!ext_buf_page) + buf_pool.stat.n_pages_read++; ut_ad(!bpage || bpage->in_file()); + if (ext_buf_page && !fil_system.ext_buf_pool_enabled()) + { + buf_pool.free_ext_page(*ext_buf_page); + ext_buf_page= nullptr; + } mysql_mutex_unlock(&buf_pool.mutex); - return bpage; + return page_init_result{bpage, ext_buf_page}; } inline ulonglong mariadb_measure() noexcept @@ -273,14 +343,20 @@ buf_read_page_low( return nullptr; } - buf_page_t *bpage= buf_page_init_for_read(page_id, zip_size, chain, block); + auto init_page_result= + buf_page_init_for_read(page_id, zip_size, chain, block); + buf_page_t *bpage= init_page_result.bpage; if (UNIV_UNLIKELY(!bpage)) + { + ut_ad(!init_page_result.ext_buf_page); goto fail; + } const bool exist(uintptr_t(bpage) & 1); bpage= reinterpret_cast(uintptr_t(bpage) & ~uintptr_t{1}); trx_t *const trx= thd ? thd_to_trx(thd) : nullptr; if (exist) { + ut_ad(!init_page_result.ext_buf_page); if (!err) { bpage->unfix(); @@ -320,16 +396,28 @@ buf_read_page_low( void* dst= zip_size > 1 ? bpage->zip.data : bpage->frame; const size_t len= zip_size & ~1 ? zip_size & ~1 : srv_page_size; - + /* Synchronous read */ if (err != nullptr) { thd_wait_begin(thd, THD_WAIT_DISKIO); ha_handler_stats *const stats= trx ? trx->active_handler_stats : nullptr; const ulonglong start= stats ? mariadb_measure() : 0; - auto fio= space->io(IORequest(IORequest::READ_SYNC), + auto fio= + init_page_result.in_ext_buffer_pool() + ? 
fil_io_t{fil_system.ext_bp_io( + *bpage, *init_page_result.ext_buf_page, + IORequest::READ_SYNC, nullptr, len, dst), + nullptr} + : space->io(IORequest(IORequest::READ_SYNC), os_offset_t{page_id.page_no()} * len, len, dst, bpage); *err= fio.err; thd_wait_end(thd); + if (init_page_result.in_ext_buffer_pool()) + { + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.free_ext_page(*init_page_result.ext_buf_page); + mysql_mutex_unlock(&buf_pool.mutex); + } if (stats) { stats->pages_read_count++; @@ -338,22 +426,44 @@ buf_read_page_low( } if (UNIV_LIKELY(*err == DB_SUCCESS)) { - *err= bpage->read_complete(*fio.node, recv_sys.recovery_on); + *err= bpage->read_complete(init_page_result.in_ext_buffer_pool() + ? *UT_LIST_GET_FIRST(space->chain) + : *fio.node, + recv_sys.recovery_on); if (*err) bpage= nullptr; space->release(); - + if (init_page_result.in_ext_buffer_pool()) + { + ut_d(if (DBUG_IF("ib_ext_bp_count_io_only_for_t")) { + auto space_name= space->name(); + if (fil_page_get_type(bpage->frame) == FIL_PAGE_INDEX && + space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + ++buf_pool.stat.n_pages_read_from_ebp; + } else)++ buf_pool.stat.n_pages_read_from_ebp; + } /* FIXME: Remove this, and accumulate stats->pages_read_count to global statistics somewhere! 
*/ buf_LRU_stat_inc_io(); return bpage; } } + else if (init_page_result.in_ext_buffer_pool()) + { + auto err= fil_system.ext_bp_io(*bpage, *init_page_result.ext_buf_page, + IORequest::READ_ASYNC, nullptr, len, dst); + space->release(); + if (UNIV_LIKELY(DB_SUCCESS == err)) + return reinterpret_cast(-1); + } else if (UNIV_LIKELY(DB_SUCCESS == - space->io(IORequest(IORequest::READ_ASYNC), - os_offset_t{page_id.page_no()} * len, len, - dst, bpage).err)) - return reinterpret_cast(-1); + space + ->io(IORequest(IORequest::READ_ASYNC), + os_offset_t{page_id.page_no()} * len, len, dst, + bpage) + .err)) + return reinterpret_cast(-1); recv_sys.free_corrupted_page(page_id, *space->chain.start); buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX + 1); @@ -791,7 +901,10 @@ void buf_read_recover(fil_space_t *space, const page_id_t page_id, if (init_lsn) { - buf_page_t *bpage= buf_page_init_for_read(page_id, zip_size, chain, block); + auto init_page_result= + buf_page_init_for_read(page_id, zip_size, chain, block); + ut_ad(!init_page_result.ext_buf_page); + buf_page_t *bpage= init_page_result.bpage; if (UNIV_UNLIKELY(!bpage)) goto fail; const bool exist(uintptr_t(bpage) & 1); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index f6f2331f6d258..a56b628e03453 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -59,6 +59,9 @@ Created 10/25/1995 Heikki Tuuri #include "bzlib.h" #include "snappy-c.h" +/* External buffer pool file name */ +const char *ext_bp_file_name= "ext_buffer_pool"; + ATTRIBUTE_COLD bool fil_space_t::set_corrupted() const noexcept { if (!is_stopping() && !is_corrupted.test_and_set()) @@ -497,7 +500,8 @@ pfs_os_file_t fil_node_t::detach() noexcept void fil_node_t::prepare_to_close_or_detach() noexcept { mysql_mutex_assert_owner(&fil_system.mutex); - ut_ad(space->is_ready_to_close() || srv_operation == SRV_OPERATION_BACKUP || + ut_ad(space->is_ready_to_close() || + srv_operation == 
SRV_OPERATION_BACKUP || srv_operation == SRV_OPERATION_RESTORE_DELTA); ut_a(is_open()); ut_a(!being_extended); @@ -1295,6 +1299,14 @@ void fil_system_t::close() noexcept if (is_initialised()) { + if (ext_bp_file != OS_FILE_CLOSED) + { + int res= mysql_file_close( + IF_WIN(my_win_handle2File((os_file_t) ext_bp_file), ext_bp_file), + MYF(MY_WME)); + ut_a(res != -1); + ext_bp_file= OS_FILE_CLOSED; + } spaces.free(); mysql_mutex_destroy(&mutex); fil_space_crypt_cleanup(); @@ -2917,18 +2929,72 @@ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, return {err, node}; } +bool fil_system_t::create_ext_file() noexcept { + bool ret; + ext_bp_file= pfs_create_temp_file( + ext_bp_path ? ext_bp_path : fil_path_to_mysql_datadir, + "/Extended buffer pool file", "ext_buf_"); + if (ext_bp_file == OS_FILE_CLOSED) + { + sql_print_error("Cannot open/create extended buffer pool file"); + /* Report OS error in error log */ + (void)os_file_get_last_error(true, false); + return false; + } + ret= os_file_set_size(ext_bp_file_name, ext_bp_file.m_file, ext_bp_size); + if (!ret) + { + os_file_close_func(ext_bp_file.m_file); + sql_print_error("Cannot set extended buffer pool file size to %zum", + ext_bp_size); + return false; + } + return true; +} + +dberr_t fil_system_t::ext_bp_io(buf_page_t &bpage, ext_buf_page_t &ext_page, + IORequest::Type io_request_type, + buf_tmp_buffer_t *slot, size_t len, + void *buf) noexcept +{ + ut_ad(len % 512 == 0); /* page_compressed */ + ut_ad(io_request_type == IORequest::WRITE_ASYNC || + io_request_type == IORequest::READ_SYNC || + io_request_type == IORequest::READ_ASYNC) ; + /* Queue the aio request */ + return os_aio(IORequest{&bpage, slot, &ext_page, io_request_type}, buf, + buf_pool.ext_page_offset(ext_page), len, ext_bp_file, + ext_bp_file_name); +} + #include void IORequest::write_complete(int io_error) const noexcept { ut_ad(fil_validate_skip()); - ut_ad(node); - fil_space_t *space= node->space; + ut_ad(node_ptr); + 
buf_page_t *buf_page= bpage(); ut_ad(is_write()); - if (!bpage) + fil_space_t *space; + if (ext_buf()) + { + space= fil_space_t::get(buf_page->id().space()); + if (!space) + { + buf_page->lock.u_unlock(true); + // TODO: should we update the statistics here? + //++buf_pool.stat.n_pages_written_to_ebp; + return; + } + } + else + space= node_ptr->space; + + if (!buf_page) { ut_ad(!srv_read_only_mode); + ut_ad(!ext_buf()); if (type == IORequest::DBLWR_BATCH) { buf_dblwr.flush_buffered_writes_completed(*this); @@ -2942,30 +3008,65 @@ void IORequest::write_complete(int io_error) const noexcept else buf_page_write_complete(*this, io_error); - space->complete_write(); + if (!ext_buf()) + space->complete_write(); func_exit: space->release(); } void IORequest::read_complete(int io_error) const noexcept { + buf_page_t *buf_page= bpage(); ut_ad(fil_validate_skip()); - ut_ad(node); + ut_ad(node_ptr); ut_ad(is_read()); - ut_ad(bpage); - ut_d(auto s= bpage->state()); + ut_ad(bpage()); + ut_d(auto s= bpage()->state()); ut_ad(s > buf_page_t::READ_FIX); ut_ad(s <= buf_page_t::WRITE_FIX); - const page_id_t id(bpage->id()); + fil_space_t *space; + if (ext_buf()) { + ut_ad(ext_buf_page()->id_ == buf_page->id()); + mysql_mutex_lock(&buf_pool.mutex); + buf_pool.free_ext_page(*ext_buf_page()); + mysql_mutex_unlock(&buf_pool.mutex); + /* The space will be released at the end of this function */ + space= fil_space_t::get(buf_page->id().space()); + if (!space) { + buf_page->lock.x_unlock(true); + ++buf_pool.stat.n_pages_read_from_ebp; + return; + } + ut_d(if (DBUG_IF("ib_ext_bp_count_io_only_for_t")) { + auto space_name= space->name(); + if (fil_page_get_type(buf_page->frame) == FIL_PAGE_INDEX && + space_name.data() && + !strncmp(space_name.data(), "test/t.ibd", space_name.size())) + { + ++buf_pool.stat.n_pages_read_from_ebp; + } + } else) + ++buf_pool.stat.n_pages_read_from_ebp; + } + else + space= node_ptr->space; + + const page_id_t id(buf_page->id()); const bool 
in_recovery{recv_sys.recovery_on}; if (UNIV_UNLIKELY(io_error != 0)) { sql_print_error("InnoDB: Read error %d of page " UINT32PF " in file %s", - io_error, id.page_no(), node->name); - recv_sys.free_corrupted_page(id, *node); - buf_pool.corrupted_evict(bpage, buf_page_t::READ_FIX + 1); + io_error, id.page_no(), + ext_buf() ? "of external buffer pool, external buffer " + "pool is disabled" + : node_ptr->name); + if (ext_buf()) + fil_system.ext_buf_pool_disable(); + else + recv_sys.free_corrupted_page(id, *node_ptr); + buf_pool.corrupted_evict(buf_page, buf_page_t::READ_FIX + 1); corrupted: if (in_recovery && !srv_force_recovery) { @@ -2974,12 +3075,14 @@ void IORequest::read_complete(int io_error) const noexcept mysql_mutex_unlock(&recv_sys.mutex); } } - else if (bpage->read_complete(*node, in_recovery)) + else if (bpage()->read_complete(ext_buf() ? *UT_LIST_GET_FIRST(space->chain) + : *node_ptr, + in_recovery)) goto corrupted; else - bpage->unfix(); + bpage()->unfix(); - node->space->release(); + space->release(); } /** Flush to disk the writes in file spaces of the given type @@ -3347,3 +3450,44 @@ fil_space_t *fil_space_t::prev_in_unflushed_spaces() noexcept } #endif + +pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, + const char *prefix) +{ + if (!path) + { + path= mysql_tmpdir; + } +#ifdef UNIV_PFS_IO + /* This temp file open does not go through normal + file APIs, add instrumentation to register with + performance schema */ + struct PSI_file_locker *locker; + PSI_file_locker_state state; + char *name= + static_cast(ut_malloc_nokey(strlen(path) + strlen(label) + 1)); + strcpy(name, path); + strcat(name, label); + + register_pfs_file_open_begin(&state, locker, innodb_temp_file_key, + PSI_FILE_CREATE, path ? 
name : label, __FILE__, + __LINE__); + +#endif + DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); + char filename[FN_REFLEN]; + File f= create_temp_file(filename, path, prefix, O_BINARY | O_SEQUENTIAL, + MYF(MY_WME | MY_TEMPORARY)); + pfs_os_file_t fd= IF_WIN((os_file_t) my_get_osfhandle(f), f); + +#ifdef UNIV_PFS_IO + register_pfs_file_open_end(locker, fd, (fd == OS_FILE_CLOSED) ? NULL : &fd); + ut_free(name); +#endif + + if (fd == OS_FILE_CLOSED) + { + ib::error() << "Cannot create temporary merge file"; + } + return (fd); +} diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index bb325313d8b57..0a332f628e290 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -963,8 +963,12 @@ static SHOW_VAR innodb_status_variables[]= { &buf_pool.stat.n_ra_pages_evicted, SHOW_SIZE_T}, {"buffer_pool_read_requests", &buf_pool.stat.n_page_gets, SHOW_SIZE_T}, {"buffer_pool_reads", &buf_pool.stat.n_pages_read, SHOW_SIZE_T}, + {"ext_buffer_pool_reads", &buf_pool.stat.n_pages_read_from_ebp, + SHOW_SIZE_T}, {"buffer_pool_wait_free", &buf_pool.stat.LRU_waits, SHOW_SIZE_T}, {"buffer_pool_write_requests", &buf_pool.flush_list_requests, SHOW_SIZE_T}, + {"ext_buffer_pool_pages_flushed", &buf_pool.stat.n_pages_written_to_ebp, + SHOW_SIZE_T}, {"checkpoint_age", &export_vars.innodb_checkpoint_age, SHOW_SIZE_T}, {"checkpoint_max_age", &export_vars.innodb_checkpoint_max_age, SHOW_SIZE_T}, {"data_fsyncs", (size_t*) &os_n_fsyncs, SHOW_SIZE_T}, @@ -3651,6 +3655,40 @@ static void innodb_buffer_pool_size_update(THD* thd,st_mysql_sys_var*,void*, buf_pool.resize(*static_cast(save), thd); } +static void innodb_extended_buffer_pool_size_update(THD *thd, + st_mysql_sys_var *, void *, + const void *save) +{ + buf_pool.extended_pages= + (*static_cast(save) >> srv_page_size_shift); + fil_system.ext_bp_size= buf_pool.extended_pages << srv_page_size_shift; +} + +#ifdef UNIV_DEBUG +static void innodb_force_LRU_eviction_set(THD *, 
st_mysql_sys_var *, void *, + const void *save) +{ + buf_pool.force_LRU_eviction_to_ebp= *static_cast(save); + if (buf_pool.force_LRU_eviction_to_ebp) + { + /* Wake up page cleaner twice, the first one is to flush dirty pages to + data files, the second one is to flush clean pages to external buffer pool. + */ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(true); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + mysql_mutex_lock(&buf_pool.flush_list_mutex); + buf_pool.page_cleaner_wakeup(true); + my_cond_wait(&buf_pool.done_flush_list, + &buf_pool.flush_list_mutex.m_mutex); + mysql_mutex_unlock(&buf_pool.flush_list_mutex); + } + buf_pool.force_LRU_eviction_to_ebp= false; +} +#endif /* UNIV_DEBUG */ + static MYSQL_SYSVAR_SIZE_T(buffer_pool_size, buf_pool.size_in_bytes_requested, PLUGIN_VAR_RQCMDARG, "The size of the memory buffer InnoDB uses to cache data" @@ -3689,6 +3727,24 @@ static MYSQL_SYSVAR_UINT(log_write_ahead_size, log_sys.write_size, "Redo log write size to avoid read-on-write; must be a power of two", nullptr, nullptr, 512, 512, 4096, 1); +static MYSQL_SYSVAR_SIZE_T(extended_buffer_pool_size, fil_system.ext_bp_size, + PLUGIN_VAR_RQCMDARG, + "The extended buffer pool file size", + nullptr, innodb_extended_buffer_pool_size_update, + // TODO: set correct min and max values here. 
+ 0, 0, SIZE_T_MAX, 0); + +static MYSQL_SYSVAR_STR(extended_buffer_pool_path, fil_system.ext_bp_path, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Path to extended buffer pool file", + nullptr, nullptr, nullptr); + +#ifdef UNIV_DEBUG +static MYSQL_SYSVAR_BOOL(force_LRU_eviction, buf_pool.force_LRU_eviction_to_ebp, + PLUGIN_VAR_OPCMDARG, + "Wake up page cleaner and wait for pages flushing end, used for testing only", + NULL, innodb_force_LRU_eviction_set, FALSE); +#endif #ifdef BTR_CUR_HASH_ADAPT static void innodb_adaptive_hash_index_update(THD*, st_mysql_sys_var*, void*, @@ -3821,6 +3877,8 @@ static int innodb_init_params() buf_pool.size_in_bytes_max; #endif + buf_pool.extended_pages = fil_system.ext_bp_size >> srv_page_size_shift; + if (innodb_buffer_pool_size < min) { sql_print_error("InnoDB: innodb_page_size=%lu requires " @@ -18650,9 +18708,11 @@ static void innodb_log_file_size_update(THD *thd, st_mysql_sys_var*, lsn_t resizing= log_sys.resize_in_progress(); if (resizing > buf_pool.get_oldest_modification(0)) { + ++buf_pool.done_flush_list_waiters_count; buf_pool.page_cleaner_wakeup(true); my_cond_timedwait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex, &abstime); + --buf_pool.done_flush_list_waiters_count; resizing= log_sys.resize_in_progress(); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); @@ -19881,6 +19941,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(buffer_pool_size_auto_min), #endif MYSQL_SYSVAR(buffer_pool_size_max), + MYSQL_SYSVAR(extended_buffer_pool_size), + MYSQL_SYSVAR(extended_buffer_pool_path), +#ifdef UNIV_DEBUG + MYSQL_SYSVAR(force_LRU_eviction), +#endif MYSQL_SYSVAR(buffer_pool_chunk_size), MYSQL_SYSVAR(buffer_pool_filename), MYSQL_SYSVAR(buffer_pool_dump_now), diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index 5eed69d1f72e5..105619630534c 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -3361,6 +3361,12 @@ static 
ST_FIELD_INFO i_s_innodb_buffer_stats_fields_info[]= #define IDX_BUF_STATS_UNZIP_CUR 31 Column("UNCOMPRESS_CURRENT", ULonglong(), NOT_NULL), +#define IDX_BUF_STATS_PAGE_WRITTEN_TO_EBP 32 + Column("NUMBER_PAGES_WRITTEN_TO_EXTERNAL_BUFFER_POOL",ULonglong(), NOT_NULL), + +#define IDX_BUF_STATS_PAGE_READ_FROM_EBP 33 + Column("NUMBER_PAGES_READ_FROM_EXTERNAL_BUFFER_POOL",ULonglong(), NOT_NULL), + CEnd() }; } // namespace Show @@ -3428,12 +3434,18 @@ static int i_s_innodb_stats_fill(THD *thd, TABLE_LIST * tables, Item *) OK(fields[IDX_BUF_STATS_PAGE_READ]->store(info.n_pages_read, true)); + OK(fields[IDX_BUF_STATS_PAGE_READ_FROM_EBP]->store( + info.n_pages_read_from_ebp, true)); + OK(fields[IDX_BUF_STATS_PAGE_CREATED]->store( info.n_pages_created, true)); OK(fields[IDX_BUF_STATS_PAGE_WRITTEN]->store( info.n_pages_written, true)); + OK(fields[IDX_BUF_STATS_PAGE_WRITTEN_TO_EBP]->store( + info.n_pages_written_to_ebp, true)); + OK(fields[IDX_BUF_STATS_GET]->store(info.n_page_gets, true)); OK(fields[IDX_BUF_STATS_PAGE_READ_RATE]->store( diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 231df868166f8..30063690db3e4 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -85,8 +85,12 @@ struct buf_pool_info_t ulint n_pages_made_young; /*!< number of pages made young */ ulint n_pages_not_made_young; /*!< number of pages not made young */ ulint n_pages_read; /*!< buf_pool.n_pages_read */ + /** buf_pool.n_pages_read_from_ebp */ + ulint n_pages_read_from_ebp; ulint n_pages_created; /*!< buf_pool.n_pages_created */ ulint n_pages_written; /*!< buf_pool.n_pages_written */ + /** buf_pool.n_pages_written_to_ebp */ + ulint n_pages_written_to_ebp; ulint n_page_gets; /*!< buf_pool.n_page_gets */ ulint n_ra_pages_read_rnd; /*!< buf_pool.n_ra_pages_read_rnd, number of pages readahead */ @@ -457,15 +461,11 @@ for compressed and uncompressed frames */ class buf_pool_t; -class buf_page_t +struct buf_page_base_t { - 
friend buf_pool_t; - friend buf_block_t; - - /** @name General fields */ - /* @{ */ - -public: // FIXME: fix fil_iterate() + /** ext_buf_page_t identifier */ + static constexpr std::uintptr_t EXT_BUF_FRAME{1}; + // FIXME: fix fil_iterate() /** Page id. Protected by buf_pool.page_hash.lock_get() when the page is in buf_pool.page_hash. */ page_id_t id_; @@ -483,7 +483,57 @@ class buf_page_t uint16_t free_offset; }; }; -private: + /** pointer to aligned, uncompressed page frame of innodb_page_size */ + byte *frame; +#ifdef UNIV_DEBUG + /** whether this->LRU is in buf_pool.LRU (in_file()); + protected by buf_pool.mutex */ + bool in_LRU_list; + /** whether this is in buf_pool.page_hash (in_file()); + protected by buf_pool.mutex */ + bool in_page_hash; + /** whether this->list is in buf_pool.free (state() == NOT_USED); + protected by buf_pool.flush_list_mutex */ + bool in_free_list; +#endif /* UNIV_DEBUG */ + buf_page_base_t() : id_{0} {} + buf_page_base_t(const buf_page_base_t &b) + : id_(b.id_), hash(b.hash), frame(b.frame) +#ifdef UNIV_DEBUG + , + in_LRU_list(b.in_LRU_list), in_page_hash(b.in_page_hash), + in_free_list(b.in_free_list) +#endif /* UNIV_DEBUG */ + { + } + + bool external() const noexcept + { + /* TODO: we could just compare the address of the page, as it is done for + sentinel pages, and use *frame for something else */ + return reinterpret_cast<std::uintptr_t>(frame) == EXT_BUF_FRAME; + } +}; + +/* External buffer pool page. The first 3 members (6 for debug build) must be +the same as in buf_page_t. The "frame" member must always be equal to +EXT_BUF_FRAME, this is how we determine if some page is an external one.
*/ +struct ext_buf_page_t : public buf_page_base_t { +public: + /** Node of buf_pool_t::ext_free */ + UT_LIST_NODE_T(ext_buf_page_t) free_list; + /** Node of buf_pool_t::ext_LRU */ + UT_LIST_NODE_T(ext_buf_page_t) ext_LRU_list; +}; + +class buf_page_t : public buf_page_base_t +{ + friend buf_pool_t; + friend buf_block_t; + + /** @name General fields */ + /* @{ */ + /** log sequence number of the START of the log entry written of the oldest modification to this block which has not yet been written to the data file; @@ -520,23 +570,10 @@ class buf_page_t /** lock covering the contents of frame() */ block_lock lock; - /** pointer to aligned, uncompressed page frame of innodb_page_size */ - byte *frame; /* @} */ /** ROW_FORMAT=COMPRESSED page; zip.data (but not the data it points to) is also protected by buf_pool.mutex */ page_zip_des_t zip; -#ifdef UNIV_DEBUG - /** whether this->LRU is in buf_pool.LRU (in_file()); - protected by buf_pool.mutex */ - bool in_LRU_list; - /** whether this is in buf_pool.page_hash (in_file()); - protected by buf_pool.mutex */ - bool in_page_hash; - /** whether this->list is in buf_pool.free (state() == NOT_USED); - protected by buf_pool.flush_list_mutex */ - bool in_free_list; -#endif /* UNIV_DEBUG */ /** list member in one of the lists of buf_pool; protected by buf_pool.mutex or buf_pool.flush_list_mutex @@ -569,21 +606,17 @@ class buf_page_t Atomic_counter access_time; /*!< time of first access, or 0 if the block was never accessed in the buffer pool. 
*/ - buf_page_t() : id_{0} + buf_page_t() : buf_page_base_t() { static_assert(NOT_USED == 0, "compatibility"); memset((void*) this, 0, sizeof *this); } buf_page_t(const buf_page_t &b) : - id_(b.id_), hash(b.hash), + buf_page_base_t(b), oldest_modification_(b.oldest_modification_), - lock() /* not copied */, - frame(b.frame), zip(b.zip), -#ifdef UNIV_DEBUG - in_LRU_list(b.in_LRU_list), - in_page_hash(b.in_page_hash), in_free_list(b.in_free_list), -#endif /* UNIV_DEBUG */ + lock(), /* not copied */ + zip(b.zip), list(b.list), LRU(b.LRU), old(b.old), freed_page_clock(b.freed_page_clock), access_time(b.access_time) { @@ -724,16 +757,26 @@ class buf_page_t @param trx transaction (for updating trx->active_handler_stats) */ void read_wait(trx_t *trx) noexcept; + /** Space type for write complete */ + enum space_type + { + PERSISTENT, /* Persistent space */ + TEMPORARY, /* Temporary space */ + EXT_BUF /* External buffer pool space */ + }; + /** Release a write fix after a page write was completed. - @param persistent whether the page belongs to a persistent tablespace + @param type the type of space which the page was written to @param error whether an error may have occurred while writing @param state recently read state() value with the correct io-fix */ - void write_complete(bool persistent, bool error, uint32_t state) noexcept; + void write_complete(space_type type, bool error, + uint32_t state) noexcept; /** Write a flushable page to a file or free a freeable block. @param space tablespace + @param to_ext_buf whether to write the page to external buffer pool file @return whether a page write was initiated and buf_pool.mutex released */ - bool flush(fil_space_t *space) noexcept; + bool flush(fil_space_t *space, bool to_ext_buf= false) noexcept; /** Notify that a page in a temporary tablespace has been modified.
*/ void set_temp_modified() noexcept @@ -1050,7 +1093,11 @@ struct buf_pool_stat_t{ ulint n_page_gets_nonatomic; }; ulint n_pages_read; /*!< number read operations */ + /** Number of pages, read from external buffer pool file */ + ulint n_pages_read_from_ebp; ulint n_pages_written;/*!< number write operations */ + /** Number of pages, written to external buffer pool file */ + ulint n_pages_written_to_ebp; ulint n_pages_created;/*!< number of pages created in the pool with no read */ ulint n_ra_pages_read_rnd;/*!< number of pages read in @@ -1108,6 +1155,11 @@ class buf_pool_t protected by mutex */ Atomic_relaxed size_in_bytes; + /** External buffer pool pages array*/ + ext_buf_page_t *ext_buf_pages_array; + /** External buffer pool pages free list, protected with buf_pool.mutex */ + UT_LIST_BASE_NODE_T(ext_buf_page_t) ext_free; + public: /** The requested innodb_buffer_pool_size */ size_t size_in_bytes_requested; @@ -1117,6 +1169,71 @@ class buf_pool_t #endif /** The maximum allowed innodb_buffer_pool_size */ size_t size_in_bytes_max; + /** Amount of pages in extended buffer pool file and the size of + ext_buf_pages_array */ + size_t extended_pages; +#ifdef UNIV_DEBUG + /** Shows if force LRU eviction to external buffer pool is currently on. + Debug only. */ + my_bool force_LRU_eviction_to_ebp; +#endif + /** Hash cell chain in page_hash_table */ + struct hash_chain + { + /** pointer to the first block */ + buf_page_t *first; + }; + + /** Allocates external buffer pool page. Tries to get a page from external + buffer pool free list. If the list is empty, tries to get page from the tail + of external buffer pool LRU list, if the corresponding page hash chain is not + locked, removes the page from the chain.
+ @param page_id page id which will be assigned to allocated page + @return allocated external buffer pool page or nullptr if free list is empty + and all page hash chains were locked */ + ext_buf_page_t *alloc_ext_page(page_id_t page_id) noexcept; + + /** Frees external buffer pool page. Pushes a page to the head of external + buffer pool free list. + @param p page to free. */ + void free_ext_page(ext_buf_page_t &p) noexcept + { + ut_ad(&p >= ext_buf_pages_array && + &p < ext_buf_pages_array + extended_pages); + mysql_mutex_assert_owner(&mutex); + UT_LIST_ADD_FIRST(ext_free, &p); + ut_d(p.in_free_list= true); + } + + /** Pushes external buffer pool page to the head of external buffer pool LRU + list. + @param ext_page page to push */ + void push_ext_page_to_LRU(ext_buf_page_t &ext_page) noexcept { + ut_ad(&ext_page >= ext_buf_pages_array && + &ext_page < ext_buf_pages_array + extended_pages); + mysql_mutex_assert_owner(&mutex); + UT_LIST_ADD_FIRST(ext_LRU, &ext_page); + ut_d(ext_page.in_LRU_list= true); + } + + /** Removes external buffer pool page from external buffer pool LRU list. + @param ext_page page to remove */ + void remove_ext_page_from_LRU(ext_buf_page_t &ext_page) noexcept { + ut_ad(&ext_page >= ext_buf_pages_array && + &ext_page < ext_buf_pages_array + extended_pages); + mysql_mutex_assert_owner(&mutex); + UT_LIST_REMOVE(ext_LRU, &ext_page); + ut_d(ext_page.in_LRU_list= false); + } + + /** Calculates external buffer pool page offset in external buffer pool file. 
+ @param ext_page page for which offset is calculated + @return offset in external buffer pool file */ + os_offset_t ext_page_offset(const ext_buf_page_t &ext_page) const noexcept { + ut_ad(&ext_page >= ext_buf_pages_array && + &ext_page < ext_buf_pages_array + extended_pages); + return (&ext_page - ext_buf_pages_array) << srv_page_size_shift; + } /** @return the current size of the buffer pool, in bytes */ size_t curr_pool_size() const noexcept { return size_in_bytes; } @@ -1144,12 +1261,6 @@ class buf_pool_t static int madvise_do_dump() noexcept; #endif - /** Hash cell chain in page_hash_table */ - struct hash_chain - { - /** pointer to the first block */ - buf_page_t *first; - }; private: /** Determine the number of blocks in a buffer pool of a particular size. @param size_in_bytes innodb_buffer_pool_size in bytes @@ -1452,14 +1563,17 @@ class buf_pool_t void append(hash_chain &chain, buf_page_t *bpage) noexcept; /** Remove a block descriptor from a hash bucket chain. */ - inline void remove(hash_chain &chain, buf_page_t *bpage) noexcept; + void remove(hash_chain &chain, buf_page_t *bpage) noexcept; /** Replace a block descriptor with another. */ - inline void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage) + void replace(hash_chain &chain, buf_page_t *old, buf_page_t *bpage) noexcept; - /** Look up a page in a hash bucket chain. */ - inline buf_page_t *get(const page_id_t id, const hash_chain &chain) const - noexcept; + /** Look up a page in a hash bucket chain. 
+ @tparam show_ext_pages false if external buffer pool pages must be ignored, + true otherwise */ + template + inline buf_page_t *get(const page_id_t id, + const hash_chain &chain) const noexcept; }; /** Buffer pool mutex */ @@ -1528,6 +1642,11 @@ class buf_pool_t /** broadcast when a batch completes; protected by flush_list_mutex */ pthread_cond_t done_flush_list; + /** The number of threads waiting for done_flush_list, must be set before + page cleaner wake up and reset after done_flush_list waiting is finished, + protected with flush_list_mutex */ + size_t done_flush_list_waiters_count; + /** @return number of pending LRU flush */ unsigned n_flush() const noexcept { @@ -1658,7 +1777,8 @@ class buf_pool_t UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU; /*!< base node of the unzip_LRU list */ - + /** base node of external LRU list */ + UT_LIST_BASE_NODE_T(ext_buf_page_t) ext_LRU; /* @} */ /** free ROW_FORMAT=COMPRESSED page frames */ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; @@ -1731,6 +1851,7 @@ class buf_pool_t /** The InnoDB buffer pool */ extern buf_pool_t buf_pool; +template inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id, const hash_chain &chain) const noexcept @@ -1742,9 +1863,11 @@ inline buf_page_t *buf_pool_t::page_hash_table::get(const page_id_t id, for (buf_page_t *bpage= chain.first; bpage; bpage= bpage->hash) { ut_ad(bpage->in_page_hash); - ut_ad(bpage->in_file()); - if (bpage->id() == id) + ut_ad(bpage->external() || bpage->in_file()); + if (bpage->id() == id && (show_ext_pages || !bpage->external())) return bpage; + /* There can be sentinel pages, don't break the loop if external page + was found and ignored. */ } return nullptr; } diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index a45fc72665289..6c9d0e55ab4bf 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -47,8 +47,10 @@ The caller must hold buf_pool.mutex. 
@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page @retval true if freed and buf_pool.mutex may have been temporarily released @retval false if the page was not freed */ -bool buf_LRU_free_page(buf_page_t *bpage, bool zip) - MY_ATTRIBUTE((nonnull)); +bool buf_LRU_free_page(buf_page_t *bpage/* TODO: use reference instead of + pointer */, + bool zip, + ext_buf_page_t *ext_buf_page= nullptr); /** Try to free a replaceable block. @param limit maximum number of blocks to scan diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index cf5ab38df5f9f..1881b2b93c5ec 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -29,6 +29,7 @@ Created 11/17/1995 Heikki Tuuri /** Buffer page (uncompressed or compressed) */ class buf_page_t; +struct ext_buf_page_t; /** Buffer block for which an uncompressed page exists */ struct buf_block_t; /** Buffer pool statistics struct */ diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 73bbc79f665c3..f5ace037348c0 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1412,9 +1412,13 @@ struct fil_system_t fil_system.mutex. */ fil_space_t *space_list_last_opened= nullptr; + /** External buffer pool file handler */ + pfs_os_file_t ext_bp_file; + #ifdef __linux__ /** available block devices that reside on non-rotational storage */ std::vector ssd; + public: /** @return whether a file system device is on non-rotational storage */ bool is_ssd(dev_t dev) const noexcept @@ -1440,6 +1444,15 @@ struct fil_system_t mysql_mutex_t mutex; fil_space_t* sys_space; /*!< The innodb_system tablespace */ fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ +public: + + /** Extended buffer pool file path */ + char *ext_bp_path; + + /** Extended buffer pool file size, equals to 0 if extended buffer pool is + not used. 
*/ + size_t ext_bp_size; + /** Map of fil_space_t::id to fil_space_t* */ hash_table_t spaces; @@ -1497,6 +1510,33 @@ struct fil_system_t potential space_id reuse */ bool space_id_reuse_warned; + /** Create external buffer pool file. + @return whether the creation failed */ + bool create_ext_file() noexcept; + + /** External buffer pool os_aio() wrapper. + @param bpage buffer pool page for read/write + @param ext_buf_page external buffer pool page which will be freed on read + completion and replace bpage in buffer pool on write + completion + @param io_request_type IORequest::WRITE_ASYNC, IORequest::READ_SYNC or + IORequest::READ_ASYNC + @param slot memory to be used for encrypted or page_compressed + pages + @param len length to read/write + @param buf buffer + @retval DB_SUCCESS if request was queued successfully + @retval DB_IO_ERROR on I/O error */ + dberr_t ext_bp_io(buf_page_t &bpage, ext_buf_page_t &ext_buf_page, + IORequest::Type io_request_type, buf_tmp_buffer_t *slot, + size_t len, void *buf) noexcept; + + /** Returns if external buffer pool is enabled. */ + bool ext_buf_pool_enabled() const { return ext_bp_size; } + + /** Disable external buffer pool */ + void ext_buf_pool_disable() { ext_bp_size= 0; } + /** Add the file to the end of opened spaces list in fil_system.space_list, so that fil_space_t::try_to_close() should close it as a last resort. @@ -1833,4 +1873,12 @@ ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset) bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name) noexcept; +/** Create temporary files in the given parameter path, and if +UNIV_PFS_IO defined, register the file descriptor with Performance Schema. 
+@param path location for creating temporary merge files, or NULL +@param label label for registration in Performance Schema if path == nullptr +@param prefix temporary file name prefix +@return File descriptor */ +pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, + const char *prefix); #endif /* UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 5e3b208d85f6d..091dec5c2ed78 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -64,6 +64,7 @@ extern bool os_has_said_disk_full; typedef ib_uint64_t os_offset_t; class buf_tmp_buffer_t; +struct ext_buf_page_t; #ifdef _WIN32 @@ -206,13 +207,28 @@ class IORequest PUNCH_RANGE= WRITE_SYNC | 32, }; + /* This ctor is used inside of fil_space_t::io(...) */ constexpr IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot, - fil_node_t *node, Type type) : - bpage(bpage), slot(slot), node(node), type(type) {} + fil_node_t *node, Type type) + : node_ptr{node}, bpage_ptr(bpage), slot(slot), type(type) + { + } + /* This ctor is used by the callers of fil_space_t::io(...) 
*/ constexpr IORequest(Type type= READ_SYNC, buf_page_t *bpage= nullptr, - buf_tmp_buffer_t *slot= nullptr) : - bpage(bpage), slot(slot), type(type) {} + buf_tmp_buffer_t *slot= nullptr) + : bpage_ptr(bpage), slot(slot), type(type) + { + } + + IORequest(buf_page_t *bpage, buf_tmp_buffer_t *slot, + ext_buf_page_t *ext_buf_page, Type type) + : ext_buf_page_ptr(ext_buf_page), + bpage_ptr(reinterpret_cast( + reinterpret_cast(bpage) | 1)), + slot(slot), type(type) + { + } bool is_read() const noexcept { return (type & READ_SYNC) != 0; } bool is_write() const noexcept { return (type & WRITE_SYNC) != 0; } @@ -224,7 +240,7 @@ class IORequest IORequest doublewritten() const noexcept { ut_ad(type == WRITE_ASYNC || type == PUNCH); - return IORequest{bpage, slot, node, Type(type | 4)}; + return IORequest{bpage(), slot, node(), Type(type | 4)}; } void write_complete(int io_error) const noexcept; @@ -237,9 +253,9 @@ class IORequest @return DB_SUCCESS or error code */ dberr_t maybe_punch_hole(os_offset_t off, ulint len) noexcept { - return off && len && node && (type & (PUNCH ^ WRITE_ASYNC)) - ? punch_hole(off, len) - : DB_SUCCESS; + return off && len && (type & (PUNCH ^ WRITE_ASYNC)) && node() + ? 
punch_hole(off, len) + : DB_SUCCESS; } private: @@ -249,18 +265,48 @@ class IORequest @return DB_SUCCESS or error code */ dberr_t punch_hole(os_offset_t off, ulint len) const noexcept; -public: + union + { + /** File descriptor */ + fil_node_t *const node_ptr= nullptr; + /** External buffer pool page if the request is for external buffer pool + file, nullptr otherwise */ + ext_buf_page_t *const ext_buf_page_ptr; + }; + /** Page to be written on write operation */ - buf_page_t *const bpage= nullptr; + buf_page_t *const bpage_ptr= nullptr; + +public: /** Memory to be used for encrypted or page_compressed pages */ buf_tmp_buffer_t *const slot= nullptr; - /** File descriptor */ - fil_node_t *const node= nullptr; + buf_page_t *bpage() const + { + return reinterpret_cast( + reinterpret_cast(bpage_ptr) & ~ptrdiff_t(1)); + }; + + bool ext_buf() const + { + return reinterpret_cast(bpage_ptr) & 1; + } + + fil_node_t *node() const + { + ut_ad(!ext_buf()); + return node_ptr; + } + + ext_buf_page_t *ext_buf_page() const { + ut_ad(ext_buf()); + return ext_buf_page_ptr; + }; /** Request type bit flags */ const Type type; + }; constexpr IORequest IORequestRead(IORequest::READ_SYNC); @@ -999,6 +1045,17 @@ void os_aio_free() noexcept; @param offset additional context */ void os_fake_read(const IORequest &type, os_offset_t offset) noexcept; +/** Request a read or write. +@param type I/O request +@param buf buffer +@param offset file offset +@param n number of bytes +@retval DB_SUCCESS if request was queued successfully +@retval DB_IO_ERROR on I/O error */ +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, + size_t n, pfs_os_file_t handle, + const char *file_name) noexcept; + /** Request a read or write. 
@param type I/O request @param buf buffer diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 6198cb04f9931..881c1f3fd241a 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -3584,10 +3584,10 @@ bool recv_recover_page(fil_space_t* space, buf_page_t* bpage) void IORequest::fake_read_complete(os_offset_t offset) const noexcept { - ut_ad(node); + ut_ad(node()); ut_ad(is_read()); - ut_ad(bpage); - ut_ad(bpage->frame); + ut_ad(bpage_ptr); + ut_ad(bpage_ptr->frame); ut_ad(recv_recovery_is_on()); ut_ad(offset); @@ -3595,30 +3595,30 @@ void IORequest::fake_read_complete(os_offset_t offset) const noexcept mtr.start(); mtr.set_log_mode(MTR_LOG_NO_REDO); - ut_ad(bpage->frame); + ut_ad(bpage_ptr->frame); /* Move the ownership of the x-latch on the page to this OS thread, so that we can acquire a second x-latch on it. This is needed for the operations to the page to pass the debug checks. */ - bpage->lock.claim_ownership(); - bpage->lock.x_lock_recursive(); - bpage->fix_on_recovery(); - mtr.memo_push(reinterpret_cast(bpage), MTR_MEMO_PAGE_X_FIX); + bpage_ptr->lock.claim_ownership(); + bpage_ptr->lock.x_lock_recursive(); + bpage_ptr->fix_on_recovery(); + mtr.memo_push(reinterpret_cast(bpage_ptr), MTR_MEMO_PAGE_X_FIX); page_recv_t &recs= *reinterpret_cast(slot); ut_ad(recs.being_processed == 1); const lsn_t init_lsn{offset}; ut_ad(init_lsn > 1); - if (recv_recover_page(reinterpret_cast(bpage), - mtr, recs, node->space, init_lsn)) + if (recv_recover_page(reinterpret_cast(bpage_ptr), + mtr, recs, node()->space, init_lsn)) { - ut_ad(bpage->oldest_modification() || bpage->is_freed()); - bpage->lock.x_unlock(true); + ut_ad(bpage_ptr->oldest_modification() || bpage_ptr->is_freed()); + bpage_ptr->lock.x_unlock(true); } recs.being_processed= -1; ut_ad(mtr.has_committed()); - node->space->release(); + node()->space->release(); } /** @return whether a page has been freed */ @@ -3982,9 +3982,12 @@ static void 
log_sort_flush_list() noexcept { os_aio_wait_until_no_pending_writes(false); mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (buf_pool.page_cleaner_active()) + if (buf_pool.page_cleaner_active()) { + ++buf_pool.done_flush_list_waiters_count; my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; + } else if (!os_aio_pending_writes()) break; mysql_mutex_unlock(&buf_pool.flush_list_mutex); diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index f3c04027077b3..4c6764edcaa33 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2566,8 +2566,8 @@ os_file_io( " Retrying.", n, type.is_read() ? "read" : "written", offset, - type.node - ? type.node->name + type.node() + ? type.node()->name : "(unknown file)", bytes_returned); } @@ -2719,7 +2719,7 @@ os_file_read_func( ulint n, ulint* o) noexcept { - ut_ad(!type.node || type.node->handle == file); + ut_ad(type.ext_buf() || !type.node() || type.node()->handle == file); ut_ad(n); os_bytes_read_since_printout+= n; @@ -2733,11 +2733,11 @@ os_file_read_func( if (ulint(n_bytes) == n || err != DB_SUCCESS) return err; - os_file_handle_error_no_exit(type.node ? type.node->name : nullptr, "read", - false); + os_file_handle_error_no_exit(type.node() ? type.node()->name : nullptr, + "read", false); sql_print_error("InnoDB: Tried to read %zu bytes at offset %" PRIu64 " of file %s, but was only able to read %zd", - n, offset, type.node ? type.node->name : "(unknown)", + n, offset, type.node() ? type.node()->name : "(unknown)", n_bytes); return err ? err : DB_IO_ERROR; @@ -2930,7 +2930,8 @@ os_file_punch_hole( @return DB_SUCCESS or error code */ dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const noexcept { - ulint trim_len = bpage ? bpage->physical_size() - len : 0; + ut_ad(!ext_buf()); + ulint trim_len = bpage_ptr ? 
bpage_ptr->physical_size() - len : 0; if (trim_len == 0) { return(DB_SUCCESS); @@ -2940,18 +2941,18 @@ dberr_t IORequest::punch_hole(os_offset_t off, ulint len) const noexcept /* Check does file system support punching holes for this tablespace. */ - if (!node->punch_hole) { + if (!node()->punch_hole) { return DB_IO_NO_PUNCH_HOLE; } - dberr_t err = os_file_punch_hole(node->handle, off, trim_len); + dberr_t err = os_file_punch_hole(node()->handle, off, trim_len); switch (err) { case DB_SUCCESS: srv_stats.page_compressed_trim_op.inc(); return err; case DB_IO_NO_PUNCH_HOLE: - node->punch_hole = false; + node()->punch_hole = false; err = DB_SUCCESS; /* fall through */ default: @@ -3060,7 +3061,7 @@ static void write_io_callback(void *c) ib::info () << "IO Error: " << cb->m_err << " during write of " << cb->m_len << " bytes, for file " - << request.node->name << "(" << cb->m_fh << "), returned " + << request.node()->name << "(" << cb->m_fh << "), returned " << cb->m_ret_len; request.write_complete(cb->m_err); @@ -3247,7 +3248,7 @@ void os_fake_read(const IORequest &type, os_offset_t offset) noexcept tpool::aiocb *cb= read_slots->acquire(); cb->m_group= read_slots->get_task_group(); - cb->m_fh= type.node->handle.m_file; + cb->m_fh= type.node()->handle.m_file; cb->m_buffer= nullptr; cb->m_len= 0; cb->m_offset= offset; @@ -3268,16 +3269,15 @@ void os_fake_read(const IORequest &type, os_offset_t offset) noexcept @param n number of bytes @retval DB_SUCCESS if request was queued successfully @retval DB_IO_ERROR on I/O error */ -dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) - noexcept +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, + size_t n, pfs_os_file_t handle, + const char *file_name) noexcept { ut_ad(n > 0); ut_ad(!(n & 511)); /* payload of page_compressed tables */ ut_ad((offset % UNIV_ZIP_SIZE_MIN) == 0); ut_ad((reinterpret_cast(buf) % UNIV_ZIP_SIZE_MIN) == 0); ut_ad(type.is_read() || type.is_write()); - 
ut_ad(type.node); - ut_ad(type.node->is_open()); #ifdef WIN_ASYNC_IO ut_ad((n & 0xFFFFFFFFUL) == n); @@ -3286,7 +3286,7 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) #ifdef UNIV_PFS_IO PSI_file_locker_state state; PSI_file_locker* locker= nullptr; - register_pfs_file_io_begin(&state, locker, type.node->handle, n, + register_pfs_file_io_begin(&state, locker, handle, n, type.is_write() ? PSI_FILE_WRITE : PSI_FILE_READ, __FILE__, __LINE__); @@ -3295,10 +3295,10 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) if (!type.is_async()) { err = type.is_read() - ? os_file_read_func(type, type.node->handle, + ? os_file_read_func(type, handle, buf, offset, n, nullptr) - : os_file_write_func(type, type.node->name, - type.node->handle, + : os_file_write_func(type, file_name, + handle, buf, offset, n); func_exit: #ifdef UNIV_PFS_IO @@ -3329,7 +3329,7 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) cb->m_buffer = buf; cb->m_callback = callback; cb->m_group = slots->get_task_group(); - cb->m_fh = type.node->handle.m_file; + cb->m_fh = handle.m_file; cb->m_len = (int)n; cb->m_offset = offset; cb->m_opcode = opcode; @@ -3337,16 +3337,34 @@ dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, size_t n) if (srv_thread_pool->submit_io(cb)) { slots->release(cb); - os_file_handle_error_no_exit(type.node->name, type.is_read() + os_file_handle_error_no_exit(file_name, type.is_read() ? "aio read" : "aio write", false); err = DB_IO_ERROR; - type.node->space->release(); } goto func_exit; } +/** Request a read or write. 
+@param type I/O request +@param buf buffer +@param offset file offset +@param n number of bytes +@retval DB_SUCCESS if request was queued successfully +@retval DB_IO_ERROR on I/O error */ +dberr_t os_aio(const IORequest &type, void *buf, os_offset_t offset, + size_t n) noexcept +{ + ut_ad(type.node()); + ut_ad(type.node()->is_open()); + dberr_t err= + os_aio(type, buf, offset, n, type.node()->handle, type.node()->name); + if (err == DB_IO_ERROR) + type.node()->space->release(); + return err; +} + void os_aio_print(FILE *file) noexcept { time_t current_time; diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index efb89cb069f3b..0ca58526e35a5 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -61,6 +61,9 @@ Completed by Sunny Bains and Marko Makela /* Whether to disable file system cache */ char srv_disable_sort_file_cache; +static const char *merge_temp_file_label= "/Innodb Merge Temp File"; +static const char *merge_temp_file_prefix= "ib"; + /** Class that caches spatial index row tuples made from a single cluster index page scan, and then insert into corresponding index tree */ class spatial_index_info { @@ -4339,57 +4342,18 @@ void row_merge_drop_temp_indexes() } -/** Create temporary merge files in the given paramater path, and if -UNIV_PFS_IO defined, register the file descriptor with Performance Schema. 
-@param[in] path location for creating temporary merge files, or NULL -@return File descriptor */ -static pfs_os_file_t row_merge_file_create_mode(const char *path, int mode) -{ - if (!path) { - path = mysql_tmpdir; - } -#ifdef UNIV_PFS_IO - /* This temp file open does not go through normal - file APIs, add instrumentation to register with - performance schema */ - struct PSI_file_locker* locker; - PSI_file_locker_state state; - static const char label[] = "/Innodb Merge Temp File"; - char* name = static_cast( - ut_malloc_nokey(strlen(path) + sizeof label)); - strcpy(name, path); - strcat(name, label); - - register_pfs_file_open_begin( - &state, locker, innodb_temp_file_key, - PSI_FILE_CREATE, path ? name : label, __FILE__, __LINE__); - -#endif - DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); - char filename[FN_REFLEN]; - File f = create_temp_file(filename, path, "ib", - O_BINARY | O_SEQUENTIAL, - MYF(MY_WME | MY_TEMPORARY)); - pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f); - -#ifdef UNIV_PFS_IO - register_pfs_file_open_end(locker, fd, - (fd == OS_FILE_CLOSED)?NULL:&fd); - ut_free(name); -#endif - - if (fd == OS_FILE_CLOSED) { - ib::error() << "Cannot create temporary merge file"; - } - return(fd); -} - /** Create a temporary file at the specified path. @param path location for creating temporary merge files, or nullptr @return File descriptor */ pfs_os_file_t row_merge_file_create_low(const char *path) { - return row_merge_file_create_mode(path, O_BINARY | O_SEQUENTIAL); + auto fd= pfs_create_temp_file(path, merge_temp_file_label, + merge_temp_file_prefix); + if (fd == OS_FILE_CLOSED) + { + ib::error() << "Cannot create temporary merge file"; + } + return fd; } /** Create a merge file in the given location. @@ -4404,13 +4368,13 @@ row_merge_file_create( merge_file->offset = 0; merge_file->n_rec = 0; merge_file->fd = - row_merge_file_create_mode(path, -#if !defined _WIN32 && defined O_DIRECT - srv_disable_sort_file_cache - ? 
O_DIRECT | O_BINARY | O_SEQUENTIAL - : -#endif - O_BINARY | O_SEQUENTIAL); + pfs_create_temp_file(path, + merge_temp_file_label, + merge_temp_file_prefix); + if (merge_file->fd == OS_FILE_CLOSED) + { + ib::error() << "Cannot create temporary merge file"; + } return(merge_file->fd); } diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 24b41f7602fdf..8c81e1a558da4 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -1426,10 +1426,18 @@ dberr_t srv_start(bool create_new_db) fil_system.create(srv_file_per_table ? 50000 : 5000); + if (!fil_system.is_initialised()) { + return srv_init_abort(DB_ERROR); + } + if (buf_pool.create()) { return(srv_init_abort(DB_ERROR)); } + if (srv_operation == SRV_OPERATION_NORMAL + && fil_system.ext_bp_size && !fil_system.create_ext_file()) + return(srv_init_abort(DB_ERROR)); + log_sys.create(); recv_sys.create(); lock_sys.create(srv_lock_table_size = 5 * buf_pool.curr_size()); @@ -2088,9 +2096,11 @@ void innodb_shutdown() mysql_mutex_lock(&buf_pool.flush_list_mutex); srv_shutdown_state = SRV_SHUTDOWN_CLEANUP; while (buf_page_cleaner_is_active) { + ++buf_pool.done_flush_list_waiters_count; pthread_cond_signal(&buf_pool.do_flush_list); my_cond_wait(&buf_pool.done_flush_list, &buf_pool.flush_list_mutex.m_mutex); + --buf_pool.done_flush_list_waiters_count; } mysql_mutex_unlock(&buf_pool.flush_list_mutex); break; From 64eb035a6c3c8cbe474f7984bc5d6edbee8d9ee9 Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Sun, 11 Jan 2026 23:17:12 +0300 Subject: [PATCH 2/6] MDEV-31956 SSD based InnoDB buffer pool extension Fix some tests. Make ext_buf_pool test more stable avoiding race conditions for read/write counters. 
--- include/my_sys.h | 1 + .../sys_vars/r/sysvars_innodb,32bit.rdiff | 87 +++++++++++-------- mysys/my_create.c | 2 +- mysys/my_open.c | 2 +- mysys/my_winfile.c | 14 ++- mysys/mysys_priv.h | 4 +- storage/innobase/buf/buf0flu.cc | 7 +- storage/innobase/buf/buf0lru.cc | 11 ++- storage/innobase/fil/fil0fil.cc | 24 +++-- storage/innobase/include/buf0buf.h | 30 +++++-- storage/innobase/include/fil0fil.h | 9 +- storage/innobase/row/row0merge.cc | 4 +- tpool/aio_liburing.cc | 5 +- tpool/tpool.h | 6 +- 14 files changed, 131 insertions(+), 75 deletions(-) diff --git a/include/my_sys.h b/include/my_sys.h index 4148ef0ba0408..b59dcdf4110f1 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -60,6 +60,7 @@ C_MODE_START #define MY_IGNORE_ENOENT 32U /* my_delete() ignores ENOENT (no such file) */ #define MY_ENCRYPT 64U /* Encrypt IO_CACHE temporary files */ #define MY_TEMPORARY 64U /* create_temp_file(): delete file at once */ +#define MY_OPEN_FOR_ASYNC_IO 128U /* my_open() open file for async io */ #define MY_NOSYMLINKS 512U /* my_open(): don't follow symlinks */ #define MY_FULL_IO 512U /* my_read(): loop until I/O is complete */ #define MY_DONT_CHECK_FILESIZE 128U /* Option to init_io_cache() */ diff --git a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff index a2b062a586036..975409646c58e 100644 --- a/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb,32bit.rdiff @@ -1,6 +1,6 @@ --- a/mysql-test/suite/sys_vars/r/sysvars_innodb.result +++ b/mysql-test/suite/sys_vars/r/sysvars_innodb.result -@@ -51,7 +51,7 @@ +@@ -52,7 +52,7 @@ VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT Number of adaptive hash table cells in each partition; 16381 at start defaults to being derived from innodb_buffer_pool_size NUMERIC_MIN_VALUE 16381 @@ -9,7 +9,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -60,7 +60,7 @@ +@@ -61,7 +61,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 8 
VARIABLE_SCOPE GLOBAL @@ -18,7 +18,7 @@ VARIABLE_COMMENT Number of InnoDB Adaptive Hash Index Partitions (default 8) NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 512 -@@ -96,7 +96,7 @@ +@@ -97,7 +97,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -27,7 +27,7 @@ VARIABLE_COMMENT The AUTOINC lock modes supported by InnoDB: 0 => Old style AUTOINC locking (for backward compatibility); 1 => New style AUTOINC locking; 2 => No AUTOINC locking (unsafe for SBR) NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -108,10 +108,10 @@ +@@ -109,10 +109,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -40,7 +40,7 @@ NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -144,7 +144,7 @@ +@@ -145,7 +145,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 25 VARIABLE_SCOPE GLOBAL @@ -49,7 +49,7 @@ VARIABLE_COMMENT Dump only the hottest N% of each buffer pool, defaults to 25 NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 100 -@@ -216,10 +216,10 @@ +@@ -217,10 +217,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 134217728 VARIABLE_SCOPE GLOBAL @@ -62,7 +62,7 @@ NUMERIC_BLOCK_SIZE 1048576 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -228,11 +228,11 @@ +@@ -229,11 +229,11 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -77,7 +77,7 @@ ENUM_VALUE_LIST NULL READ_ONLY NO COMMAND_LINE_ARGUMENT REQUIRED -@@ -240,11 +240,11 @@ +@@ -241,11 +241,11 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -92,7 +92,7 @@ ENUM_VALUE_LIST NULL READ_ONLY YES COMMAND_LINE_ARGUMENT REQUIRED -@@ -252,7 +252,7 @@ +@@ -253,7 +253,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -101,7 +101,7 @@ VARIABLE_COMMENT A number that tells how often buffer pool dump status in percentages should be printed. E.g. 10 means that buffer pool dump status is printed when every 10% of number of buffer pool pages are dumped. 
Default is 0 (only start and end status is printed) NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -324,7 +324,7 @@ +@@ -325,7 +325,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 5 VARIABLE_SCOPE GLOBAL @@ -110,7 +110,7 @@ VARIABLE_COMMENT If the compression failure rate of a table is greater than this number more padding is added to the pages to reduce the failures. A value of zero implies no padding NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 100 -@@ -348,7 +348,7 @@ +@@ -349,7 +349,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 50 VARIABLE_SCOPE GLOBAL @@ -119,7 +119,20 @@ VARIABLE_COMMENT Percentage of empty space on a data page that can be reserved to make the page compressible NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 75 -@@ -588,7 +588,7 @@ +@@ -589,10 +589,10 @@ + SESSION_VALUE NULL + DEFAULT_VALUE 0 + VARIABLE_SCOPE GLOBAL +-VARIABLE_TYPE BIGINT UNSIGNED ++VARIABLE_TYPE INT UNSIGNED + VARIABLE_COMMENT The extended buffer pool file size + NUMERIC_MIN_VALUE 0 +-NUMERIC_MAX_VALUE 18446744073709551615 ++NUMERIC_MAX_VALUE 4294967295 + NUMERIC_BLOCK_SIZE 0 + ENUM_VALUE_LIST NULL + READ_ONLY NO +@@ -613,7 +613,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 600 VARIABLE_SCOPE GLOBAL @@ -128,7 +141,7 @@ VARIABLE_COMMENT Maximum number of seconds that semaphore times out in InnoDB NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 4294967295 -@@ -636,7 +636,7 @@ +@@ -661,7 +661,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL @@ -137,7 +150,7 @@ VARIABLE_COMMENT Number of iterations over which the background flushing is averaged NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 1000 -@@ -660,7 +660,7 @@ +@@ -685,7 +685,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -146,7 +159,7 @@ VARIABLE_COMMENT Controls the durability/speed trade-off for commits. Set to 0 (write and flush redo log to disk only once per second), 1 (flush to disk at each commit), 2 (write to log at commit but flush to disk only once per second) or 3 (flush to disk at prepare and at commit, slower and usually redundant). 
1 and 3 guarantees that after a crash, committed transactions will not be lost and will be consistent with the binlog and other transactional engines. 2 can get inconsistent and lose transactions if there is a power failure or kernel crash but not if mysqld crashes. 0 has no guarantees in case of crash. 0 and 2 can be faster than 1 or 3 NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 3 -@@ -684,7 +684,7 @@ +@@ -709,7 +709,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1 VARIABLE_SCOPE GLOBAL @@ -155,7 +168,7 @@ VARIABLE_COMMENT Set to 0 (don't flush neighbors from buffer pool), 1 (flush contiguous neighbors from buffer pool) or 2 (flush neighbors from buffer pool), when flushing a block NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 2 -@@ -720,7 +720,7 @@ +@@ -745,7 +745,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -164,7 +177,7 @@ VARIABLE_COMMENT Helps to save your data in case the disk image of the database becomes corrupt. Value 5 can return bogus data, and 6 can permanently corrupt data NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 6 -@@ -744,10 +744,10 @@ +@@ -769,10 +769,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 8000000 VARIABLE_SCOPE GLOBAL @@ -177,7 +190,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -780,7 +780,7 @@ +@@ -805,7 +805,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 84 VARIABLE_SCOPE GLOBAL @@ -186,16 +199,16 @@ VARIABLE_COMMENT InnoDB Fulltext search maximum token size in characters NUMERIC_MIN_VALUE 10 NUMERIC_MAX_VALUE 84 -@@ -792,7 +792,7 @@ +@@ -817,7 +817,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 3 VARIABLE_SCOPE GLOBAL -VARIABLE_TYPE BIGINT UNSIGNED +VARIABLE_TYPE INT UNSIGNED VARIABLE_COMMENT InnoDB Fulltext search minimum token size in characters - NUMERIC_MIN_VALUE 0 + NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 16 -@@ -804,7 +804,7 @@ +@@ -829,7 +829,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000 VARIABLE_SCOPE GLOBAL @@ -204,7 +217,7 @@ VARIABLE_COMMENT InnoDB Fulltext search number of words to optimize for each optimize table call NUMERIC_MIN_VALUE 1000 
NUMERIC_MAX_VALUE 10000 -@@ -816,10 +816,10 @@ +@@ -841,10 +841,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 2000000000 VARIABLE_SCOPE GLOBAL @@ -217,7 +230,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -840,7 +840,7 @@ +@@ -865,7 +865,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 2 VARIABLE_SCOPE GLOBAL @@ -226,7 +239,7 @@ VARIABLE_COMMENT InnoDB Fulltext search parallel sort degree, will round up to nearest power of 2 number NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 16 -@@ -852,10 +852,10 @@ +@@ -877,10 +877,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 640000000 VARIABLE_SCOPE GLOBAL @@ -239,7 +252,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -900,7 +900,7 @@ +@@ -925,7 +925,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 200 VARIABLE_SCOPE GLOBAL @@ -248,7 +261,7 @@ VARIABLE_COMMENT Number of IOPs the server can do. Tunes the background IO rate NUMERIC_MIN_VALUE 100 NUMERIC_MAX_VALUE 4294967295 -@@ -912,7 +912,7 @@ +@@ -937,7 +937,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 4294967295 VARIABLE_SCOPE GLOBAL @@ -257,7 +270,7 @@ VARIABLE_COMMENT Limit to which innodb_io_capacity can be inflated NUMERIC_MIN_VALUE 100 NUMERIC_MAX_VALUE 4294967295 -@@ -1044,10 +1044,10 @@ +@@ -1069,10 +1069,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 32 VARIABLE_SCOPE GLOBAL @@ -270,7 +283,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1056,10 +1056,10 @@ +@@ -1081,10 +1081,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 1536 VARIABLE_SCOPE GLOBAL @@ -283,7 +296,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1092,10 +1092,10 @@ +@@ -1117,10 +1117,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -296,7 +309,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY NO -@@ -1104,7 +1104,7 @@ +@@ -1129,7 +1129,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -305,7 +318,7 @@ VARIABLE_COMMENT Maximum delay of user threads in micro-seconds NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 10000000 -@@ -1236,10 +1236,10 @@ +@@ -1261,10 +1261,10 @@ 
SESSION_VALUE NULL DEFAULT_VALUE 0 VARIABLE_SCOPE GLOBAL @@ -318,7 +331,7 @@ NUMERIC_BLOCK_SIZE 0 ENUM_VALUE_LIST NULL READ_ONLY YES -@@ -1260,7 +1260,7 @@ +@@ -1285,7 +1285,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 16384 VARIABLE_SCOPE GLOBAL @@ -327,7 +340,7 @@ VARIABLE_COMMENT Page size to use for all InnoDB tablespaces NUMERIC_MIN_VALUE 4096 NUMERIC_MAX_VALUE 65536 -@@ -1296,7 +1296,7 @@ +@@ -1321,7 +1321,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 127 VARIABLE_SCOPE GLOBAL @@ -336,7 +349,7 @@ VARIABLE_COMMENT Number of UNDO log pages to purge in one batch from the history list NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 5000 -@@ -1308,7 +1308,7 @@ +@@ -1333,7 +1333,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 128 VARIABLE_SCOPE GLOBAL @@ -345,7 +358,7 @@ VARIABLE_COMMENT Unused NUMERIC_MIN_VALUE 1 NUMERIC_MAX_VALUE 128 -@@ -1344,7 +1344,7 @@ +@@ -1369,7 +1369,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 56 VARIABLE_SCOPE GLOBAL @@ -354,7 +367,7 @@ VARIABLE_COMMENT Number of pages that must be accessed sequentially for InnoDB to trigger a readahead NUMERIC_MIN_VALUE 0 NUMERIC_MAX_VALUE 64 -@@ -1428,7 +1428,7 @@ +@@ -1453,7 +1453,7 @@ SESSION_VALUE NULL DEFAULT_VALUE 1048576 VARIABLE_SCOPE GLOBAL @@ -363,7 +376,7 @@ VARIABLE_COMMENT Memory buffer size for index creation NUMERIC_MIN_VALUE 65536 NUMERIC_MAX_VALUE 67108864 -@@ -1596,10 +1596,10 @@ +@@ -1621,10 +1621,10 @@ SESSION_VALUE NULL DEFAULT_VALUE 30 VARIABLE_SCOPE GLOBAL diff --git a/mysys/my_create.c b/mysys/my_create.c index 32cc73a53c476..93da3d0a06399 100644 --- a/mysys/my_create.c +++ b/mysys/my_create.c @@ -41,7 +41,7 @@ File my_create(const char *FileName, mode_t CreateFlags, int access_flags, DBUG_PRINT("my",("Name: '%s' CreateFlags: %u AccessFlags: %d MyFlags: %lu", FileName, CreateFlags, access_flags, MyFlags)); #if defined(_WIN32) - fd= my_win_open(FileName, access_flags | O_CREAT); + fd= my_win_open(FileName, access_flags | O_CREAT, MyFlags); #else fd= open((char *) FileName, access_flags | O_CREAT | O_CLOEXEC, CreateFlags ? 
CreateFlags : my_umask); diff --git a/mysys/my_open.c b/mysys/my_open.c index 182bb14927743..1d229530bf169 100644 --- a/mysys/my_open.c +++ b/mysys/my_open.c @@ -50,7 +50,7 @@ File my_open(const char *FileName, int Flags, myf MyFlags) if (!(MyFlags & (MY_WME | MY_FAE | MY_FFNF))) MyFlags|= my_global_flags; #if defined(_WIN32) - fd= my_win_open(FileName, Flags); + fd= my_win_open(FileName, Flags, MyFlags); #else if (MyFlags & MY_NOSYMLINKS) fd = open_nosymlinks(FileName, Flags | O_CLOEXEC, my_umask); diff --git a/mysys/my_winfile.c b/mysys/my_winfile.c index 7a1e3e60b12ef..f01b970331f62 100644 --- a/mysys/my_winfile.c +++ b/mysys/my_winfile.c @@ -166,13 +166,16 @@ LPSECURITY_ATTRIBUTES my_win_file_secattr() oflag operation flags shflag share flag pmode permission flags + MyFlags flags, used to open files, currently only MY_OPEN_FOR_ASYNC_IO + is used RETURN VALUE File descriptor of opened file if success -1 and sets errno if fails. */ -File my_win_sopen(const char *path, int oflag, int shflag, int pmode) +File my_win_sopen(const char *path, int oflag, int shflag, int pmode, + myf MyFlags) { int fh; /* handle of opened file */ int mask; @@ -285,6 +288,11 @@ File my_win_sopen(const char *path, int oflag, int shflag, int pmode) fileaccess|= DELETE; } + if (MyFlags & MY_OPEN_FOR_ASYNC_IO) + { + fileattrib|= FILE_FLAG_OVERLAPPED; + } + /* Set temporary file (delay-flush-to-disk) attribute if requested.*/ if (oflag & _O_SHORT_LIVED) fileattrib|= FILE_ATTRIBUTE_TEMPORARY; @@ -317,11 +325,11 @@ File my_win_sopen(const char *path, int oflag, int shflag, int pmode) } -File my_win_open(const char *path, int flags) +File my_win_open(const char *path, int flags, myf MyFlags) { DBUG_ENTER("my_win_open"); DBUG_RETURN(my_win_sopen((char *) path, flags | _O_BINARY, _SH_DENYNO, - _S_IREAD | S_IWRITE)); + _S_IREAD | S_IWRITE, MyFlags)); } diff --git a/mysys/mysys_priv.h b/mysys/mysys_priv.h index efeb0c65af3bb..8a3558f5150f0 100644 --- a/mysys/mysys_priv.h +++ b/mysys/mysys_priv.h @@ 
-182,7 +182,7 @@ static int PROTO { NOSYMLINK_FUNCTION_BODY(AT,NOAT) } #ifdef _WIN32 #include /* my_winfile.c exports, should not be used outside mysys */ -extern File my_win_open(const char *path, int oflag); +extern File my_win_open(const char *path, int oflag, myf MyFlags); extern int my_win_close(File fd); extern size_t my_win_read(File fd, uchar *buffer, size_t count); extern size_t my_win_write(File fd, const uchar *buffer, size_t count); @@ -200,7 +200,7 @@ extern int my_win_stat(const char *path, struct _stati64 *buf); extern int my_win_fstat(File fd, struct _stati64 *buf); extern int my_win_fsync(File fd); extern File my_win_dup(File fd); -extern File my_win_sopen(const char *path, int oflag, int shflag, int perm); +extern File my_win_sopen(const char *path, int oflag, int shflag, int perm, myf MyFlags); extern File my_open_osfhandle(HANDLE handle, int oflag); diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 4a4d2a16693bb..8df9ba5e26ff3 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -299,10 +299,7 @@ void buf_page_t::write_complete(space_type type, bool error, if (type != EXT_BUF) oldest_modification_.store(persistent, std::memory_order_release); } - zip.fix.fetch_sub((state >= WRITE_FIX_REINIT) - ? (WRITE_FIX_REINIT - UNFIXED) - : (WRITE_FIX - UNFIXED)); - lock.u_unlock(true); + write_complete_release(state); } inline void buf_pool_t::n_flush_inc() noexcept @@ -1363,7 +1360,9 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, } else { + #if !defined(DBUG_OFF) free_page: + #endif buf_LRU_free_page(bpage, true); ++n->evicted; } diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 200f3a72d0c2d..419cd320bba54 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -1154,10 +1154,12 @@ static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, } /** Release and evict a corrupted page. 
-@param bpage x-latched page that was found corrupted -@param state expected current state of the page */ +@param bpage x-latched page that was found corrupted +@param state expected current state of the page +@param set_corrupt_id true to call bpage->set_corrupt_id() */ ATTRIBUTE_COLD -void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state) noexcept +void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state, + bool set_corrupt_id) noexcept { const page_id_t id{bpage->id()}; buf_pool_t::hash_chain &chain= buf_pool.page_hash.cell_get(id.fold()); @@ -1167,7 +1169,8 @@ void buf_pool_t::corrupted_evict(buf_page_t *bpage, uint32_t state) noexcept hash_lock.lock(); ut_ad(!bpage->oldest_modification()); - bpage->set_corrupt_id(); + if (set_corrupt_id) + bpage->set_corrupt_id(); auto unfix= state - buf_page_t::FREED; auto s= bpage->zip.fix.fetch_sub(unfix) - unfix; bpage->lock.x_unlock(true); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index a56b628e03453..9d9c82a77b08a 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1301,6 +1301,8 @@ void fil_system_t::close() noexcept { if (ext_bp_file != OS_FILE_CLOSED) { + if (srv_thread_pool) + srv_thread_pool->unbind(ext_bp_file.m_file); int res= mysql_file_close( IF_WIN(my_win_handle2File((os_file_t) ext_bp_file), ext_bp_file), MYF(MY_WME)); @@ -2933,7 +2935,7 @@ bool fil_system_t::create_ext_file() noexcept { bool ret; ext_bp_file= pfs_create_temp_file( ext_bp_path ? 
ext_bp_path : fil_path_to_mysql_datadir, - "/Extended buffer pool file", "ext_buf_"); + "/Extended buffer pool file", "ext_buf_", true); if (ext_bp_file == OS_FILE_CLOSED) { sql_print_error("Cannot open/create extended buffer pool file"); @@ -2949,6 +2951,13 @@ bool fil_system_t::create_ext_file() noexcept { ext_bp_size); return false; } + if (srv_thread_pool && srv_thread_pool->bind(ext_bp_file.m_file) != 0) + { + sql_print_error("Cannot set async io for extended buffer pool file"); + /* Report OS error in error log */ + (void) os_file_get_last_error(true, false); + return false; + } return true; } @@ -2982,9 +2991,7 @@ void IORequest::write_complete(int io_error) const noexcept space= fil_space_t::get(buf_page->id().space()); if (!space) { - buf_page->lock.u_unlock(true); - // TODO: should we update the statistics here? - //++buf_pool.stat.n_pages_written_to_ebp; + buf_page->write_complete_release(buf_page->state()); return; } } @@ -3034,7 +3041,7 @@ void IORequest::read_complete(int io_error) const noexcept /* The space will be released at the end of this function */ space= fil_space_t::get(buf_page->id().space()); if (!space) { - buf_page->lock.x_unlock(true); + buf_pool.corrupted_evict(buf_page, buf_page_t::READ_FIX + 1, false); ++buf_pool.stat.n_pages_read_from_ebp; return; } @@ -3452,7 +3459,7 @@ fil_space_t *fil_space_t::prev_in_unflushed_spaces() noexcept #endif pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, - const char *prefix) + const char *prefix, bool async_io) { if (!path) { @@ -3476,8 +3483,9 @@ pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, #endif DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); char filename[FN_REFLEN]; - File f= create_temp_file(filename, path, prefix, O_BINARY | O_SEQUENTIAL, - MYF(MY_WME | MY_TEMPORARY)); + File f= create_temp_file( + filename, path, prefix, O_BINARY | O_SEQUENTIAL, + MYF(MY_WME | MY_TEMPORARY | (async_io ? 
MY_OPEN_FOR_ASYNC_IO : 0))); pfs_os_file_t fd= IF_WIN((os_file_t) my_get_osfhandle(f), f); #ifdef UNIV_PFS_IO diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 30063690db3e4..fec01096ff236 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -772,6 +772,16 @@ class buf_page_t : public buf_page_base_t void write_complete(space_type type, bool error, uint32_t state) noexcept; + /** Set correct state and unlock the page on write completion. + @param state current page's state */ + void write_complete_release(uint32_t state) noexcept + { + zip.fix.fetch_sub((state >= WRITE_FIX_REINIT) + ? (WRITE_FIX_REINIT - UNFIXED) + : (WRITE_FIX - UNFIXED)); + lock.u_unlock(true); + } + /** Write a flushable page to a file or free a freeable block. @param space tablespace @param to_ext_buf wherher to write the page to external buffer pull file @@ -1093,11 +1103,20 @@ struct buf_pool_stat_t{ ulint n_page_gets_nonatomic; }; ulint n_pages_read; /*!< number read operations */ + ulint n_pages_written;/*!< number write operations */ + /* Make external buffer pool counters to be atomic for debug build to + avoid race conditions during MTR test case execution */ +#if defined(UNIV_DEBUG) || !defined(DBUG_OFF) + /** Number of pages, read from external buffer pool file */ + Atomic_counter n_pages_read_from_ebp; + /** Number of pages, written to external buffer pool file */ + Atomic_counter n_pages_written_to_ebp; +#else /** Number of pages, read from external buffer pool file */ ulint n_pages_read_from_ebp; - ulint n_pages_written;/*!< number write operations */ /** Number of pages, written to external buffer pool file */ ulint n_pages_written_to_ebp; +#endif ulint n_pages_created;/*!< number of pages created in the pool with no read */ ulint n_ra_pages_read_rnd;/*!< number of pages read in @@ -1316,10 +1335,11 @@ class buf_pool_t ATTRIBUTE_COLD bool withdraw(buf_page_t &bpage) noexcept; /** Release and evict a 
corrupted page. - @param bpage x-latched page that was found corrupted - @param state expected current state of the page */ - ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state) - noexcept; + @param bpage x-latched page that was found corrupted + @param state expected current state of the page + @param set_corrupt_id true to call bpage->set_corrupt_id() */ + ATTRIBUTE_COLD void corrupted_evict(buf_page_t *bpage, uint32_t state, + bool set_corrupt_id= true) noexcept; /** Release a memory block to the buffer pool. */ ATTRIBUTE_COLD void free_block(buf_block_t *block) noexcept; diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index f5ace037348c0..322bb8a3926c4 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1875,10 +1875,11 @@ bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name) /** Create temporary files in the given paramater path, and if UNIV_PFS_IO defined, register the file descriptor with Performance Schema. 
-@param path location for creating temporary merge files, or NULL -@param label label for registration in Performance Schema if path == nullptr -@param prefix temporary file name prefix +@param path location for creating temporary merge files, or NULL +@param label label for registration in Performance Schema if path == nullptr +@param prefix temporary file name prefix +@param async_io true if the file is going to be used with asynchronous IO @return File descriptor */ pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, - const char *prefix); + const char *prefix, bool async_io); #endif /* UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 0ca58526e35a5..2c4f7b7c7dc45 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -4348,7 +4348,7 @@ void row_merge_drop_temp_indexes() pfs_os_file_t row_merge_file_create_low(const char *path) { auto fd= pfs_create_temp_file(path, merge_temp_file_label, - merge_temp_file_prefix); + merge_temp_file_prefix, false); if (fd == OS_FILE_CLOSED) { ib::error() << "Cannot create temporary merge file"; @@ -4370,7 +4370,7 @@ row_merge_file_create( merge_file->fd = pfs_create_temp_file(path, merge_temp_file_label, - merge_temp_file_prefix); + merge_temp_file_prefix, false); if (merge_file->fd == OS_FILE_CLOSED) { ib::error() << "Cannot create temporary merge file"; diff --git a/tpool/aio_liburing.cc b/tpool/aio_liburing.cc index c3176adcf8cf4..1fba7874006c4 100644 --- a/tpool/aio_liburing.cc +++ b/tpool/aio_liburing.cc @@ -138,8 +138,9 @@ class aio_uring final : public aio auto it= std::lower_bound(files_.begin(), files_.end(), fd); assert(it == files_.end() || *it != fd); files_.insert(it, fd); - return io_uring_register_files_update(&uring_, 0, files_.data(), - files_.size()); + int err= io_uring_register_files_update(&uring_, 0, files_.data(), + files_.size()); + return err < 0 ? 
err : 0; } int unbind(const native_file_handle &fd) final diff --git a/tpool/tpool.h b/tpool/tpool.h index f2cf96b268922..3905f8bb7c7e8 100644 --- a/tpool/tpool.h +++ b/tpool/tpool.h @@ -197,9 +197,11 @@ class aio On completion, cb->m_callback is executed. */ virtual int submit_io(aiocb *cb)= 0; - /** "Bind" file to AIO handler (used on Windows only) */ + /** "Bind" file to AIO handler. Used at least with Windows and liburing. + @param fd file handle + @return 0 on success and error code on error */ virtual int bind(native_file_handle &fd)= 0; - /** "Unind" file to AIO handler (used on Windows only) */ + /** "Unbind" file to AIO handler. Used at least with Windows and liburing. */ virtual int unbind(const native_file_handle &fd)= 0; virtual const char *get_implementation() const=0; virtual ~aio(){}; From 3ac1611f4cdfe59949132069ed660b6e6113e580 Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Fri, 16 Jan 2026 11:49:58 +0300 Subject: [PATCH 3/6] MDEV-31956 SSD based InnoDB buffer pool extension Fix Windows and liburing issues. 
--- include/my_sys.h | 2 +- mysys/my_copy.c | 2 +- storage/innobase/fil/fil0fil.cc | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/my_sys.h b/include/my_sys.h index b59dcdf4110f1..da8df1c39f7b5 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -60,7 +60,7 @@ C_MODE_START #define MY_IGNORE_ENOENT 32U /* my_delete() ignores ENOENT (no such file) */ #define MY_ENCRYPT 64U /* Encrypt IO_CACHE temporary files */ #define MY_TEMPORARY 64U /* create_temp_file(): delete file at once */ -#define MY_OPEN_FOR_ASYNC_IO 128U /* my_open() open file for async io */ +#define MY_OPEN_FOR_ASYNC_IO 1024U /* my_open() open file for async io */ #define MY_NOSYMLINKS 512U /* my_open(): don't follow symlinks */ #define MY_FULL_IO 512U /* my_read(): loop until I/O is complete */ #define MY_DONT_CHECK_FILESIZE 128U /* Option to init_io_cache() */ diff --git a/mysys/my_copy.c b/mysys/my_copy.c index 3b07dd5fd53ea..392adecc66282 100644 --- a/mysys/my_copy.c +++ b/mysys/my_copy.c @@ -86,7 +86,7 @@ int my_copy(const char *from, const char *to, myf MyFlags) file_created= 1; while ((Count=my_read(from_file, buff, sizeof(buff), MyFlags)) != 0) { - if (Count == (uint) -1 || + if (Count == (size_t) -1 || my_write(to_file,buff,Count,MYF(MyFlags | MY_NABP))) goto err; } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 9d9c82a77b08a..27c002d505f68 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1301,8 +1301,10 @@ void fil_system_t::close() noexcept { if (ext_bp_file != OS_FILE_CLOSED) { +#if defined(_WIN32) if (srv_thread_pool) srv_thread_pool->unbind(ext_bp_file.m_file); +#endif int res= mysql_file_close( IF_WIN(my_win_handle2File((os_file_t) ext_bp_file), ext_bp_file), MYF(MY_WME)); @@ -2951,6 +2953,7 @@ bool fil_system_t::create_ext_file() noexcept { ext_bp_size); return false; } +#if defined(_WIN32) if (srv_thread_pool && srv_thread_pool->bind(ext_bp_file.m_file) != 0) { 
sql_print_error("Cannot set async io for extended buffer pool file"); @@ -2958,6 +2961,7 @@ bool fil_system_t::create_ext_file() noexcept { (void) os_file_get_last_error(true, false); return false; } +#endif return true; } From c78f5ac816e33c44dfdc5c672d0af8e76d7ed387 Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Sun, 18 Jan 2026 19:29:11 +0300 Subject: [PATCH 4/6] MDEV-31956 SSD based InnoDB buffer pool extension Use a persistent named file for the external buffer pool instead of a temporary one. --- include/my_sys.h | 1 - mysql-test/suite/innodb/r/ext_buf_pool.result | 1 + mysql-test/suite/innodb/t/ext_buf_pool.test | 2 + mysys/my_create.c | 2 +- mysys/my_open.c | 2 +- mysys/my_winfile.c | 14 +-- mysys/mysys_priv.h | 4 +- storage/innobase/fil/fil0fil.cc | 92 +++++-------------- storage/innobase/include/fil0fil.h | 10 -- storage/innobase/include/os0file.h | 4 +- storage/innobase/os/os0file.cc | 13 +++ storage/innobase/row/row0merge.cc | 70 ++++++++++---- 12 files changed, 102 insertions(+), 113 deletions(-) diff --git a/include/my_sys.h b/include/my_sys.h index da8df1c39f7b5..4148ef0ba0408 100644 --- a/include/my_sys.h +++ b/include/my_sys.h @@ -60,7 +60,6 @@ C_MODE_START #define MY_IGNORE_ENOENT 32U /* my_delete() ignores ENOENT (no such file) */ #define MY_ENCRYPT 64U /* Encrypt IO_CACHE temporary files */ #define MY_TEMPORARY 64U /* create_temp_file(): delete file at once */ -#define MY_OPEN_FOR_ASYNC_IO 1024U /* my_open() open file for async io */ #define MY_NOSYMLINKS 512U /* my_open(): don't follow symlinks */ #define MY_FULL_IO 512U /* my_read(): loop until I/O is complete */ #define MY_DONT_CHECK_FILESIZE 128U /* Option to init_io_cache() */ diff --git a/mysql-test/suite/innodb/r/ext_buf_pool.result b/mysql-test/suite/innodb/r/ext_buf_pool.result index 1ecbab38815bc..a62ae65874ed8 100644 --- a/mysql-test/suite/innodb/r/ext_buf_pool.result +++ b/mysql-test/suite/innodb/r/ext_buf_pool.result @@ -1,6 +1,7 @@ connect prevent_purge,localhost,root; START TRANSACTION
WITH CONSISTENT SNAPSHOT; connection default; +ext_buffer_pool SET GLOBAL innodb_limit_optimistic_insert_debug = 3; SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_count_io_only_for_t'; SET GLOBAL DEBUG_DBUG='+d,ib_ext_bp_disable_LRU_eviction_for_t'; diff --git a/mysql-test/suite/innodb/t/ext_buf_pool.test b/mysql-test/suite/innodb/t/ext_buf_pool.test index e172686cf179a..b7d422f26d9e0 100644 --- a/mysql-test/suite/innodb/t/ext_buf_pool.test +++ b/mysql-test/suite/innodb/t/ext_buf_pool.test @@ -24,6 +24,8 @@ START TRANSACTION WITH CONSISTENT SNAPSHOT; --connection default --let $DATADIR = `select @@datadir` +# Set ext buffer pool file size and check it was created, check its size +--list_files $DATADIR ext_buffer_pool --disable_query_log --error 0,ER_UNKNOWN_SYSTEM_VARIABLE diff --git a/mysys/my_create.c b/mysys/my_create.c index 93da3d0a06399..32cc73a53c476 100644 --- a/mysys/my_create.c +++ b/mysys/my_create.c @@ -41,7 +41,7 @@ File my_create(const char *FileName, mode_t CreateFlags, int access_flags, DBUG_PRINT("my",("Name: '%s' CreateFlags: %u AccessFlags: %d MyFlags: %lu", FileName, CreateFlags, access_flags, MyFlags)); #if defined(_WIN32) - fd= my_win_open(FileName, access_flags | O_CREAT, MyFlags); + fd= my_win_open(FileName, access_flags | O_CREAT); #else fd= open((char *) FileName, access_flags | O_CREAT | O_CLOEXEC, CreateFlags ?
CreateFlags : my_umask); diff --git a/mysys/my_open.c b/mysys/my_open.c index 1d229530bf169..182bb14927743 100644 --- a/mysys/my_open.c +++ b/mysys/my_open.c @@ -50,7 +50,7 @@ File my_open(const char *FileName, int Flags, myf MyFlags) if (!(MyFlags & (MY_WME | MY_FAE | MY_FFNF))) MyFlags|= my_global_flags; #if defined(_WIN32) - fd= my_win_open(FileName, Flags, MyFlags); + fd= my_win_open(FileName, Flags); #else if (MyFlags & MY_NOSYMLINKS) fd = open_nosymlinks(FileName, Flags | O_CLOEXEC, my_umask); diff --git a/mysys/my_winfile.c b/mysys/my_winfile.c index f01b970331f62..7a1e3e60b12ef 100644 --- a/mysys/my_winfile.c +++ b/mysys/my_winfile.c @@ -166,16 +166,13 @@ LPSECURITY_ATTRIBUTES my_win_file_secattr() oflag operation flags shflag share flag pmode permission flags - MyFlags flags, used to open files, currently only MY_OPEN_FOR_ASYNC_IO - is used RETURN VALUE File descriptor of opened file if success -1 and sets errno if fails. */ -File my_win_sopen(const char *path, int oflag, int shflag, int pmode, - myf MyFlags) +File my_win_sopen(const char *path, int oflag, int shflag, int pmode) { int fh; /* handle of opened file */ int mask; @@ -288,11 +285,6 @@ File my_win_sopen(const char *path, int oflag, int shflag, int pmode, fileaccess|= DELETE; } - if (MyFlags & MY_OPEN_FOR_ASYNC_IO) - { - fileattrib|= FILE_FLAG_OVERLAPPED; - } - /* Set temporary file (delay-flush-to-disk) attribute if requested.*/ if (oflag & _O_SHORT_LIVED) fileattrib|= FILE_ATTRIBUTE_TEMPORARY; @@ -325,11 +317,11 @@ File my_win_sopen(const char *path, int oflag, int shflag, int pmode, } -File my_win_open(const char *path, int flags, myf MyFlags) +File my_win_open(const char *path, int flags) { DBUG_ENTER("my_win_open"); DBUG_RETURN(my_win_sopen((char *) path, flags | _O_BINARY, _SH_DENYNO, - _S_IREAD | S_IWRITE, MyFlags)); + _S_IREAD | S_IWRITE)); } diff --git a/mysys/mysys_priv.h b/mysys/mysys_priv.h index 8a3558f5150f0..efeb0c65af3bb 100644 --- a/mysys/mysys_priv.h +++ b/mysys/mysys_priv.h @@ 
-182,7 +182,7 @@ static int PROTO { NOSYMLINK_FUNCTION_BODY(AT,NOAT) } #ifdef _WIN32 #include /* my_winfile.c exports, should not be used outside mysys */ -extern File my_win_open(const char *path, int oflag, myf MyFlags); +extern File my_win_open(const char *path, int oflag); extern int my_win_close(File fd); extern size_t my_win_read(File fd, uchar *buffer, size_t count); extern size_t my_win_write(File fd, const uchar *buffer, size_t count); @@ -200,7 +200,7 @@ extern int my_win_stat(const char *path, struct _stati64 *buf); extern int my_win_fstat(File fd, struct _stati64 *buf); extern int my_win_fsync(File fd); extern File my_win_dup(File fd); -extern File my_win_sopen(const char *path, int oflag, int shflag, int perm, myf MyFlags); +extern File my_win_sopen(const char *path, int oflag, int shflag, int perm); extern File my_open_osfhandle(HANDLE handle, int oflag); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 27c002d505f68..31d3a050c8bfd 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1301,15 +1301,14 @@ void fil_system_t::close() noexcept { if (ext_bp_file != OS_FILE_CLOSED) { -#if defined(_WIN32) - if (srv_thread_pool) - srv_thread_pool->unbind(ext_bp_file.m_file); -#endif - int res= mysql_file_close( - IF_WIN(my_win_handle2File((os_file_t) ext_bp_file), ext_bp_file), - MYF(MY_WME)); - ut_a(res != -1); + int ret= os_file_close(ext_bp_file); + ut_a(ret); ext_bp_file= OS_FILE_CLOSED; + char path[FN_REFLEN]; + snprintf(path, sizeof(path), "%s" FN_ROOTDIR "%s", + ext_bp_path ? 
ext_bp_path : fil_path_to_mysql_datadir, + ext_bp_file_name); + os_file_delete(innodb_data_file_key, path); } spaces.free(); mysql_mutex_destroy(&mutex); @@ -2933,35 +2932,32 @@ fil_io_t fil_space_t::io(const IORequest &type, os_offset_t offset, size_t len, return {err, node}; } -bool fil_system_t::create_ext_file() noexcept { +bool fil_system_t::create_ext_file() noexcept +{ + char path[FN_REFLEN]; + snprintf(path, sizeof(path), "%s" FN_ROOTDIR "%s", + ext_bp_path ? ext_bp_path : fil_path_to_mysql_datadir, + ext_bp_file_name); bool ret; - ext_bp_file= pfs_create_temp_file( - ext_bp_path ? ext_bp_path : fil_path_to_mysql_datadir, - "/Extended buffer pool file", "ext_buf_", true); - if (ext_bp_file == OS_FILE_CLOSED) + ext_bp_file= + os_file_create(innodb_data_file_key, path, OS_FILE_OPEN_OR_CREATE, + OS_DATA_FILE, false, &ret); + if (!ret) { - sql_print_error("Cannot open/create extended buffer pool file"); + sql_print_error("Cannot open/create extended buffer pool file '%s'", path); /* Report OS error in error log */ - (void)os_file_get_last_error(true, false); + (void) os_file_get_last_error(true, false); return false; } - ret= os_file_set_size(ext_bp_file_name, ext_bp_file.m_file, ext_bp_size); + ut_ad(ext_bp_file != OS_FILE_CLOSED); + ret= os_file_set_size(path, ext_bp_file.m_file, ext_bp_size); if (!ret) { os_file_close_func(ext_bp_file.m_file); - sql_print_error("Cannot set extended buffer pool file size to %zum", - ext_bp_size); + sql_print_error("Cannot set extended buffer pool file '%s' size to %zum", + path, ext_bp_size); return false; } -#if defined(_WIN32) - if (srv_thread_pool && srv_thread_pool->bind(ext_bp_file.m_file) != 0) - { - sql_print_error("Cannot set async io for extended buffer pool file"); - /* Report OS error in error log */ - (void) os_file_get_last_error(true, false); - return false; - } -#endif return true; } @@ -3461,45 +3457,3 @@ fil_space_t *fil_space_t::prev_in_unflushed_spaces() noexcept } #endif - -pfs_os_file_t 
pfs_create_temp_file(const char *path, const char *label, - const char *prefix, bool async_io) -{ - if (!path) - { - path= mysql_tmpdir; - } -#ifdef UNIV_PFS_IO - /* This temp file open does not go through normal - file APIs, add instrumentation to register with - performance schema */ - struct PSI_file_locker *locker; - PSI_file_locker_state state; - char *name= - static_cast<char *>(ut_malloc_nokey(strlen(path) + strlen(label) + 1)); - strcpy(name, path); - strcat(name, label); - - register_pfs_file_open_begin(&state, locker, innodb_temp_file_key, - PSI_FILE_CREATE, path ? name : label, __FILE__, - __LINE__); - -#endif - DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); - char filename[FN_REFLEN]; - File f= create_temp_file( - filename, path, prefix, O_BINARY | O_SEQUENTIAL, - MYF(MY_WME | MY_TEMPORARY | (async_io ? MY_OPEN_FOR_ASYNC_IO : 0))); - pfs_os_file_t fd= IF_WIN((os_file_t) my_get_osfhandle(f), f); - -#ifdef UNIV_PFS_IO - register_pfs_file_open_end(locker, fd, (fd == OS_FILE_CLOSED) ? NULL : &fd); - ut_free(name); -#endif - - if (fd == OS_FILE_CLOSED) - { - ib::error() << "Cannot create temporary merge file"; - } - return (fd); -} diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 322bb8a3926c4..d3457caac9c61 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1418,7 +1418,6 @@ struct fil_system_t #ifdef __linux__ /** available block devices that reside on non-rotational storage */ std::vector<dev_t> ssd; - public: /** @return whether a file system device is on non-rotational storage */ bool is_ssd(dev_t dev) const noexcept @@ -1873,13 +1872,4 @@ ulint fil_space_get_block_size(const fil_space_t* space, unsigned offset) bool fil_crypt_check(fil_space_crypt_t *crypt_data, const char *f_name) noexcept; -/** Create temporary files in the given paramater path, and if -UNIV_PFS_IO defined, register the file descriptor with Performance Schema.
-@param path location for creating temporary merge files, or NULL -@param label label for registration in Performance Schema if path == nullptr -@param prefix temporary file name prefix -@param async_io true if the file is going to be used with asynchronous IO -@return File descriptor */ -pfs_os_file_t pfs_create_temp_file(const char *path, const char *label, - const char *prefix, bool async_io); #endif /* UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 091dec5c2ed78..33dc28607d236 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -130,9 +130,11 @@ enum os_file_create_t { OS_FILE_OPEN_RETRY, /** open a raw block device */ OS_FILE_OPEN_RAW, + /** open file or create if it doesn't exist */ + OS_FILE_OPEN_OR_CREATE, /** do not display diagnostic messages */ - OS_FILE_ON_ERROR_SILENT= 4, + OS_FILE_ON_ERROR_SILENT= 8, OS_FILE_CREATE_SILENT= OS_FILE_CREATE | OS_FILE_ON_ERROR_SILENT, OS_FILE_OPEN_SILENT= OS_FILE_OPEN | OS_FILE_ON_ERROR_SILENT, diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 4c6764edcaa33..6c9b156d26cac 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -966,6 +966,8 @@ os_file_create_simple_func( if (read_only) { } else if (create_mode == OS_FILE_CREATE) { create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC; + } else if (create_mode == OS_FILE_OPEN_OR_CREATE) { + create_flag = O_RDWR | O_CREAT | O_CLOEXEC; } else { ut_ad(create_mode == OS_FILE_OPEN); if (access_type != OS_FILE_READ_ONLY) { @@ -1117,6 +1119,8 @@ os_file_create_func( } else if (create_mode == OS_FILE_CREATE || create_mode == OS_FILE_CREATE_SILENT) { create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC; + } else if (create_mode == OS_FILE_OPEN_OR_CREATE) { + create_flag= O_RDWR | O_CREAT | O_CLOEXEC; } else { ut_ad(create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_SILENT @@ -1283,6 +1287,8 @@ 
os_file_create_simple_no_error_handling_func( if (read_only) { } else if (create_mode == OS_FILE_CREATE) { create_flag = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC; + } else if (create_mode == OS_FILE_OPEN_OR_CREATE) { + create_flag = O_RDWR | O_CREAT | O_CLOEXEC; } else { ut_ad(create_mode == OS_FILE_OPEN); if (access_type != OS_FILE_READ_ONLY) { @@ -1904,6 +1910,8 @@ os_file_create_simple_func( if (read_only || create_mode == OS_FILE_OPEN) { create_flag = OPEN_EXISTING; + } else if (create_mode == OS_FILE_OPEN_OR_CREATE) { + create_flag = OPEN_ALWAYS; } else { ut_ad(create_mode == OS_FILE_CREATE); create_flag = CREATE_NEW; @@ -2029,6 +2037,9 @@ os_file_create_func( case OS_FILE_CREATE: create_flag = CREATE_NEW; break; + case OS_FILE_OPEN_OR_CREATE: + create_flag= OPEN_ALWAYS; + break; default: ut_ad(create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_SILENT @@ -2133,6 +2144,8 @@ os_file_create_simple_no_error_handling_func( } else { if (create_mode == OS_FILE_CREATE) { create_flag = CREATE_NEW; + } else if (create_mode == OS_FILE_OPEN_OR_CREATE) { + create_flag = OPEN_ALWAYS; } else { ut_ad(create_mode == OS_FILE_OPEN); } diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 2c4f7b7c7dc45..efb89cb069f3b 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -61,9 +61,6 @@ Completed by Sunny Bains and Marko Makela /* Whether to disable file system cache */ char srv_disable_sort_file_cache; -static const char *merge_temp_file_label= "/Innodb Merge Temp File"; -static const char *merge_temp_file_prefix= "ib"; - /** Class that caches spatial index row tuples made from a single cluster index page scan, and then insert into corresponding index tree */ class spatial_index_info { @@ -4342,18 +4339,57 @@ void row_merge_drop_temp_indexes() } +/** Create temporary merge files in the given parameter path, and if +UNIV_PFS_IO defined, register the file descriptor with Performance Schema. 
+@param[in] path location for creating temporary merge files, or NULL +@return File descriptor */ +static pfs_os_file_t row_merge_file_create_mode(const char *path, int mode) +{ + if (!path) { + path = mysql_tmpdir; + } +#ifdef UNIV_PFS_IO + /* This temp file open does not go through normal + file APIs, add instrumentation to register with + performance schema */ + struct PSI_file_locker* locker; + PSI_file_locker_state state; + static const char label[] = "/Innodb Merge Temp File"; + char* name = static_cast<char*>( + ut_malloc_nokey(strlen(path) + sizeof label)); + strcpy(name, path); + strcat(name, label); + + register_pfs_file_open_begin( + &state, locker, innodb_temp_file_key, + PSI_FILE_CREATE, path ? name : label, __FILE__, __LINE__); + +#endif + DBUG_ASSERT(strlen(path) + 2 <= FN_REFLEN); + char filename[FN_REFLEN]; + File f = create_temp_file(filename, path, "ib", + O_BINARY | O_SEQUENTIAL, + MYF(MY_WME | MY_TEMPORARY)); + pfs_os_file_t fd = IF_WIN((os_file_t)my_get_osfhandle(f), f); + +#ifdef UNIV_PFS_IO + register_pfs_file_open_end(locker, fd, + (fd == OS_FILE_CLOSED)?NULL:&fd); + ut_free(name); +#endif + + if (fd == OS_FILE_CLOSED) { + ib::error() << "Cannot create temporary merge file"; + } + return(fd); +} + /** Create a temporary file at the specified path. @param path location for creating temporary merge files, or nullptr @return File descriptor */ pfs_os_file_t row_merge_file_create_low(const char *path) { - auto fd= pfs_create_temp_file(path, merge_temp_file_label, - merge_temp_file_prefix, false); - if (fd == OS_FILE_CLOSED) - { - ib::error() << "Cannot create temporary merge file"; - } - return fd; + return row_merge_file_create_mode(path, O_BINARY | O_SEQUENTIAL); } /** Create a merge file in the given location. 
@@ -4368,13 +4404,13 @@ row_merge_file_create( merge_file->offset = 0; merge_file->n_rec = 0; merge_file->fd = - pfs_create_temp_file(path, - merge_temp_file_label, - merge_temp_file_prefix, false); - if (merge_file->fd == OS_FILE_CLOSED) - { - ib::error() << "Cannot create temporary merge file"; - } + row_merge_file_create_mode(path, +#if !defined _WIN32 && defined O_DIRECT + srv_disable_sort_file_cache + ? O_DIRECT | O_BINARY | O_SEQUENTIAL + : +#endif + O_BINARY | O_SEQUENTIAL); return(merge_file->fd); } From 3eedbcf0661759dd116d094bda7db80078e9dfda Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Tue, 20 Jan 2026 11:52:32 +0300 Subject: [PATCH 5/6] MDEV-31956 SSD based InnoDB buffer pool extension Squash it. Fix for the following RQG test failures: 2. Scenario: The server is under load (9 concurrent sessions). At some point it crashes with mariadbd: 11.8-MDEV-31956-ext_buf_pool/storage/innobase/buf/buf0flu.cc:294: void buf_page_t::write_complete(buf_page_t::space_type, bool, uint32_t): Assertion `persistent == (om > 2)' failed. 4. Scenario: The server was under load for some time (one connection). Intentional SIGKILL of the DB server, followed by a restart and running certain checks. None of that showed any error, but the shutdown hung. Fragment of rqg.log: # 2026-01-16T13:15:57 [1467965] INFO: DBServer_e::MySQL::MySQLd::stopServer: server[1]: Stopping server on port 25140 ... # 2026-01-16T13:28:22 [1467965] ERROR: DBServer_e::MySQL::MySQLd::stopServer: server[1]: Did not shut down properly. Terminate it == RQG loses "patience" and finally sends SIGABRT to the process of the DB server. The server error log shows 2026-01-16 13:15:58 0 [Note] /data/Server_bin/11.8-MDEV-31956-ext_buf_pool_debug_Og/bin/mariadbd (initiated by: root[root] @ localhost [127.0.0.1]): Normal shutdown ... 2026-01-16 13:15:58 0 [Note] InnoDB: FTS optimize thread exiting. 2026-01-16 13:16:01 0 [Note] InnoDB: Starting shutdown... .... 
2026-01-16 13:16:01 0 [Note] InnoDB: Buffer pool(s) dump completed at 260116 13:16:01 2026-01-16 13:18:37 0 [Note] InnoDB: Waiting for page cleaner thread to exit .... 2026-01-16 13:26:24 0 [Note] InnoDB: Waiting for page cleaner thread to exit --- storage/innobase/buf/buf0flu.cc | 15 ++++++++++----- storage/innobase/fil/fil0fil.cc | 2 ++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 8df9ba5e26ff3..c2638975eeb1d 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1381,11 +1381,6 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, ut_ad(!bpage->is_io_fixed()); switch (bpage->oldest_modification()) { - case 2: - /* LRU flushing will always evict pages of the temporary tablespace, - in buf_page_write_complete(). */ - ++n->evicted; - break; case 1: mysql_mutex_lock(&buf_pool.flush_list_mutex); if (ut_d(lsn_t lsn=) bpage->oldest_modification()) @@ -1401,6 +1396,16 @@ static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n, bpage->lock.u_unlock(true); goto evict; } + break; + case 2: + /* LRU flushing will always evict pages of the temporary tablespace, + in buf_page_write_complete(). */ + ++n->evicted; + /* fall through */ + default: + /* bpage->oldest_modification() could be changed from 0 to not 0 while + bpage was unlocked, in this case we just flush the page to its space */ + flush_to_ebp= false; } /* Block is ready for flush. Dispatch an IO request. 
*/ const page_id_t page_id(bpage->id()); diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 31d3a050c8bfd..0fd7dc3541b46 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -2992,6 +2992,8 @@ void IORequest::write_complete(int io_error) const noexcept if (!space) { buf_page->write_complete_release(buf_page->state()); + if (slot) + slot->release(); return; } } From 4918d15299450579a74d33d92df6428ea1be84a6 Mon Sep 17 00:00:00 2001 From: Vlad Lesin Date: Tue, 20 Jan 2026 17:27:41 +0300 Subject: [PATCH 6/6] MDEV-31956 SSD based InnoDB buffer pool extension Evict page on write completion if its space was removed. Lock external buffer pool file on Linux. --- storage/innobase/fil/fil0fil.cc | 35 ++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 0fd7dc3541b46..3cc31c62fb3d5 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -2950,6 +2950,13 @@ bool fil_system_t::create_ext_file() noexcept return false; } ut_ad(ext_bp_file != OS_FILE_CLOSED); +#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */ + if (!my_disable_locking && os_file_lock(ext_bp_file.m_file, path)) + { + os_file_close_func(ext_bp_file.m_file); + return false; + } +#endif ret= os_file_set_size(path, ext_bp_file.m_file, ext_bp_size); if (!ret) { @@ -2988,12 +2995,31 @@ void IORequest::write_complete(int io_error) const noexcept fil_space_t *space; if (ext_buf()) { - space= fil_space_t::get(buf_page->id().space()); + ut_d(fil_space_t *debug_space=) space= + fil_space_t::get(buf_page->id().space()); + DBUG_EXECUTE_IF("ib_ext_bp_remove_space_on_write_complete", + space= nullptr;); if (!space) { - buf_page->write_complete_release(buf_page->state()); if (slot) slot->release(); + ut_d(auto debug_page_no = buf_page->id().page_no()); + /* We must hold buf_pool.mutex while releasing the 
block, so that + no other thread can access it before we have freed it. */ + mysql_mutex_lock(&buf_pool.mutex); + buf_page->write_complete_release(buf_page->state()); + buf_LRU_free_page(buf_page, true, ext_buf_page()); + mysql_mutex_unlock(&buf_pool.mutex); + DBUG_EXECUTE_IF( + "ib_ext_bp_remove_space_on_write_complete", if (debug_space) { + sql_print_information( + "The page number " UINT32PF + " was freed after write completion to external " + "buffer pool file because the page's space was " + "removed.", + debug_page_no); + debug_space->release(); + }); return; } } @@ -3014,8 +3040,11 @@ void IORequest::write_complete(int io_error) const noexcept else ut_ad(type == IORequest::WRITE_ASYNC); } - else + else { + DBUG_EXECUTE_IF( + "ib_ext_bp_write_io_error", if (ext_buf()) { io_error= 1; }); buf_page_write_complete(*this, io_error); + } if (!ext_buf()) space->complete_write();