###############################################################################
# Bug #19975697 SLAVE IO_THREAD MAY GET STUCK WHEN USING GTID AND LOW
# SLAVE_NET_TIMEOUTS
#
# Problem: Dump thread is not checking the necessity of a heartbeat event
# while it is scanning through the binary log files and skipping some groups
# which were already present at Slave.
# Fix: Dump thread will check whether it is time to send a heartbeat event or
# not before skipping an event. If so, it will send one to Slave.
#
# Test outline:
#   1. Configure the slave with a heartbeat period of 5 seconds and a
#      slave_net_timeout of 10 seconds.
#   2. On the master, inject a 2 second sleep after every skipped event.
#   3. Replicate a CREATE and an INSERT, then stop the slave.
#   4. Hold a dump thread inside the inner loop, restart the slave threads
#      so that the held dump thread becomes a zombie, then release it.
#   5. Verify that the slave still syncs the final DROP TABLE.
###############################################################################
--source include/have_binlog_format_statement.inc
--source include/have_gtid.inc
--source include/have_debug_sync.inc
--source include/force_restart.inc

# Problem is reproduced only with gtid auto position protocol.
--let $use_gtids=1
# The slave threads are started explicitly below, after the timeouts are set.
--let $rpl_skip_start_slave=1
--source include/master-slave.inc

# Set slave_net_timeout to 10 seconds and then start slave threads.
# The original value is saved so that it can be restored during cleanup.
--connection slave
CHANGE MASTER TO MASTER_HEARTBEAT_PERIOD=5;
SET @save_old_slave_net_timeout=@@global.slave_net_timeout;
SET @@global.slave_net_timeout=10;
--source include/start_slave.inc

# In order to reproduce the bug with smaller test data,
# inject a 2 second delay after a skipped event.
--connection master
SET @save_debug=@@global.debug;
SET GLOBAL DEBUG='+d,inject_2sec_sleep_when_skipping_an_event';

# Have some gtid transactions in the binary log.
# The CREATE and INSERT statements below would have created '6' sub events:
#
#   GTID 1
#   CREATE
#   GTID 2
#   BEGIN
#   INSERT
#   COMMIT
#
CREATE TABLE t1(i INT) engine=innodb;
INSERT INTO t1 VALUES (1);

# Sync it with Slave
--source include/sync_slave_sql_with_master.inc

# Stop the Replication
--source include/stop_slave.inc

# Create another gtid transaction at Master
--connection master
# Also, we introduce 'hold_dump_thread_inside_inner_loop' debug simulation now
# to see what happens if dump thread is killed when the logic is
# in inner while loop. The server code resets this debug point
# after it is triggered once. So the sync of the following 'drop table'
# statement will happen with the second dump thread that will be created
# after slave triggers for another dump request after slave_net_timeout(10)
# seconds.
--source include/stop_dump_threads.inc
SET GLOBAL DEBUG='+d,hold_dump_thread_inside_inner_loop';
DROP TABLE t1;

# Start the Replication
# This will create a dump thread on Master and because of
# debug point, it will wait in inner loop.
--connection slave
--source include/start_slave.inc
--connection master
SET DEBUG_SYNC='now WAIT_FOR signal_inside_inner_loop';

# When a dump thread is waiting inside inner loop,
# restart slave threads which will kill the existing
# dump thread and will recreate it.
--connection slave
--source include/stop_slave.inc

# Reset debug point for the next dump thread to ignore it
--connection master
SET GLOBAL DEBUG='-d,hold_dump_thread_inside_inner_loop';

# Start slave thread
--connection slave
--source include/start_slave.inc

# Signal the zombie dump thread to go ahead.
--connection master
SET DEBUG_SYNC='now signal signal_continue';

# Now Dump thread-I/O thread communication protocol will detect that
# it does not have recent gtid transaction(drop table) and Master
# should send that transaction to Slave. This detection will take
# 12 seconds because of the injected 2 seconds delay after every skipped
# event (Please see above) which is more than the slave_net_timeout.
# But now, after the fix, the server will check for necessity of sending a
# heartbeat event even while it is skipping the events, the below
# sync should not be timed out after the fix.
--connection master
--source include/sync_slave_sql_with_master.inc

# Cleanup: restore the debug flags on the master and the saved
# slave_net_timeout on the slave.
--connection master
SET GLOBAL DEBUG=@save_debug;
--connection slave
SET @@global.slave_net_timeout=@save_old_slave_net_timeout;
--source include/rpl_end.inc