[Commits] Rev 3678: MDEV-4906: When event apply fails, next SQL thread start erroneously commits the failing GTID to gtid_slave_pos in http://bazaar.launchpad.net/~maria-captains/maria/10.0

knielsen at knielsen-hq.org knielsen at knielsen-hq.org
Tue Aug 20 14:53:25 EEST 2013


At http://bazaar.launchpad.net/~maria-captains/maria/10.0

------------------------------------------------------------
revno: 3678
revision-id: knielsen at knielsen-hq.org-20130820114450-91f91xkg2omt3yrh
parent: sanja at montyprogram.com-20130820114829-ze320del4ej58tyg
committer: knielsen at knielsen-hq.org
branch nick: work-10.0-mdev26
timestamp: Tue 2013-08-20 13:44:50 +0200
message:
  MDEV-4906: When event apply fails, next SQL thread start erroneously commits the failing GTID to gtid_slave_pos
  
  When a GTID event is executed, we remember the contained GTID position so that
  when we have applied the entire event group we can commit it to
  gtid_slave_pos.
  
  However, if the event group fails to apply due to some error and the SQL
  thread aborts, the code did not correctly clear the remembered GTID. Thus,
  when the SQL thread was restarted, the old GTID of the failing event group
  was incorrectly committed to gtid_slave_pos when the initial rotate event
  was executed, corrupting the GTID position.
=== modified file 'mysql-test/suite/rpl/r/rpl_gtid_errorhandling.result'
--- a/mysql-test/suite/rpl/r/rpl_gtid_errorhandling.result	2013-06-21 09:53:46 +0000
+++ b/mysql-test/suite/rpl/r/rpl_gtid_errorhandling.result	2013-08-20 11:44:50 +0000
@@ -173,6 +173,33 @@ SELECT * FROM t2;
 SET sql_log_bin=0;
 CALL mtr.add_suppression("Slave: Could not update replication slave gtid state");
 SET sql_log_bin=1;
+*** MDEV-4906: When event apply fails, next SQL thread start errorneously commits the failing GTID to gtid_slave_pos ***
+include/stop_slave.inc
+SET sql_log_bin=0;
+DELETE FROM t2;
+SET sql_log_bin=1;
+SET @old_format=@@binlog_format;
+SET GLOBAL binlog_format='row';
+include/start_slave.inc
+SET @old_format=@@binlog_format;
+SET binlog_format='row';
+DELETE FROM t2;
+SET binlog_format=@old_format;
+include/wait_for_slave_sql_error.inc [errno=1032]
+result
+OK
+STOP SLAVE IO_THREAD;
+START SLAVE;
+include/wait_for_slave_sql_error.inc [errno=1032]
+result
+OK
+STOP SLAVE IO_THREAD;
+SET sql_log_bin=0;
+INSERT INTO t2 VALUES (1);
+CALL mtr.add_suppression("Slave: Can't find record in 't2' Error_code: 1032");
+SET sql_log_bin=1;
+include/start_slave.inc
+SET GLOBAL binlog_format=@old_format;
 DROP TABLE t1;
 DROP TABLE t2;
 include/rpl_end.inc

=== modified file 'mysql-test/suite/rpl/t/rpl_gtid_errorhandling.test'
--- a/mysql-test/suite/rpl/t/rpl_gtid_errorhandling.test	2013-06-21 09:53:46 +0000
+++ b/mysql-test/suite/rpl/t/rpl_gtid_errorhandling.test	2013-08-20 11:44:50 +0000
@@ -230,6 +230,57 @@ CALL mtr.add_suppression("Slave: Could n
 SET sql_log_bin=1;
 
 
+--echo *** MDEV-4906: When event apply fails, next SQL thread start errorneously commits the failing GTID to gtid_slave_pos ***
+
+--connection slave
+--source include/stop_slave.inc
+SET sql_log_bin=0;
+DELETE FROM t2;
+SET sql_log_bin=1;
+SET @old_format=@@binlog_format;
+SET GLOBAL binlog_format='row';
+--source include/start_slave.inc
+
+--connection master
+SET @old_format=@@binlog_format;
+SET binlog_format='row';
+--let $gtid_pos1=`SELECT @@GLOBAL.gtid_binlog_pos`
+DELETE FROM t2;
+SET binlog_format=@old_format;
+--save_master_pos
+
+--connection slave
+--let $slave_sql_errno= 1032
+--source include/wait_for_slave_sql_error.inc
+# Disable query to avoid result file update if precise GTID value changes.
+--disable_query_log
+SET @x=@@GLOBAL.gtid_slave_pos;
+eval SELECT IF(@x='$gtid_pos1', "OK", CONCAT("ERROR: expected $gtid_pos1 got ", @x)) AS result;
+--enable_query_log
+
+# The bug was that upon restarting the SQL thread, the GTID for the
+# failing event group was not cleared, so we would update it in the
+# gtid_slave_pos as part of the first rotate event, corrupting the
+# replication.
+STOP SLAVE IO_THREAD;
+START SLAVE;
+--let $slave_sql_errno= 1032
+--source include/wait_for_slave_sql_error.inc
+# Disable query to avoid result file update if precise GTID value changes.
+--disable_query_log
+SET @x=@@GLOBAL.gtid_slave_pos;
+eval SELECT IF(@x='$gtid_pos1', "OK", CONCAT("ERROR: expected $gtid_pos1 got ", @x)) AS result;
+--enable_query_log
+
+STOP SLAVE IO_THREAD;
+SET sql_log_bin=0;
+INSERT INTO t2 VALUES (1);
+CALL mtr.add_suppression("Slave: Can't find record in 't2' Error_code: 1032");
+SET sql_log_bin=1;
+--source include/start_slave.inc
+--sync_with_master
+SET GLOBAL binlog_format=@old_format;
+
 --connection master
 DROP TABLE t1;
 DROP TABLE t2;

=== modified file 'sql/slave.cc'
--- a/sql/slave.cc	2013-07-17 19:24:29 +0000
+++ b/sql/slave.cc	2013-08-20 11:44:50 +0000
@@ -3160,6 +3160,14 @@ int apply_event_and_update_pos(Log_event
       DBUG_RETURN(2);
     }
   }
+  else
+  {
+    /*
+      Make sure we do not errorneously update gtid_slave_pos with a lingering
+      GTID from this failed event group (MDEV-4906).
+    */
+    rli->gtid_sub_id= 0;
+  }
 
   DBUG_RETURN(exec_res ? 1 : 0);
 }
@@ -4094,6 +4102,7 @@ pthread_handler_t handle_slave_sql(void
   rli->trans_retries= 0; // start from "no error"
   DBUG_PRINT("info", ("rli->trans_retries: %lu", rli->trans_retries));
 
+  rli->gtid_sub_id= 0;
   if (init_relay_log_pos(rli,
                          rli->group_relay_log_name,
                          rli->group_relay_log_pos,



More information about the commits mailing list