[Commits] Rev 3434: MDEV-232: Remove one fsync() from commit phase. in http://bazaar.launchpad.net/~maria-captains/maria/10.0

knielsen at knielsen-hq.org knielsen at knielsen-hq.org
Thu Sep 13 15:31:31 EEST 2012


At http://bazaar.launchpad.net/~maria-captains/maria/10.0

------------------------------------------------------------
revno: 3434
revision-id: knielsen at knielsen-hq.org-20120913123129-kaujy4cw0jc9o08k
parent: knielsen at knielsen-hq.org-20120622094628-80irmpvbe6o7z0jq
committer: knielsen at knielsen-hq.org
branch nick: work-10.0-mdev225-181-232
timestamp: Thu 2012-09-13 14:31:29 +0200
message:
  MDEV-232: Remove one fsync() from commit phase.
  
  Introduce a new storage engine API method commit_checkpoint_request().
  This is used to replace the fsync() at the end of every storage engine
  commit with a single fsync() when a binlog is rotated.
  
  Binlog rotation is now done during group commit instead of being
  delayed until unlog(), removing some server stalls and avoiding an
  expensive lock/unlock of LOCK_log inside unlog().
=== modified file 'mysql-test/extra/rpl_tests/rpl_insert_delayed.test'
--- a/mysql-test/extra/rpl_tests/rpl_insert_delayed.test	2012-02-06 21:55:17 +0000
+++ b/mysql-test/extra/rpl_tests/rpl_insert_delayed.test	2012-09-13 12:31:29 +0000
@@ -133,7 +133,7 @@ if  (`SELECT @@global.binlog_format = 'S
 {
   #must show two INSERT DELAYED
   --let $binlog_file= query_get_value(SHOW MASTER STATUS, File, 1)
-  --let $binlog_limit= 1,6
+  --let $binlog_limit= 2,6
   --source include/show_binlog_events.inc
 }
 select * from t1;

=== modified file 'mysql-test/r/mysqlbinlog.result'
--- a/mysql-test/r/mysqlbinlog.result	2012-08-24 13:29:01 +0000
+++ b/mysql-test/r/mysqlbinlog.result	2012-09-13 12:31:29 +0000
@@ -892,6 +892,7 @@ DROP DATABASE test1;
 FLUSH LOGS;
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       CREATE DATABASE test1
 master-bin.000002       #       Query   #       #       use `test1`; CREATE TABLE t1(id int)
 master-bin.000002       #       Query   #       #       DROP DATABASE test1

=== modified file 'mysql-test/r/mysqlbinlog2.result'
--- a/mysql-test/r/mysqlbinlog2.result	2012-08-24 13:29:01 +0000
+++ b/mysql-test/r/mysqlbinlog2.result	2012-09-13 12:31:29 +0000
@@ -697,7 +697,6 @@ SET @@session.lc_time_names=0/*!*/;
 SET @@session.collation_database=DEFAULT/*!*/;
 BEGIN
 /*!*/;
-SET INSERT_ID=6/*!*/;
 DELIMITER ;
 # End of log file
 ROLLBACK /* added by mysqlbinlog */;
@@ -1483,17 +1482,6 @@ COMMIT
 /*!*/;
 DELIMITER ;
 DELIMITER /*!*/;
-SET TIMESTAMP=1579609943/*!*/;
-SET @@session.pseudo_thread_id=999999999/*!*/;
-SET @@session.foreign_key_checks=1, @@session.sql_auto_is_null=0, @@session.unique_checks=1, @@session.autocommit=1/*!*/;
-SET @@session.sql_mode=0/*!*/;
-SET @@session.auto_increment_increment=1, @@session.auto_increment_offset=1/*!*/;
-/*!\C latin1 *//*!*/;
-SET @@session.character_set_client=8,@@session.collation_connection=8,@@session.collation_server=8/*!*/;
-SET @@session.lc_time_names=0/*!*/;
-SET @@session.collation_database=DEFAULT/*!*/;
-BEGIN
-/*!*/;
 DELIMITER ;
 # End of log file
 ROLLBACK /* added by mysqlbinlog */;

=== added file 'mysql-test/suite/binlog/r/binlog_checkpoint.result'
--- a/mysql-test/suite/binlog/r/binlog_checkpoint.result	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/binlog/r/binlog_checkpoint.result	2012-09-13 12:31:29 +0000
@@ -0,0 +1,88 @@
+SET @old_max_binlog_size= @@global.max_binlog_size;
+SET GLOBAL max_binlog_size= 4096;
+SET @old_innodb_flush_log_at_trx_commit= @@global.innodb_flush_log_at_trx_commit;
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
+RESET MASTER;
+CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
+CREATE TABLE t2 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Myisam;
+*** Test that RESET MASTER waits for pending commit checkpoints to complete.
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con1_ready WAIT_FOR con1_go";
+INSERT INTO t1 VALUES (1, REPEAT("x", 4100));
+SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
+INSERT INTO t2 VALUES (1, REPEAT("x", 4100));
+INSERT INTO t2 VALUES (2, REPEAT("x", 4100));
+show binary logs;
+Log_name        File_size
+master-bin.000001       #
+master-bin.000002       #
+master-bin.000003       #
+master-bin.000004       #
+show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.00000<binlog_start>  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.000001
+SET DEBUG_SYNC= "execute_command_after_close_tables SIGNAL reset_master_done";
+RESET MASTER;
+This will timeout, as RESET MASTER is blocked
+SET DEBUG_SYNC= "now WAIT_FOR reset_master_done TIMEOUT 1";
+Warnings:
+Warning 1639    debug sync point wait timed out
+SET DEBUG_SYNC= "now SIGNAL con1_go";
+show binary logs;
+Log_name        File_size
+master-bin.000001       #
+show binlog events in 'master-bin.000001' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000001       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000001       #       Binlog_checkpoint       #       #       master-bin.000001
+*** Test that binlog N is active, and commit checkpoint for (N-1) is
+*** done while there is still a pending commit checkpoint for (N-2).
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con1_ready WAIT_FOR con1_continue";
+INSERT INTO t1 VALUES (20, REPEAT("x", 4100));
+SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con2_ready WAIT_FOR con2_continue";
+INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
+SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
+show binary logs;
+Log_name        File_size
+master-bin.000001       #
+master-bin.000002       #
+master-bin.000003       #
+show binlog events in 'master-bin.000001' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000001       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000001       #       Binlog_checkpoint       #       #       master-bin.000001
+master-bin.000001       #       Query   #       #       BEGIN
+master-bin.000001       #       Table_map       #       #       table_id: # (test.t1)
+master-bin.000001       #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.000001       #       Xid     #       #       COMMIT /* XID */
+master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=<binlog_start>
+show binlog events in 'master-bin.000002' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000001
+master-bin.000002       #       Query   #       #       BEGIN
+master-bin.000002       #       Table_map       #       #       table_id: # (test.t1)
+master-bin.000002       #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.000002       #       Xid     #       #       COMMIT /* XID */
+master-bin.000002       #       Rotate  #       #       master-bin.000003;pos=<binlog_start>
+show binlog events in 'master-bin.000003' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
+SET DEBUG_SYNC= "now SIGNAL con2_continue";
+con1 is still pending, no new binlog checkpoint should have been logged.
+show binlog events in 'master-bin.000003' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
+SET DEBUG_SYNC= "now SIGNAL con1_continue";
+No commit checkpoints are pending, a new binlog checkpoint should have been logged.
+show binlog events in 'master-bin.000003' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
+master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000003
+DROP TABLE t1, t2;
+SET GLOBAL max_binlog_size= @old_max_binlog_size;
+SET GLOBAL innodb_flush_log_at_trx_commit= @old_innodb_flush_log_at_trx_commit;

=== modified file 'mysql-test/suite/binlog/r/binlog_mdev342.result'
--- a/mysql-test/suite/binlog/r/binlog_mdev342.result	2012-06-22 08:42:55 +0000
+++ b/mysql-test/suite/binlog/r/binlog_mdev342.result	2012-09-13 12:31:29 +0000
@@ -21,6 +21,7 @@ master-bin.000002	#
 show binlog events in 'master-bin.000001' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 master-bin.000001       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000001       #       Binlog_checkpoint       #       #       master-bin.000001
 master-bin.000001       #       Query   #       #       use `test`; CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb
 master-bin.000001       #       Query   #       #       BEGIN
 master-bin.000001       #       Table_map       #       #       table_id: # (test.t1)

=== modified file 'mysql-test/suite/binlog/r/binlog_row_binlog.result'
--- a/mysql-test/suite/binlog/r/binlog_row_binlog.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/binlog/r/binlog_row_binlog.result	2012-09-13 12:31:29 +0000
@@ -234,6 +234,7 @@ master-bin.000001	#	Xid	#	#	COMMIT /* XI
 master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=4
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 set @ac = @@autocommit;
 set autocommit= 0;

=== modified file 'mysql-test/suite/binlog/r/binlog_row_mysqlbinlog_options.result'
--- a/mysql-test/suite/binlog/r/binlog_row_mysqlbinlog_options.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/binlog/r/binlog_row_mysqlbinlog_options.result	2012-09-13 12:31:29 +0000
@@ -34,8 +34,8 @@ DELIMITER /*!*/;
 # at #
 #010909  4:46:40 server id #  end_log_pos #     Start: binlog v 4, server v #.##.## created 010909  4:46:40 at startup
 ROLLBACK/*!*/;
-#010909  4:46:40 server id #  end_log_pos #     Binlog checkpoint master-bin.000001
 # at #
+#010909  4:46:40 server id #  end_log_pos #     Binlog checkpoint master-bin.000001
 # at #
 use `new_test1`/*!*/;
 #010909  4:46:40 server id #  end_log_pos #     Query   thread_id=#     exec_time=#     error_code=0
@@ -230,8 +230,8 @@ DELIMITER /*!*/;
 # at #
 #010909  4:46:40 server id #  end_log_pos #     Start: binlog v 4, server v #.##.## created 010909  4:46:40 at startup
 ROLLBACK/*!*/;
-#010909  4:46:40 server id #  end_log_pos #     Binlog checkpoint master-bin.000001
 # at #
+#010909  4:46:40 server id #  end_log_pos #     Binlog checkpoint master-bin.000001
 # at #
 use `new_test1`/*!*/;
 #010909  4:46:40 server id #  end_log_pos #     Query   thread_id=#     exec_time=#     error_code=0

=== modified file 'mysql-test/suite/binlog/r/binlog_stm_binlog.result'
--- a/mysql-test/suite/binlog/r/binlog_stm_binlog.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/binlog/r/binlog_stm_binlog.result	2012-09-13 12:31:29 +0000
@@ -145,6 +145,7 @@ master-bin.000001	#	Xid	#	#	COMMIT /* XI
 master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=4
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 set @ac = @@autocommit;
 set autocommit= 0;

=== modified file 'mysql-test/suite/binlog/r/binlog_xa_recover.result'
--- a/mysql-test/suite/binlog/r/binlog_xa_recover.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/binlog/r/binlog_xa_recover.result	2012-09-13 12:31:29 +0000
@@ -1,175 +1,198 @@
 SET GLOBAL max_binlog_size= 4096;
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
+RESET MASTER;
 CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
-CREATE TABLE t2 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Myisam;
-SET @@global.debug_dbug='+d,skip_commit_ordered';
-INSERT INTO t1 VALUES (0, REPEAT("x", 4100));
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con1_ready WAIT_FOR _ever";
+INSERT INTO t1 VALUES (100, REPEAT("x", 4100));
+INSERT INTO t1 VALUES (101, REPEAT("x", 4100));
+INSERT INTO t1 VALUES (102, REPEAT("x", 4100));
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con1_wait WAIT_FOR con1_cont";
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con1_ready WAIT_FOR _ever";
 INSERT INTO t1 VALUES (1, REPEAT("x", 4100));
-SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
-INSERT INTO t2 VALUES (1, "force binlog rotation");
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con2_ready WAIT_FOR _ever";
+SET DEBUG_SYNC= "now WAIT_FOR con1_wait";
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con2_wait WAIT_FOR con2_cont";
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con2_ready WAIT_FOR _ever";
 INSERT INTO t1 VALUES (2, NULL);
-SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con3_ready WAIT_FOR _ever";
+SET DEBUG_SYNC= "now WAIT_FOR con2_wait";
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con3_wait WAIT_FOR con3_cont";
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con3_ready WAIT_FOR _ever";
 INSERT INTO t1 VALUES (3, REPEAT("x", 4100));
+SET DEBUG_SYNC= "now WAIT_FOR con3_wait";
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con4_wait WAIT_FOR con4_cont";
+SET SESSION debug_dbug="+d,crash_commit_after_log";
+INSERT INTO t1 VALUES (4, NULL);
+SET DEBUG_SYNC= "now WAIT_FOR con4_wait";
+SET DEBUG_SYNC= "now SIGNAL con1_cont";
+SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
+SET DEBUG_SYNC= "now SIGNAL con2_cont";
+SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
+SET DEBUG_SYNC= "now SIGNAL con3_cont";
 SET DEBUG_SYNC= "now WAIT_FOR con3_ready";
-INSERT INTO t2 VALUES (2, "force binlog rotation");
-FLUSH TABLES t2;
 show binary logs;
 Log_name        File_size
 master-bin.000001       #
 master-bin.000002       #
 master-bin.000003       #
 master-bin.000004       #
-show binlog events in 'master-bin.000001' from <binlog_start>;
-Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000001       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.000001       #       Binlog_checkpoint       #       #       master-bin.000001
-master-bin.000001       #       Query   #       #       use `test`; CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb
-master-bin.000001       #       Query   #       #       use `test`; CREATE TABLE t2 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Myisam
-master-bin.000001       #       Query   #       #       BEGIN
-master-bin.000001       #       Table_map       #       #       table_id: # (test.t1)
-master-bin.000001       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000001       #       Xid     #       #       COMMIT /* XID */
-master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=<binlog_start>
-show binlog events in 'master-bin.000002' from <binlog_start>;
-Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000002       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
-master-bin.000002       #       Query   #       #       BEGIN
-master-bin.000002       #       Table_map       #       #       table_id: # (test.t1)
-master-bin.000002       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000002       #       Xid     #       #       COMMIT /* XID */
-master-bin.000002       #       Query   #       #       BEGIN
-master-bin.000002       #       Table_map       #       #       table_id: # (test.t2)
-master-bin.000002       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000002       #       Query   #       #       COMMIT
-master-bin.000002       #       Rotate  #       #       master-bin.000003;pos=<binlog_start>
+master-bin.000005       #
+master-bin.000006       #
 show binlog events in 'master-bin.000003' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
 master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000002
+master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000003
 master-bin.000003       #       Query   #       #       BEGIN
 master-bin.000003       #       Table_map       #       #       table_id: # (test.t1)
 master-bin.000003       #       Write_rows      #       #       table_id: # flags: STMT_END_F
 master-bin.000003       #       Xid     #       #       COMMIT /* XID */
-master-bin.000003       #       Query   #       #       BEGIN
-master-bin.000003       #       Table_map       #       #       table_id: # (test.t1)
-master-bin.000003       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000003       #       Xid     #       #       COMMIT /* XID */
-master-bin.000003       #       Query   #       #       BEGIN
-master-bin.000003       #       Table_map       #       #       table_id: # (test.t2)
-master-bin.000003       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000003       #       Query   #       #       COMMIT
 master-bin.000003       #       Rotate  #       #       master-bin.00000<binlog_start>;pos=<binlog_start>
 show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 master-bin.00000<binlog_start>  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.000002
-master-bin.00000<binlog_start>  #       Query   #       #       use `test`; FLUSH TABLES t2
-We should see only one entry here, a=0:
-SELECT a FROM t1 ORDER BY a;
-a
-0
-PURGE BINARY LOGS TO "master-bin.000004";
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.000003
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.00000<binlog_start>
+master-bin.00000<binlog_start>  #       Query   #       #       BEGIN
+master-bin.00000<binlog_start>  #       Table_map       #       #       table_id: # (test.t1)
+master-bin.00000<binlog_start>  #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.00000<binlog_start>  #       Xid     #       #       COMMIT /* XID */
+master-bin.00000<binlog_start>  #       Rotate  #       #       master-bin.000005;pos=<binlog_start>
+show binlog events in 'master-bin.000005' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000005       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000005       #       Binlog_checkpoint       #       #       master-bin.00000<binlog_start>
+master-bin.000005       #       Query   #       #       BEGIN
+master-bin.000005       #       Table_map       #       #       table_id: # (test.t1)
+master-bin.000005       #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.000005       #       Xid     #       #       COMMIT /* XID */
+master-bin.000005       #       Query   #       #       BEGIN
+master-bin.000005       #       Table_map       #       #       table_id: # (test.t1)
+master-bin.000005       #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.000005       #       Xid     #       #       COMMIT /* XID */
+master-bin.000005       #       Rotate  #       #       master-bin.000006;pos=<binlog_start>
+show binlog events in 'master-bin.000006' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000006       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.000006       #       Binlog_checkpoint       #       #       master-bin.00000<binlog_start>
+PURGE BINARY LOGS TO "master-bin.000006";
 show binary logs;
 Log_name        File_size
-master-bin.000002       #
-master-bin.000003       #
 master-bin.000004       #
-SET SESSION debug_dbug="+d,crash_commit_after_log";
-INSERT INTO t1 VALUES (4, NULL);
+master-bin.000005       #
+master-bin.000006       #
+SET DEBUG_SYNC= "now SIGNAL con4_cont";
 Got one of the listed errors
 SELECT a FROM t1 ORDER BY a;
 a
-0
 1
 2
 3
 4
-*** Test that RESET MASTER waits for pending XIDs to be unlogged.
-SET @old_max_binlog_size= @@global.max_binlog_size;
+100
+101
+102
+Test that with multiple binlog checkpoints, recovery starts from the last one.
 SET GLOBAL max_binlog_size= 4096;
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con10_ready WAIT_FOR con10_go";
-INSERT INTO t1 VALUES (10, NULL);
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
+RESET MASTER;
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con10_ready WAIT_FOR con10_cont";
+INSERT INTO t1 VALUES (10, REPEAT("x", 4100));
 SET DEBUG_SYNC= "now WAIT_FOR con10_ready";
-INSERT INTO t2 VALUES (10, REPEAT("x", 4100));
-INSERT INTO t2 VALUES (11, REPEAT("x", 4100));
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con11_ready WAIT_FOR con11_cont";
+INSERT INTO t1 VALUES (11, REPEAT("x", 4100));
+SET DEBUG_SYNC= "now WAIT_FOR con11_ready";
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con12_ready WAIT_FOR con12_cont";
+INSERT INTO t1 VALUES (12, REPEAT("x", 4100));
+SET DEBUG_SYNC= "now WAIT_FOR con12_ready";
+INSERT INTO t1 VALUES (13, NULL);
 show binary logs;
 Log_name        File_size
+master-bin.000001       #
 master-bin.000002       #
 master-bin.000003       #
 master-bin.000004       #
-master-bin.000005       #
-master-bin.000006       #
-master-bin.000007       #
-SET DEBUG_SYNC= "execute_command_after_close_tables SIGNAL reset_master_done";
+show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.00000<binlog_start>  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.000001
+master-bin.00000<binlog_start>  #       Query   #       #       BEGIN
+master-bin.00000<binlog_start>  #       Table_map       #       #       table_id: # (test.t1)
+master-bin.00000<binlog_start>  #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.00000<binlog_start>  #       Xid     #       #       COMMIT /* XID */
+SET DEBUG_SYNC= "now SIGNAL con10_cont";
+SET DEBUG_SYNC= "now SIGNAL con12_cont";
+SET DEBUG_SYNC= "now SIGNAL con11_cont";
+Checking that master-bin.000004 is the last binlog checkpoint
+show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
+Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.00000<binlog_start>  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.000001
+master-bin.00000<binlog_start>  #       Query   #       #       BEGIN
+master-bin.00000<binlog_start>  #       Table_map       #       #       table_id: # (test.t1)
+master-bin.00000<binlog_start>  #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.00000<binlog_start>  #       Xid     #       #       COMMIT /* XID */
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.000002
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.00000<binlog_start>
+Now crash the server
+SET SESSION debug_dbug="+d,crash_commit_after_log";
+INSERT INTO t1 VALUES (14, NULL);
+Got one of the listed errors
+SELECT a FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+10
+11
+12
+13
+14
+100
+101
+102
+*** Check that recovery works if we crashed early during rotate, before
+*** binlog checkpoint event could be written.
+SET GLOBAL max_binlog_size= 4096;
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
 RESET MASTER;
-This will timeout, as RESET MASTER is blocked
-SET DEBUG_SYNC= "now WAIT_FOR reset_master_done TIMEOUT 1";
-Warnings:
-Warning 1639    debug sync point wait timed out
-SET DEBUG_SYNC= "now SIGNAL con10_go";
-show binary logs;
-Log_name        File_size
-master-bin.000001       #
-*** Test that binlog N is active, and last pending trx in (N-1) is
-unlogged while there is still a pending trx in (N-2).
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con10_ready WAIT_FOR con10_continue";
-INSERT INTO t1 VALUES (20, REPEAT("x", 4100));
-SET DEBUG_SYNC= "now WAIT_FOR con10_ready";
-INSERT INTO t2 VALUES (3, "force binlog rotation");
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con11_ready WAIT_FOR con11_continue";
 INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
-SET DEBUG_SYNC= "now WAIT_FOR con11_ready";
-INSERT INTO t2 VALUES (4, "force binlog rotation");
+INSERT INTO t1 VALUES (22, REPEAT("x", 4100));
+INSERT INTO t1 VALUES (23, REPEAT("x", 4100));
+SET SESSION debug_dbug="+d,crash_before_write_checkpoint_event";
+INSERT INTO t1 VALUES (24, REPEAT("x", 4100));
+Got one of the listed errors
+SELECT a FROM t1 ORDER BY a;
+a
+1
+2
+3
+4
+10
+11
+12
+13
+14
+21
+22
+23
+24
+100
+101
+102
 show binary logs;
 Log_name        File_size
 master-bin.000001       #
 master-bin.000002       #
 master-bin.000003       #
-show binlog events in 'master-bin.000001' from <binlog_start>;
-Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000001       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.000001       #       Binlog_checkpoint       #       #       master-bin.000001
-master-bin.000001       #       Query   #       #       BEGIN
-master-bin.000001       #       Table_map       #       #       table_id: # (test.t1)
-master-bin.000001       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000001       #       Xid     #       #       COMMIT /* XID */
-master-bin.000001       #       Query   #       #       BEGIN
-master-bin.000001       #       Table_map       #       #       table_id: # (test.t2)
-master-bin.000001       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000001       #       Query   #       #       COMMIT
-master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=<binlog_start>
-show binlog events in 'master-bin.000002' from <binlog_start>;
-Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000002       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000001
-master-bin.000002       #       Query   #       #       BEGIN
-master-bin.000002       #       Table_map       #       #       table_id: # (test.t1)
-master-bin.000002       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000002       #       Xid     #       #       COMMIT /* XID */
-master-bin.000002       #       Query   #       #       BEGIN
-master-bin.000002       #       Table_map       #       #       table_id: # (test.t2)
-master-bin.000002       #       Write_rows      #       #       table_id: # flags: STMT_END_F
-master-bin.000002       #       Query   #       #       COMMIT
-master-bin.000002       #       Rotate  #       #       master-bin.000003;pos=<binlog_start>
-show binlog events in 'master-bin.000003' from <binlog_start>;
-Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
-SET DEBUG_SYNC= "now SIGNAL con11_continue";
-con10 is still pending, no new binlog checkpoint should have been logged.
-show binlog events in 'master-bin.000003' from <binlog_start>;
-Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
-SET DEBUG_SYNC= "now SIGNAL con10_continue";
-No XIDs are pending, a new binlog checkpoint should have been logged.
-show binlog events in 'master-bin.000003' from <binlog_start>;
+master-bin.000004       #
+master-bin.000005       #
+show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
-master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
-master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000003
-DROP TABLE t1, t2;
-SET GLOBAL max_binlog_size= @old_max_binlog_size;
+master-bin.00000<binlog_start>  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.000003
+master-bin.00000<binlog_start>  #       Binlog_checkpoint       #       #       master-bin.00000<binlog_start>
+master-bin.00000<binlog_start>  #       Query   #       #       BEGIN
+master-bin.00000<binlog_start>  #       Table_map       #       #       table_id: # (test.t1)
+master-bin.00000<binlog_start>  #       Write_rows      #       #       table_id: # flags: STMT_END_F
+master-bin.00000<binlog_start>  #       Xid     #       #       COMMIT /* XID */
+master-bin.00000<binlog_start>  #       Rotate  #       #       master-bin.000005;pos=<binlog_start>
+DROP TABLE t1;

=== added file 'mysql-test/suite/binlog/t/binlog_checkpoint.test'
--- a/mysql-test/suite/binlog/t/binlog_checkpoint.test	1970-01-01 00:00:00 +0000
+++ b/mysql-test/suite/binlog/t/binlog_checkpoint.test	2012-09-13 12:31:29 +0000
@@ -0,0 +1,108 @@
+--source include/have_innodb.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+--source include/have_binlog_format_row.inc
+
+SET @old_max_binlog_size= @@global.max_binlog_size;
+SET GLOBAL max_binlog_size= 4096;
+SET @old_innodb_flush_log_at_trx_commit= @@global.innodb_flush_log_at_trx_commit;
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
+RESET MASTER;
+
+CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
+CREATE TABLE t2 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Myisam;
+
+--echo *** Test that RESET MASTER waits for pending commit checkpoints to complete.
+
+# con1 will hang before doing commit checkpoint, blocking RESET MASTER.
+connect(con1,localhost,root,,);
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con1_ready WAIT_FOR con1_go";
+send INSERT INTO t1 VALUES (1, REPEAT("x", 4100));
+
+connection default;
+SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
+# Let's add a few binlog rotations just for good measure.
+INSERT INTO t2 VALUES (1, REPEAT("x", 4100));
+INSERT INTO t2 VALUES (2, REPEAT("x", 4100));
+--source include/show_binary_logs.inc
+--let $binlog_file= master-bin.000004
+--let $binlog_start= 4
+--source include/show_binlog_events.inc
+SET DEBUG_SYNC= "execute_command_after_close_tables SIGNAL reset_master_done";
+send RESET MASTER;
+
+connect(con2,localhost,root,,);
+--echo This will timeout, as RESET MASTER is blocked
+SET DEBUG_SYNC= "now WAIT_FOR reset_master_done TIMEOUT 1";
+# Wake up transaction to allow RESET MASTER to complete.
+SET DEBUG_SYNC= "now SIGNAL con1_go";
+
+connection con1;
+reap;
+
+connection default;
+reap;
+--source include/show_binary_logs.inc
+--let $binlog_file= master-bin.000001
+--let $binlog_start= 4
+--source include/show_binlog_events.inc
+
+--echo *** Test that binlog N is active, and commit checkpoint for (N-1) is
+--echo *** done while there is still a pending commit checkpoint for (N-2).
+
+connection con1;
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con1_ready WAIT_FOR con1_continue";
+send INSERT INTO t1 VALUES (20, REPEAT("x", 4100));
+
+connection default;
+SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
+
+connection con2;
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con2_ready WAIT_FOR con2_continue";
+send INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
+
+connection default;
+SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
+--source include/show_binary_logs.inc
+--let $binlog_file= master-bin.000001
+--source include/show_binlog_events.inc
+--let $binlog_file= master-bin.000002
+--source include/show_binlog_events.inc
+--let $binlog_file= master-bin.000003
+--source include/show_binlog_events.inc
+
+SET DEBUG_SYNC= "now SIGNAL con2_continue";
+
+connection con2;
+reap;
+
+connection default;
+--echo con1 is still pending, no new binlog checkpoint should have been logged.
+--let $binlog_file= master-bin.000003
+--source include/show_binlog_events.inc
+
+SET DEBUG_SYNC= "now SIGNAL con1_continue";
+
+connection con1;
+reap;
+
+connection default;
+
+--echo No commit checkpoints are pending, a new binlog checkpoint should have been logged.
+--let $binlog_file= master-bin.000003
+
+# Wait for the master-bin.000003 binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "$binlog_file"
+--let $field= Info
+--let $condition= = "master-bin.000003"
+--source include/wait_show_condition.inc
+
+--source include/show_binlog_events.inc
+
+
+# Cleanup
+connection default;
+DROP TABLE t1, t2;
+SET GLOBAL max_binlog_size= @old_max_binlog_size;
+SET GLOBAL innodb_flush_log_at_trx_commit= @old_innodb_flush_log_at_trx_commit;

=== modified file 'mysql-test/suite/binlog/t/binlog_xa_recover-master.opt'
--- a/mysql-test/suite/binlog/t/binlog_xa_recover-master.opt	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/binlog/t/binlog_xa_recover-master.opt	2012-09-13 12:31:29 +0000
@@ -1 +1 @@
---skip-stack-trace --skip-core-file
+--skip-stack-trace --skip-core-file --loose-debug-dbug=+d,xa_recover_expect_master_bin_000004

=== modified file 'mysql-test/suite/binlog/t/binlog_xa_recover.test'
--- a/mysql-test/suite/binlog/t/binlog_xa_recover.test	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/binlog/t/binlog_xa_recover.test	2012-09-13 12:31:29 +0000
@@ -5,81 +5,105 @@
 # Valgrind does not work well with test that crashes the server
 --source include/not_valgrind.inc
 
+# (We do not need to restore these settings, as we crash the server).
 SET GLOBAL max_binlog_size= 4096;
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
+RESET MASTER;
 
 CREATE TABLE t1 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Innodb;
-CREATE TABLE t2 (a INT PRIMARY KEY, b MEDIUMTEXT) ENGINE=Myisam;
-
-# Transactions are not guaranteed stored durably on disk in the engine until
-# they are fsync()ed, which normally happens during commit(). But there is no
-# guarantee that they will _not_ be durable, in particular loosing results
-# of a write(2) system call normally requires a kernel crash (as opposed to
-# just mysqld crash), which is inconvenient to do in a test suite.
-# So instead we do an error insert to prevent commit_ordered() from being
-# called in the engine - so nothing will be written to disk at all, and crash
-# recovery is sure to be needed.
-SET @@global.debug_dbug='+d,skip_commit_ordered';
-
-INSERT INTO t1 VALUES (0, REPEAT("x", 4100));
+# Insert some data to force a few binlog rotations (3), so we get some
+# normal binlog checkpoints before starting the test.
+INSERT INTO t1 VALUES (100, REPEAT("x", 4100));
+INSERT INTO t1 VALUES (101, REPEAT("x", 4100));
+INSERT INTO t1 VALUES (102, REPEAT("x", 4100));
 
 # Now start a bunch of transactions that span multiple binlog
 # files. Leave then in the state prepared-but-not-committed in the engine
 # and crash the server. Check that crash recovery is able to recover all
 # of them.
+#
+# We use debug_sync to get all the transactions into the prepared state before
+# we commit any of them. This is because the prepare step flushes the InnoDB
+# redo log - including any commits made before, so recovery would become
+# unnecessary, decreasing the value of this test.
+#
+# We arrange to have con1 with a prepared transaction in master-bin.000004,
+# con2 and con3 with a prepared transaction in master-bin.000005, and a new
+# empty master-bin.000006. So the latest binlog checkpoint should be
+# master-bin.000006.
 
 connect(con1,localhost,root,,);
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con1_ready WAIT_FOR _ever";
+# First wait after prepare and before write to binlog.
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con1_wait WAIT_FOR con1_cont";
+# Then complete InnoDB commit in memory (but not commit checkpoint / write to
+# disk), and hang until crash, leaving a transaction to be XA recovered.
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con1_ready WAIT_FOR _ever";
 send INSERT INTO t1 VALUES (1, REPEAT("x", 4100));
 
 connection default;
-SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
-INSERT INTO t2 VALUES (1, "force binlog rotation");
+SET DEBUG_SYNC= "now WAIT_FOR con1_wait";
 
 connect(con2,localhost,root,,);
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con2_ready WAIT_FOR _ever";
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con2_wait WAIT_FOR con2_cont";
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con2_ready WAIT_FOR _ever";
 send INSERT INTO t1 VALUES (2, NULL);
 
 connection default;
-SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
+SET DEBUG_SYNC= "now WAIT_FOR con2_wait";
 
 connect(con3,localhost,root,,);
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con3_ready WAIT_FOR _ever";
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con3_wait WAIT_FOR con3_cont";
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con3_ready WAIT_FOR _ever";
 send INSERT INTO t1 VALUES (3, REPEAT("x", 4100));
+
 connection default;
+SET DEBUG_SYNC= "now WAIT_FOR con3_wait";
+
+connect(con4,localhost,root,,);
+SET DEBUG_SYNC= "ha_commit_trans_before_log_and_order SIGNAL con4_wait WAIT_FOR con4_cont";
+SET SESSION debug_dbug="+d,crash_commit_after_log";
+send INSERT INTO t1 VALUES (4, NULL);
+
+connection default;
+SET DEBUG_SYNC= "now WAIT_FOR con4_wait";
+
+SET DEBUG_SYNC= "now SIGNAL con1_cont";
+SET DEBUG_SYNC= "now WAIT_FOR con1_ready";
+SET DEBUG_SYNC= "now SIGNAL con2_cont";
+SET DEBUG_SYNC= "now WAIT_FOR con2_ready";
+SET DEBUG_SYNC= "now SIGNAL con3_cont";
 SET DEBUG_SYNC= "now WAIT_FOR con3_ready";
-INSERT INTO t2 VALUES (2, "force binlog rotation");
-# So we won't get warnings about t2 being crashed.
-FLUSH TABLES t2;
 
 # Check that everything is committed in binary log.
 --source include/show_binary_logs.inc
---let $binlog_file= master-bin.000001
+--let $binlog_file= master-bin.000003
 --let $binlog_start= 4
 --source include/show_binlog_events.inc
---let $binlog_file= master-bin.000002
+--let $binlog_file= master-bin.000004
 --source include/show_binlog_events.inc
---let $binlog_file= master-bin.000003
+--let $binlog_file= master-bin.000005
 --source include/show_binlog_events.inc
---let $binlog_file= master-bin.000004
+--let $binlog_file= master-bin.000006
 --source include/show_binlog_events.inc
 
-# Check that transactions really are not yet committed in engine.
-# (This works because of debug_dbug='+d,skip_commit_ordered').
---echo We should see only one entry here, a=0:
-SELECT a FROM t1 ORDER BY a;
-
 
 # Check that server will not purge too much.
-PURGE BINARY LOGS TO "master-bin.000004";
+PURGE BINARY LOGS TO "master-bin.000006";
 --source include/show_binary_logs.inc
 
 # Now crash the server with one more transaction in prepared state.
-system echo wait-binlog_xa_recover.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
-SET SESSION debug_dbug="+d,crash_commit_after_log";
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+wait-binlog_xa_recover.test
+EOF
+SET DEBUG_SYNC= "now SIGNAL con4_cont";
+connection con4;
 --error 2006,2013
-INSERT INTO t1 VALUES (4, NULL);
+reap;
 
-system echo restart-group_commit_binlog_pos.test >> $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
+--remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart-group_commit_binlog_pos.test
+EOF
 
 connection default;
 --enable_reconnect
@@ -88,87 +112,128 @@ connection default;
 # Check that all transactions are recovered.
 SELECT a FROM t1 ORDER BY a;
 
+--echo Test that with multiple binlog checkpoints, recovery starts from the last one.
+SET GLOBAL max_binlog_size= 4096;
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
+RESET MASTER;
 
---echo *** Test that RESET MASTER waits for pending XIDs to be unlogged.
+# Rotate to binlog master-bin.000003 while delaying binlog checkpoints.
+# So we get multiple binlog checkpoints in master-bin.000003.
+# Then complete the checkpoints, crash, and check that we only scan
+# the necessary binlog file (i.e. that we use the _last_ checkpoint).
 
-SET @old_max_binlog_size= @@global.max_binlog_size;
-SET GLOBAL max_binlog_size= 4096;
-# con10 will hang with a pending XID, blocking RESET MASTER.
 connect(con10,localhost,root,,);
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con10_ready WAIT_FOR con10_go";
-send INSERT INTO t1 VALUES (10, NULL);
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con10_ready WAIT_FOR con10_cont";
+send INSERT INTO t1 VALUES (10, REPEAT("x", 4100));
 
 connection default;
 SET DEBUG_SYNC= "now WAIT_FOR con10_ready";
-# Let's add a few binlog rotations just for good measure.
-INSERT INTO t2 VALUES (10, REPEAT("x", 4100));
-INSERT INTO t2 VALUES (11, REPEAT("x", 4100));
---source include/show_binary_logs.inc
-SET DEBUG_SYNC= "execute_command_after_close_tables SIGNAL reset_master_done";
-send RESET MASTER;
 
 connect(con11,localhost,root,,);
---echo This will timeout, as RESET MASTER is blocked
-SET DEBUG_SYNC= "now WAIT_FOR reset_master_done TIMEOUT 1";
-# Wake up transaction to allow RESET MASTER to complete.
-SET DEBUG_SYNC= "now SIGNAL con10_go";
-
-connection con10;
-reap;
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con11_ready WAIT_FOR con11_cont";
+send INSERT INTO t1 VALUES (11, REPEAT("x", 4100));
 
 connection default;
-reap;
---source include/show_binary_logs.inc
+SET DEBUG_SYNC= "now WAIT_FOR con11_ready";
 
+connect(con12,localhost,root,,);
+SET DEBUG_SYNC= "commit_after_group_release_commit_ordered SIGNAL con12_ready WAIT_FOR con12_cont";
+send INSERT INTO t1 VALUES (12, REPEAT("x", 4100));
 
---echo *** Test that binlog N is active, and last pending trx in (N-1) is
---echo unlogged while there is still a pending trx in (N-2).
+connection default;
+SET DEBUG_SYNC= "now WAIT_FOR con12_ready";
+INSERT INTO t1 VALUES (13, NULL);
 
+--source include/show_binary_logs.inc
+--let $binlog_file= master-bin.000004
+--let $binlog_start= 4
+--source include/show_binlog_events.inc
+
+SET DEBUG_SYNC= "now SIGNAL con10_cont";
 connection con10;
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con10_ready WAIT_FOR con10_continue";
-send INSERT INTO t1 VALUES (20, REPEAT("x", 4100));
+reap;
+connection default;
+SET DEBUG_SYNC= "now SIGNAL con12_cont";
+connection con12;
+reap;
+connection default;
+SET DEBUG_SYNC= "now SIGNAL con11_cont";
+connection con11;
+reap;
 
 connection default;
-SET DEBUG_SYNC= "now WAIT_FOR con10_ready";
-INSERT INTO t2 VALUES (3, "force binlog rotation");
+# Wait for the last (master-bin.000004) binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
+--let $field= Info
+--let $condition= = "master-bin.000004"
+--source include/wait_show_condition.inc
+
+--echo Checking that master-bin.000004 is the last binlog checkpoint
+--source include/show_binlog_events.inc
+
+--echo Now crash the server
+# It is not too easy to test XA recovery, as it runs early during server
+# startup, before any connections can be made.
+# What we do is set a DBUG error insert which will crash if XA recovery
+# starts from any other binlog than master-bin.000004 (check the file
+# binlog_xa_recover-master.opt). Then we will fail here if XA recovery
+# would start from the wrong place.
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+wait-binlog_xa_recover.test
+EOF
+SET SESSION debug_dbug="+d,crash_commit_after_log";
+--error 2006,2013
+INSERT INTO t1 VALUES (14, NULL);
 
-connection con11;
-SET DEBUG_SYNC= "ha_commit_trans_after_log_and_order SIGNAL con11_ready WAIT_FOR con11_continue";
-send INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
+--remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart-group_commit_binlog_pos.test
+EOF
 
 connection default;
-SET DEBUG_SYNC= "now WAIT_FOR con11_ready";
-INSERT INTO t2 VALUES (4, "force binlog rotation");
---source include/show_binary_logs.inc
---let $binlog_file= master-bin.000001
---source include/show_binlog_events.inc
---let $binlog_file= master-bin.000002
---source include/show_binlog_events.inc
---let $binlog_file= master-bin.000003
---source include/show_binlog_events.inc
+--enable_reconnect
+--source include/wait_until_connected_again.inc
 
-SET DEBUG_SYNC= "now SIGNAL con11_continue";
+# Check that all transactions are recovered.
+SELECT a FROM t1 ORDER BY a;
 
-connection con11;
-reap;
 
-connection default;
---echo con10 is still pending, no new binlog checkpoint should have been logged.
---let $binlog_file= master-bin.000003
---source include/show_binlog_events.inc
+--echo *** Check that recovery works if we crashed early during rotate, before
+--echo *** binlog checkpoint event could be written.
 
-SET DEBUG_SYNC= "now SIGNAL con10_continue";
+SET GLOBAL max_binlog_size= 4096;
+SET GLOBAL innodb_flush_log_at_trx_commit= 1;
+RESET MASTER;
 
-connection con10;
-reap;
+# We need some initial data to reach binlog master-bin.000004. Otherwise
+# crash recovery fails due to the error insert used for previous test.
+INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
+INSERT INTO t1 VALUES (22, REPEAT("x", 4100));
+INSERT INTO t1 VALUES (23, REPEAT("x", 4100));
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+wait-binlog_xa_recover.test
+EOF
+SET SESSION debug_dbug="+d,crash_before_write_checkpoint_event";
+--error 2006,2013
+INSERT INTO t1 VALUES (24, REPEAT("x", 4100));
 
-connection default;
---echo No XIDs are pending, a new binlog checkpoint should have been logged.
---let $binlog_file= master-bin.000003
---source include/show_binlog_events.inc
+--remove_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+--write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
+restart-group_commit_binlog_pos.test
+EOF
 
+--enable_reconnect
+--source include/wait_until_connected_again.inc
+
+# Check that all transactions are recovered.
+SELECT a FROM t1 ORDER BY a;
+
+--source include/show_binary_logs.inc
+--let $binlog_file= master-bin.000004
+--let $binlog_start= 4
+--source include/show_binlog_events.inc
 
 # Cleanup
 connection default;
-DROP TABLE t1, t2;
-SET GLOBAL max_binlog_size= @old_max_binlog_size;
+DROP TABLE t1;

=== modified file 'mysql-test/suite/innodb/r/binlog_consistent.result'
--- a/mysql-test/suite/innodb/r/binlog_consistent.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/innodb/r/binlog_consistent.result	2012-09-13 12:31:29 +0000
@@ -63,15 +63,15 @@ binlog_snapshot_file	master-bin.000001
 binlog_snapshot_position        945
 SHOW MASTER STATUS;
 File    Position        Binlog_Do_DB    Binlog_Ignore_DB
-master-bin.000002       286             
+master-bin.000002       326             
 COMMIT;
 SHOW STATUS LIKE 'binlog_snapshot_%';
 Variable_name   Value
 binlog_snapshot_file    master-bin.000002
-binlog_snapshot_position        286
+binlog_snapshot_position        326
 SHOW MASTER STATUS;
 File    Position        Binlog_Do_DB    Binlog_Ignore_DB
-master-bin.000002       286             
+master-bin.000002       326             
 SHOW BINLOG EVENTS;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 master-bin.000001       4       Format_desc     1       246     Server ver: #, Binlog ver: #

=== modified file 'mysql-test/suite/innodb/r/group_commit_binlog_pos.result'
--- a/mysql-test/suite/innodb/r/group_commit_binlog_pos.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/innodb/r/group_commit_binlog_pos.result	2012-09-13 12:31:29 +0000
@@ -1,3 +1,4 @@
+SET GLOBAL innodb_flush_log_at_trx_commit=3;
 CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
 INSERT INTO t1 VALUES (0);
 SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";

=== modified file 'mysql-test/suite/innodb/r/group_commit_binlog_pos_no_optimize_thread.result'
--- a/mysql-test/suite/innodb/r/group_commit_binlog_pos_no_optimize_thread.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/innodb/r/group_commit_binlog_pos_no_optimize_thread.result	2012-09-13 12:31:29 +0000
@@ -1,3 +1,4 @@
+SET GLOBAL innodb_flush_log_at_trx_commit=3;
 CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
 INSERT INTO t1 VALUES (0);
 SET DEBUG_SYNC= "commit_after_get_LOCK_log SIGNAL con1_waiting WAIT_FOR con3_queued";

=== modified file 'mysql-test/suite/innodb/t/group_commit_binlog_pos.test'
--- a/mysql-test/suite/innodb/t/group_commit_binlog_pos.test	2012-02-07 15:22:36 +0000
+++ b/mysql-test/suite/innodb/t/group_commit_binlog_pos.test	2012-09-13 12:31:29 +0000
@@ -17,6 +17,19 @@
 # Test that we get the correct position when we group commit several
 # transactions together.
 
+# What we really want to test here is what happens when a group of
+# transactions get written only partially to disk inside InnoDB before
+# the crash. But that is hard to test in mysql-test-run automated
+# tests. Instead, we use debug_sync to tightly control when each
+# transaction is written to the redo log. We also set
+# innodb_flush_log_at_trx_commit=3 so that we can write out
+# transactions individually; with
+# innodb_flush_log_at_trx_commit=1, all commits would instead be written
+# together, as part of a commit_checkpoint.
+# (Note that we do not have to restore innodb_flush_log_at_trx_commit, as
+# we crash the server).
+SET GLOBAL innodb_flush_log_at_trx_commit=3;
+
 CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
 INSERT INTO t1 VALUES (0);
 

=== modified file 'mysql-test/suite/innodb/t/group_commit_binlog_pos_no_optimize_thread.test'
--- a/mysql-test/suite/innodb/t/group_commit_binlog_pos_no_optimize_thread.test	2012-02-07 15:22:36 +0000
+++ b/mysql-test/suite/innodb/t/group_commit_binlog_pos_no_optimize_thread.test	2012-09-13 12:31:29 +0000
@@ -17,6 +17,19 @@
 # Test that we get the correct position when we group commit several
 # transactions together.
 
+# What we really want to test here is what happens when a group of
+# transactions get written only partially to disk inside InnoDB before
+# the crash. But that is hard to test in mysql-test-run automated
+# tests. Instead, we use debug_sync to tightly control when each
+# transaction is written to the redo log. We also set
+# innodb_flush_log_at_trx_commit=3 so that we can write out
+# transactions individually; with
+# innodb_flush_log_at_trx_commit=1, all commits would instead be written
+# together, as part of a commit_checkpoint.
+# (Note that we do not have to restore innodb_flush_log_at_trx_commit, as
+# we crash the server).
+SET GLOBAL innodb_flush_log_at_trx_commit=3;
+
 CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
 INSERT INTO t1 VALUES (0);
 

=== modified file 'mysql-test/suite/rpl/r/rpl_checksum.result'
--- a/mysql-test/suite/rpl/r/rpl_checksum.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/rpl/r/rpl_checksum.result	2012-09-13 12:31:29 +0000
@@ -71,7 +71,7 @@ insert into t1 values (1) /* will not be
 set @@global.debug_dbug='d,simulate_slave_unaware_checksum';
 start slave;
 include/wait_for_slave_io_error.inc [errno=1236]
-Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 286, the last event read from 'master-bin.000010' at 246, the last byte read from 'master-bin.000010' at 246.''
+Last_IO_Error = 'Got fatal error 1236 from master when reading data from binary log: 'Slave can not handle replication events with the checksum that master is configured to log; the first event 'master-bin.000009' at 326, the last event read from 'master-bin.000010' at 246, the last byte read from 'master-bin.000010' at 246.''
 select count(*) as zero from t1;
 zero
 0

=== modified file 'mysql-test/suite/rpl/r/rpl_insert_delayed,stmt.rdiff'
--- a/mysql-test/suite/rpl/r/rpl_insert_delayed,stmt.rdiff	2012-02-06 22:16:21 +0000
+++ b/mysql-test/suite/rpl/r/rpl_insert_delayed,stmt.rdiff	2012-09-13 12:31:29 +0000
@@ -36,7 +36,7 @@
  a
  1
  On slave
-+show binlog events in 'slave-bin.000002' from <binlog_start> limit 1,6;
++show binlog events in 'slave-bin.000002' from <binlog_start> limit 2,6;
 +Log_name       Pos     Event_type      Server_id       End_log_pos     Info
 +slave-bin.000002       #       Query   #       #       BEGIN
 +slave-bin.000002       #       Query   #       #       use `test`; INSERT  IGNORE INTO t1 VALUES(1)

=== modified file 'mysql-test/suite/rpl/r/rpl_mariadb_slave_capability.result'
--- a/mysql-test/suite/rpl/r/rpl_mariadb_slave_capability.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/rpl/r/rpl_mariadb_slave_capability.result	2012-09-13 12:31:29 +0000
@@ -54,7 +54,7 @@ master-bin.000002	#	Query	#	#	COMMIT
 SELECT * FROM t1;
 a
 2
-show relaylog events in 'slave-relay-bin.000005' from <binlog_start> limit 4,5;
+show relaylog events in 'slave-relay-bin.000005' from <binlog_start> limit 5,5;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 slave-relay-bin.000005  #       Query   #       #       BEGIN
 slave-relay-bin.000005  #       Query   #       #       # Dummy ev

=== modified file 'mysql-test/suite/rpl/r/rpl_row_log.result'
--- a/mysql-test/suite/rpl/r/rpl_row_log.result	2010-12-19 17:15:12 +0000
+++ b/mysql-test/suite/rpl/r/rpl_row_log.result	2012-09-13 12:31:29 +0000
@@ -205,6 +205,7 @@ master-bin.000001	#	Query	#	#	COMMIT
 master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=4
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       use `test`; create table t3 (a int)ENGINE=MyISAM
 master-bin.000002       #       Query   #       #       use `test`; create table t2 (n int)ENGINE=MyISAM
 master-bin.000002       #       Query   #       #       BEGIN
@@ -236,6 +237,7 @@ slave-bin.000001	#	Query	#	#	use `test`;
 slave-bin.000001        #       Rotate  #       #       slave-bin.000002;pos=4
 show binlog events in 'slave-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+slave-bin.000002        #       Binlog_checkpoint       #       #       slave-bin.000002
 slave-bin.000002        #       Query   #       #       use `test`; create table t2 (n int)ENGINE=MyISAM
 slave-bin.000002        #       Query   #       #       BEGIN
 slave-bin.000002        #       Table_map       #       #       table_id: # (test.t2)

=== modified file 'mysql-test/suite/rpl/r/rpl_row_log_innodb.result'
--- a/mysql-test/suite/rpl/r/rpl_row_log_innodb.result	2010-12-19 17:15:12 +0000
+++ b/mysql-test/suite/rpl/r/rpl_row_log_innodb.result	2012-09-13 12:31:29 +0000
@@ -205,6 +205,7 @@ master-bin.000001	#	Xid	#	#	COMMIT /* XI
 master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=4
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       use `test`; create table t3 (a int)ENGINE=InnoDB
 master-bin.000002       #       Query   #       #       use `test`; create table t2 (n int)ENGINE=InnoDB
 master-bin.000002       #       Query   #       #       BEGIN
@@ -236,6 +237,7 @@ slave-bin.000001	#	Query	#	#	use `test`;
 slave-bin.000001        #       Rotate  #       #       slave-bin.000002;pos=4
 show binlog events in 'slave-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+slave-bin.000002        #       Binlog_checkpoint       #       #       slave-bin.000002
 slave-bin.000002        #       Query   #       #       use `test`; create table t2 (n int)ENGINE=InnoDB
 slave-bin.000002        #       Query   #       #       BEGIN
 slave-bin.000002        #       Table_map       #       #       table_id: # (test.t2)

=== modified file 'mysql-test/suite/rpl/r/rpl_row_show_relaylog_events.result'
--- a/mysql-test/suite/rpl/r/rpl_row_show_relaylog_events.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/rpl/r/rpl_row_show_relaylog_events.result	2012-09-13 12:31:29 +0000
@@ -128,14 +128,16 @@ DROP TABLE t1;
 ******** [master] SHOW BINLOG EVENTS IN <FILE> ********
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [master] SHOW BINLOG EVENTS IN <FILE> LIMIT 1 ********
 show binlog events in 'master-bin.000002' from <binlog_start> limit 1;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 ******** [master] SHOW BINLOG EVENTS IN <FILE> LIMIT 1,3 ********
 show binlog events in 'master-bin.000002' from <binlog_start> limit 1,3;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [master] SHOW BINLOG EVENTS  ********
 show binlog events from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
@@ -156,14 +158,16 @@ master-bin.000001	#	Rotate	#	#	master-bi
 ******** [slave] SHOW BINLOG EVENTS IN <FILE> ********
 show binlog events in 'slave-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+slave-bin.000002        #       Binlog_checkpoint       #       #       slave-bin.000002
 slave-bin.000002        #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW BINLOG EVENTS IN <FILE> LIMIT 1 ********
 show binlog events in 'slave-bin.000002' from <binlog_start> limit 1;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-slave-bin.000002        #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
+slave-bin.000002        #       Binlog_checkpoint       #       #       slave-bin.000002
 ******** [slave] SHOW BINLOG EVENTS IN <FILE> LIMIT 1,3 ********
 show binlog events in 'slave-bin.000002' from <binlog_start> limit 1,3;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+slave-bin.000002        #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW BINLOG EVENTS  ********
 show binlog events from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
@@ -186,6 +190,7 @@ show relaylog events in 'slave-relay-bin
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 slave-relay-bin.000006  #       Rotate  #       #       master-bin.000002;pos=4
 slave-relay-bin.000006  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000001
 slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000002
 slave-relay-bin.000006  #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW RELAYLOG EVENTS IN <FILE> LIMIT 1 ********
@@ -196,8 +201,8 @@ slave-relay-bin.000006	#	Rotate	#	#	mast
 show relaylog events in 'slave-relay-bin.000006' from <binlog_start> limit 1,3;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 slave-relay-bin.000006  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000001
 slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000002
-slave-relay-bin.000006  #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW RELAYLOG EVENTS  ********
 show relaylog events from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info

=== modified file 'mysql-test/suite/rpl/r/rpl_stm_log.result'
--- a/mysql-test/suite/rpl/r/rpl_stm_log.result	2010-12-19 17:15:12 +0000
+++ b/mysql-test/suite/rpl/r/rpl_stm_log.result	2012-09-13 12:31:29 +0000
@@ -205,6 +205,7 @@ master-bin.000001	#	Query	#	#	COMMIT
 master-bin.000001       #       Rotate  #       #       master-bin.000002;pos=4
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       use `test`; create table t3 (a int)ENGINE=MyISAM
 master-bin.000002       #       Query   #       #       use `test`; create table t2 (n int)ENGINE=MyISAM
 master-bin.000002       #       Query   #       #       BEGIN
@@ -235,6 +236,7 @@ slave-bin.000001	#	Query	#	#	use `test`;
 slave-bin.000001        #       Rotate  #       #       slave-bin.000002;pos=4
 show binlog events in 'slave-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+slave-bin.000002        #       Binlog_checkpoint       #       #       slave-bin.000002
 slave-bin.000002        #       Query   #       #       use `test`; create table t2 (n int)ENGINE=MyISAM
 slave-bin.000002        #       Query   #       #       BEGIN
 slave-bin.000002        #       Query   #       #       use `test`; insert into t2 values (1)

=== modified file 'mysql-test/suite/rpl/r/rpl_stm_mix_show_relaylog_events.result'
--- a/mysql-test/suite/rpl/r/rpl_stm_mix_show_relaylog_events.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/rpl/r/rpl_stm_mix_show_relaylog_events.result	2012-09-13 12:31:29 +0000
@@ -113,14 +113,16 @@ DROP TABLE t1;
 ******** [master] SHOW BINLOG EVENTS IN <FILE> ********
 show binlog events in 'master-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [master] SHOW BINLOG EVENTS IN <FILE> LIMIT 1 ********
 show binlog events in 'master-bin.000002' from <binlog_start> limit 1;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
+master-bin.000002       #       Binlog_checkpoint       #       #       master-bin.000002
 ******** [master] SHOW BINLOG EVENTS IN <FILE> LIMIT 1,3 ********
 show binlog events in 'master-bin.000002' from <binlog_start> limit 1,3;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+master-bin.000002       #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [master] SHOW BINLOG EVENTS  ********
 show binlog events from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
@@ -138,14 +140,16 @@ master-bin.000001	#	Rotate	#	#	master-bi
 ******** [slave] SHOW BINLOG EVENTS IN <FILE> ********
 show binlog events in 'slave-bin.000002' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+slave-bin.000002        #       Binlog_checkpoint       #       #       slave-bin.000002
 slave-bin.000002        #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW BINLOG EVENTS IN <FILE> LIMIT 1 ********
 show binlog events in 'slave-bin.000002' from <binlog_start> limit 1;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
-slave-bin.000002        #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
+slave-bin.000002        #       Binlog_checkpoint       #       #       slave-bin.000002
 ******** [slave] SHOW BINLOG EVENTS IN <FILE> LIMIT 1,3 ********
 show binlog events in 'slave-bin.000002' from <binlog_start> limit 1,3;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
+slave-bin.000002        #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW BINLOG EVENTS  ********
 show binlog events from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
@@ -165,6 +169,7 @@ show relaylog events in 'slave-relay-bin
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 slave-relay-bin.000006  #       Rotate  #       #       master-bin.000002;pos=4
 slave-relay-bin.000006  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000001
 slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000002
 slave-relay-bin.000006  #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW RELAYLOG EVENTS IN <FILE> LIMIT 1 ********
@@ -175,8 +180,8 @@ slave-relay-bin.000006	#	Rotate	#	#	mast
 show relaylog events in 'slave-relay-bin.000006' from <binlog_start> limit 1,3;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 slave-relay-bin.000006  #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
+slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000001
 slave-relay-bin.000006  #       Binlog_checkpoint       #       #       master-bin.000002
-slave-relay-bin.000006  #       Query   #       #       use `test`; DROP TABLE `t1` /* generated by server */
 ******** [slave] SHOW RELAYLOG EVENTS  ********
 show relaylog events from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info

=== modified file 'mysql-test/suite/rpl/t/rpl_mariadb_slave_capability.test'
--- a/mysql-test/suite/rpl/t/rpl_mariadb_slave_capability.test	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/rpl/t/rpl_mariadb_slave_capability.test	2012-09-13 12:31:29 +0000
@@ -61,7 +61,7 @@ connection slave;
 SELECT * FROM t1;
 let $binlog_file= query_get_value(SHOW SLAVE STATUS, Relay_Log_File, 1);
 let $binlog_start= 0;
-let $binlog_limit=4,5;
+let $binlog_limit=5,5;
 --source include/show_relaylog_events.inc
 
 --echo # Test that slave which cannot tolerate holes in binlog stream but

=== modified file 'mysql-test/suite/sys_vars/r/innodb_flush_log_at_trx_commit_basic.result'
--- a/mysql-test/suite/sys_vars/r/innodb_flush_log_at_trx_commit_basic.result	2011-07-18 21:04:24 +0000
+++ b/mysql-test/suite/sys_vars/r/innodb_flush_log_at_trx_commit_basic.result	2012-09-13 12:31:29 +0000
@@ -50,7 +50,7 @@ SET @@global.innodb_flush_log_at_trx_com
 Warning 1292    Truncated incorrect innodb_flush_log_at_trx_commit value: '1001'
 SELECT @@global.innodb_flush_log_at_trx_commit;
 @@global.innodb_flush_log_at_trx_commit
-2
+3
 '#----------------------FN_DYNVARS_046_05------------------------#'
 SELECT @@global.innodb_flush_log_at_trx_commit =
 VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
@@ -60,22 +60,22 @@ VARIABLE_VALUE
 1
 SELECT @@global.innodb_flush_log_at_trx_commit;
 @@global.innodb_flush_log_at_trx_commit
-2
+3
 SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_VARIABLES
 WHERE VARIABLE_NAME='innodb_flush_log_at_trx_commit';
 VARIABLE_VALUE
-2
+3
 '#---------------------FN_DYNVARS_046_06-------------------------#'
 SET @@global.innodb_flush_log_at_trx_commit = OFF;
 ERROR 42000: Incorrect argument type to variable 'innodb_flush_log_at_trx_commit'
 SELECT @@global.innodb_flush_log_at_trx_commit;
 @@global.innodb_flush_log_at_trx_commit
-2
+3
 SET @@global.innodb_flush_log_at_trx_commit = ON;
 ERROR 42000: Incorrect argument type to variable 'innodb_flush_log_at_trx_commit'
 SELECT @@global.innodb_flush_log_at_trx_commit;
 @@global.innodb_flush_log_at_trx_commit
-2
+3
 '#---------------------FN_DYNVARS_046_07----------------------#'
 SET @@global.innodb_flush_log_at_trx_commit = TRUE;
 SELECT @@global.innodb_flush_log_at_trx_commit;

=== modified file 'sql/handler.cc'
--- a/sql/handler.cc	2012-06-22 09:46:28 +0000
+++ b/sql/handler.cc	2012-09-13 12:31:29 +0000
@@ -644,6 +644,43 @@ void ha_checkpoint_state(bool disable)
 }
 
 
+struct st_commit_checkpoint_request {
+  void *cookie;
+  void (*pre_hook)(void *);
+};
+
+static my_bool commit_checkpoint_request_handlerton(THD *unused1, plugin_ref plugin,
+                                           void *data)
+{
+  st_commit_checkpoint_request *st= (st_commit_checkpoint_request *)data;
+  handlerton *hton= plugin_data(plugin, handlerton *);
+  if (hton->state == SHOW_OPTION_YES && hton->commit_checkpoint_request)
+  {
+    void *cookie= st->cookie;
+    if (st->pre_hook)
+      (*st->pre_hook)(cookie);
+    (*hton->commit_checkpoint_request)(hton, cookie);
+  }
+  return FALSE;
+}
+
+
+/*
+  Invoke commit_checkpoint_request() in all storage engines that implement it.
+
+  If pre_hook is non-NULL, the hook will be called prior to each invocation.
+*/
+void
+ha_commit_checkpoint_request(void *cookie, void (*pre_hook)(void *))
+{
+  st_commit_checkpoint_request st;
+  st.cookie= cookie;
+  st.pre_hook= pre_hook;
+  plugin_foreach(NULL, commit_checkpoint_request_handlerton,
+                 MYSQL_STORAGE_ENGINE_PLUGIN, &st);
+}
+
+
 
 static my_bool closecon_handlerton(THD *thd, plugin_ref plugin,
                                    void *unused)
@@ -1281,6 +1318,7 @@ int ha_commit_trans(THD *thd, bool all)
     goto done;
   }
 
+  DEBUG_SYNC(thd, "ha_commit_trans_before_log_and_order");
   cookie= tc_log->log_and_order(thd, xid, all, need_prepare_ordered,
                                 need_commit_ordered);
   if (!cookie)
@@ -1778,6 +1816,17 @@ bool mysql_xa_recover(THD *thd)
   DBUG_RETURN(0);
 }
 
+/*
+  Called by engine to notify TC that a new commit checkpoint has been reached.
+  See comments on handlerton method commit_checkpoint_request() for details.
+*/
+void
+commit_checkpoint_notify_ha(handlerton *hton, void *cookie)
+{
+  tc_log->commit_checkpoint_notify(cookie);
+}
+
+
 /**
   @details
   This function should be called when MySQL sends rows of a SELECT result set

=== modified file 'sql/handler.h'
--- a/sql/handler.h	2012-07-16 07:48:03 +0000
+++ b/sql/handler.h	2012-09-13 12:31:29 +0000
@@ -976,6 +976,46 @@ struct handlerton
    int  (*recover)(handlerton *hton, XID *xid_list, uint len);
    int  (*commit_by_xid)(handlerton *hton, XID *xid);
    int  (*rollback_by_xid)(handlerton *hton, XID *xid);
+   /*
+     The commit_checkpoint_request() handlerton method is used to checkpoint
+     the XA recovery process for storage engines that support two-phase
+     commit.
+
+     The method is optional - an engine that does not implement it is expected
+     to work the traditional way, where every commit() durably flushes the
+     transaction to disk in the engine before completion, so XA recovery will
+     no longer be needed for that transaction.
+
+     An engine that does implement commit_checkpoint_request() is also
+     expected to implement commit_ordered(), so that ordering of commits is
+     consistent between 2pc participants. Such engine is no longer required to
+     durably flush to disk transactions in commit(), provided that the
+     transaction has been successfully prepare()d and commit_ordered(); thus
+     potentially saving one fsync() call. (Engine must still durably flush
+     to disk in commit() when no prepare()/commit_ordered() steps took place,
+     at least if durable commits are wanted; this happens eg. if binlog is
+     disabled).
+
+     The TC will periodically (eg. once per binlog rotation) call
+     commit_checkpoint_request(). When this happens, the engine must arrange
+     for all transactions that have completed commit_ordered() to be durably
+     flushed to disk (this does not include transactions that might be in the
+     middle of executing commit_ordered()). When such flush has completed, the
+     engine must call commit_checkpoint_notify_ha(), passing back the opaque
+     "cookie".
+
+     The flush and call of commit_checkpoint_notify_ha() need not happen
+     immediately - it can be scheduled and performed asynchronously (ie. as
+     part of next prepare(), or sync every second, or whatever), but should
+     not be postponed indefinitely. It is however also permissible to do it
+     immediately, before returning from commit_checkpoint_request().
+
+     When commit_checkpoint_notify_ha() is called, the TC will know that the
+     transactions are durably committed, and thus no longer require XA
+     recovery. It uses that to reduce the work needed for any subsequent XA
+     recovery process.
+   */
+   void (*commit_checkpoint_request)(handlerton *hton, void *cookie);
   /*
     "Disable or enable checkpointing internal to the storage engine. This is
     used for FLUSH TABLES WITH READ LOCK AND DISABLE CHECKPOINT to ensure that
@@ -2977,6 +3017,7 @@ void ha_close_connection(THD* thd);
 bool ha_flush_logs(handlerton *db_type);
 void ha_drop_database(char* path);
 void ha_checkpoint_state(bool disable);
+void ha_commit_checkpoint_request(void *cookie, void (*pre_hook)(void *));
 int ha_create_table(THD *thd, const char *path,
                     const char *db, const char *table_name,
                     HA_CREATE_INFO *create_info,
@@ -3057,6 +3098,7 @@ int ha_binlog_end(THD *thd);
 const char *get_canonical_filename(handler *file, const char *path,
                                    char *tmp_path);
 bool mysql_xa_recover(THD *thd);
+void commit_checkpoint_notify_ha(handlerton *hton, void *cookie);
 
 inline const char *table_case_name(HA_CREATE_INFO *info, const char *name)
 {

=== modified file 'sql/log.cc'
--- a/sql/log.cc	2012-06-22 09:46:28 +0000
+++ b/sql/log.cc	2012-09-13 12:31:29 +0000
@@ -479,7 +479,14 @@ class binlog_cache_mngr {
   */
   bool using_xa;
   my_xid xa_xid;
-  ulong cookie;
+  bool need_unlog;
+  /*
+    Id of binlog that transaction was written to; only needed if need_unlog is
+    true.
+  */
+  ulong binlog_id;
+  /* Set if we get an error during commit that must be returned from unlog(). */
+  bool delayed_error;
 
 private:
 
@@ -1678,8 +1685,7 @@ binlog_flush_cache(THD *thd, binlog_cach
       So there is no work to do. Therefore, we will not increment any XID
       count, so we must not decrement any XID count in unlog().
     */
-    if (cache_mngr->using_xa && cache_mngr->xa_xid)
-      cache_mngr->cookie= BINLOG_COOKIE_DUMMY;
+    cache_mngr->need_unlog= 0;
   }
   cache_mngr->reset(using_stmt, using_trx);
 
@@ -2904,16 +2910,16 @@ const char *MYSQL_LOG::generate_name(con
 
 
 MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
-  :current_binlog_id(BINLOG_COOKIE_START), reset_master_pending(false),
+  :reset_master_pending(false),
    bytes_written(0), file_id(1), open_count(1),
-   need_start_event(TRUE),
    group_commit_queue(0), group_commit_queue_busy(FALSE),
    num_commits(0), num_group_commits(0),
    sync_period_ptr(sync_period), sync_counter(0),
    is_relay_log(0), signal_cnt(0),
    checksum_alg_reset(BINLOG_CHECKSUM_ALG_UNDEF),
    relay_log_checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
-   description_event_for_exec(0), description_event_for_queue(0)
+   description_event_for_exec(0), description_event_for_queue(0),
+   current_binlog_id(0)
 {
   /*
     We don't want to initialize locks here as such initialization depends on
@@ -2963,10 +2969,9 @@ void MYSQL_BIN_LOG::cleanup()
 
 
 /* Init binlog-specific vars */
-void MYSQL_BIN_LOG::init(bool no_auto_events_arg, ulong max_size_arg)
+void MYSQL_BIN_LOG::init(ulong max_size_arg)
 {
   DBUG_ENTER("MYSQL_BIN_LOG::init");
-  no_auto_events= no_auto_events_arg;
   max_size= max_size_arg;
   DBUG_PRINT("info",("max_size: %lu", max_size));
   DBUG_VOID_RETURN;
@@ -3070,12 +3075,12 @@ bool MYSQL_BIN_LOG::open(const char *log
                          enum_log_type log_type_arg,
                          const char *new_name,
                          enum cache_type io_cache_type_arg,
-                         bool no_auto_events_arg,
                          ulong max_size_arg,
                          bool null_created_arg,
                          bool need_mutex)
 {
   File file= -1;
+  xid_count_per_binlog *new_xid_list_entry= NULL, *b;
 
   DBUG_ENTER("MYSQL_BIN_LOG::open");
   DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg));
@@ -3131,7 +3136,7 @@ bool MYSQL_BIN_LOG::open(const char *log
     DBUG_RETURN(1);                            /* all warnings issued */
   }
 
-  init(no_auto_events_arg, max_size_arg);
+  init(max_size_arg);
 
   open_count++;
 
@@ -3155,11 +3160,10 @@ bool MYSQL_BIN_LOG::open(const char *log
       write_file_name_to_index_file= 1;
     }
 
-    if (need_start_event && !no_auto_events)
     {
       /*
-        In 4.x we set need_start_event=0 here, but in 5.0 we want a Start event
-        even if this is not the very first binlog.
+        In 4.x we put Start event only in the first binlog. But from 5.0 we
+        want a Start event even if this is not the very first binlog.
       */
       Format_description_log_event s(BINLOG_VERSION);
       /*
@@ -3191,42 +3195,51 @@ bool MYSQL_BIN_LOG::open(const char *log
       {
         char buf[FN_REFLEN];
         /*
-          Put this one into the list of active binlogs.
+          Construct an entry in the binlog_xid_count_list for the new binlog
+          file (we will not link it into the list until we know the new file
+          is successfully created; otherwise we would have to remove it again
+          if creation failed, which gets tricky since other threads may have
+          seen the entry in the meantime - and we do not want to hold
+          LOCK_xid_list for long periods of time).
+
           Write the current binlog checkpoint into the log, so XA recovery will
           know from where to start recovery.
         */
         uint off= dirname_length(log_file_name);
         uint len= strlen(log_file_name) - off;
         char *entry_mem, *name_mem;
-        xid_count_per_binlog *b, *b2;
-        if (!(b = (xid_count_per_binlog *)
+        if (!(new_xid_list_entry = (xid_count_per_binlog *)
               my_multi_malloc(MYF(MY_WME),
                               &entry_mem, sizeof(xid_count_per_binlog),
                               &name_mem, len,
                               NULL)))
           goto err;
         memcpy(name_mem, log_file_name+off, len);
-        b->binlog_name= name_mem;
-        b->binlog_name_len= len;
-        b->xid_count= 0;
-
-        mysql_mutex_lock(&LOCK_xid_list);
-        b->binlog_id= ++current_binlog_id;
+        new_xid_list_entry->binlog_name= name_mem;
+        new_xid_list_entry->binlog_name_len= len;
+        new_xid_list_entry->xid_count= 0;
 
         /*
-          Remove any initial entries with no pending XIDs.
-          Normally this will be done in unlog(), but if there are no
-          transactions with an XA-capable engine at all in a given binlog
-          file, unlog() will never be used and we will remove the entry here.
-        */
-        while ((b2= binlog_xid_count_list.head()) && b2->xid_count == 0)
-          my_free(binlog_xid_count_list.get());
+          Find the name for the Initial binlog checkpoint.
 
-        binlog_xid_count_list.push_back(b);
-        b2= binlog_xid_count_list.head();
-        strmake(buf, b2->binlog_name, b2->binlog_name_len);
+          Normally this will just be the first entry, as we delete entries
+          when their count drops to zero. But we scan the list to handle any
+          corner case, eg. for the first binlog file opened after startup, the
+          list will be empty.
+        */
+        mysql_mutex_lock(&LOCK_xid_list);
+        I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
+        while ((b= it++) && b->xid_count == 0)
+          ;
         mysql_mutex_unlock(&LOCK_xid_list);
+        if (!b)
+          b= new_xid_list_entry;
+        strmake(buf, b->binlog_name, b->binlog_name_len);
         Binlog_checkpoint_log_event ev(buf, len);
+        DBUG_EXECUTE_IF("crash_before_write_checkpoint_event",
+                        flush_io_cache(&log_file);
+                        mysql_file_sync(log_file.file, MYF(MY_WME));
+                        DBUG_SUICIDE(););
         if (ev.write(&log_file))
           goto err;
         bytes_written+= ev.data_written;
@@ -3302,6 +3315,23 @@ bool MYSQL_BIN_LOG::open(const char *log
 #endif
     }
   }
+
+  if (!is_relay_log)
+  {
+    /*
+      Now the file was created successfully, so we can link in the entry for
+      the new binlog file in binlog_xid_count_list.
+    */
+    mysql_mutex_lock(&LOCK_xid_list);
+    ++current_binlog_id;
+    new_xid_list_entry->binlog_id= current_binlog_id;
+    /* Remove any initial entries with no pending XIDs.  */
+    while ((b= binlog_xid_count_list.head()) && b->xid_count == 0)
+      my_free(binlog_xid_count_list.get());
+    binlog_xid_count_list.push_back(new_xid_list_entry);
+    mysql_mutex_unlock(&LOCK_xid_list);
+  }
+
   log_state= LOG_OPENED;
 
 #ifdef HAVE_REPLICATION
@@ -3320,6 +3350,8 @@ bool MYSQL_BIN_LOG::open(const char *log
 Turning logging off for the whole duration of the MySQL server process. \
 To turn it on again: fix the cause, \
 shutdown the MySQL server and restart it.", name, errno);
+  if (new_xid_list_entry)
+    my_free(new_xid_list_entry);
   if (file >= 0)
     mysql_file_close(file, MYF(0));
   close(LOG_CLOSE_INDEX);
@@ -3599,12 +3631,40 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd)
   if (!is_relay_log)
   {
     /*
+      Mark that a RESET MASTER is in progress.
+      This ensures that a binlog checkpoint will not try to write binlog
+      checkpoint events, which would be useless (as we are deleting the binlog
+      anyway) and could deadlock, as we are holding LOCK_log.
+    */
+    mysql_mutex_lock(&LOCK_xid_list);
+    reset_master_pending= true;
+    mysql_mutex_unlock(&LOCK_xid_list);
+
+    /*
       We are going to nuke all binary log files.
-      So first wait until all pending binlog checkpoints have completed.
+      Without binlog, we cannot XA recover prepared-but-not-committed
+      transactions in engines. So force a commit checkpoint first.
+
+      Note that we take and immediately release LOCK_commit_ordered. This has
+      the effect to ensure that any on-going group commit (in
+      trx_group_commit_leader()) has completed before we request the checkpoint,
+      due to the chaining of LOCK_log and LOCK_commit_ordered in that function.
+      (We are holding LOCK_log, so no new group commit can start).
+
+      Without this, it is possible (though perhaps unlikely) that the RESET
+      MASTER could run in-between the write to the binlog and the
+      commit_ordered() in the engine of some transaction, and then a crash
+      later would leave such transaction not recoverable.
     */
+    mysql_mutex_lock(&LOCK_commit_ordered);
+    mysql_mutex_unlock(&LOCK_commit_ordered);
+
+    mark_xids_active(current_binlog_id, 1);
+    do_checkpoint_request(current_binlog_id);
+
+    /* Now wait for all checkpoint requests and pending unlog() to complete. */
     mysql_mutex_lock(&LOCK_xid_list);
     xid_count_per_binlog *b;
-    reset_master_pending= true;
     for (;;)
     {
       I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
@@ -3626,9 +3686,6 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd)
       Now all XIDs are fully flushed to disk, and we are holding LOCK_log so
       no new ones will be written. So we can proceed to delete the logs.
     */
-    while ((b= binlog_xid_count_list.get()))
-      my_free(b);
-    reset_master_pending= false;
     mysql_mutex_unlock(&LOCK_xid_list);
   }
 
@@ -3722,10 +3779,8 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd)
       goto err;
     }
   }
-  if (!thd->slave_thread)
-    need_start_event=1;
   if (!open_index_file(index_file_name, 0, FALSE))
-    if ((error= open(save_name, log_type, 0, io_cache_type, no_auto_events, max_size, 0, FALSE)))
+    if ((error= open(save_name, log_type, 0, io_cache_type, max_size, 0, FALSE)))
       goto err;
   my_free((void *) save_name);
 
@@ -3733,6 +3788,31 @@ bool MYSQL_BIN_LOG::reset_logs(THD* thd)
   if (error == 1)
     name= const_cast<char*>(save_name);
   mysql_mutex_unlock(&LOCK_thread_count);
+
+  if (!is_relay_log)
+  {
+    xid_count_per_binlog *b;
+    /*
+      Remove all entries in the xid_count list except the last.
+      Normally we will just be deleting all the entries that we waited for to
+      drop to zero above. But if we fail during RESET MASTER for some reason
+      then we will not have created any new log file, and we may keep the last
+      of the old entries.
+    */
+    mysql_mutex_lock(&LOCK_xid_list);
+    for (;;)
+    {
+      b= binlog_xid_count_list.head();
+      DBUG_ASSERT(b /* List can never become empty. */);
+      if (b->binlog_id == current_binlog_id)
+        break;
+      DBUG_ASSERT(b->xid_count == 0);
+      my_free(binlog_xid_count_list.get());
+    }
+    reset_master_pending= false;
+    mysql_mutex_unlock(&LOCK_xid_list);
+  }
+
   mysql_mutex_unlock(&LOCK_index);
   mysql_mutex_unlock(&LOCK_log);
   DBUG_RETURN(error);
@@ -4476,7 +4556,6 @@ int MYSQL_BIN_LOG::new_file_impl(bool ne
 
   if (log_type == LOG_BIN)
   {
-    if (!no_auto_events)
     {
       /*
         We log the whole file name for log file as the user may decide
@@ -4551,7 +4630,7 @@ int MYSQL_BIN_LOG::new_file_impl(bool ne
     /* reopen the binary log file. */
     file_to_open= new_name_ptr;
     error= open(old_name, log_type, new_name_ptr, io_cache_type,
-                no_auto_events, max_size, 1, FALSE);
+                max_size, 1, FALSE);
   }
 
   /* handle reopening errors */
@@ -5176,6 +5255,8 @@ bool MYSQL_BIN_LOG::write(Log_event *eve
   bool is_trans_cache= FALSE;
   bool using_trans= event_info->use_trans_cache();
   bool direct= event_info->use_direct_logging();
+  ulong prev_binlog_id;
+  LINT_INIT(prev_binlog_id);
 
   if (thd->binlog_evt_union.do_union)
   {
@@ -5227,6 +5308,7 @@ bool MYSQL_BIN_LOG::write(Log_event *eve
       file= &log_file;
       my_org_b_tell= my_b_tell(file);
       mysql_mutex_lock(&LOCK_log);
+      prev_binlog_id= current_binlog_id;
     }
     else
     {
@@ -5372,7 +5454,7 @@ bool MYSQL_BIN_LOG::write(Log_event *eve
       mysql_mutex_unlock(&LOCK_log);
 
       if (check_purge)
-        purge();
+        checkpoint_and_purge(prev_binlog_id);
     }
 
     if (error)
@@ -5457,6 +5539,64 @@ bool general_log_write(THD *thd, enum en
   return FALSE;
 }
 
+
+/*
+  I would like to make this function static, but this causes compiler warnings
+  when it is declared as friend function in log.h.
+*/
+void
+binlog_checkpoint_callback(void *cookie)
+{
+  MYSQL_BIN_LOG::xid_count_per_binlog *entry=
+    (MYSQL_BIN_LOG::xid_count_per_binlog *)cookie;
+  /*
+    For every supporting engine, we increment the xid_count and issue a
+    commit_checkpoint_request(). Then we can count when all
+    commit_checkpoint_notify() callbacks have occurred, and then log a new
+    binlog checkpoint event.
+  */
+  mysql_bin_log.mark_xids_active(entry->binlog_id, 1);
+}
+
+
+/*
+  Request a commit checkpoint from each supporting engine.
+  This must be called after each binlog rotate, and after LOCK_log has been
+  released. The xid_count value in the xid_count_per_binlog entry was
+  incremented by 1 and will be decremented in this function; this ensures
+  that the entry will not go away early despite LOCK_log not being held.
+*/
+void
+MYSQL_BIN_LOG::do_checkpoint_request(ulong binlog_id)
+{
+  xid_count_per_binlog *entry;
+
+  /*
+    Find the binlog entry, and invoke commit_checkpoint_request() on it in
+    each supporting storage engine.
+  */
+  mysql_mutex_lock(&LOCK_xid_list);
+  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
+  do {
+    entry= it++;
+    DBUG_ASSERT(entry /* binlog_id is always somewhere in the list. */);
+  } while (entry->binlog_id != binlog_id);
+  mysql_mutex_unlock(&LOCK_xid_list);
+
+  ha_commit_checkpoint_request(entry, binlog_checkpoint_callback);
+  /*
+    When we rotated the binlog, we incremented xid_count to make sure the
+    entry would not go away until this point, where we have done all necessary
+    commit_checkpoint_request() calls.
+    So now we can (and must) decrease the count - when it reaches zero, we
+    will know that both all pending unlog() and all pending
+    commit_checkpoint_notify() calls are done, and we can log a new binlog
+    checkpoint.
+  */
+  mark_xid_done(binlog_id, true);
+}
+
+
 /**
   The method executes rotation when LOCK_log is already acquired
   by the caller.
@@ -5465,6 +5605,15 @@ bool general_log_write(THD *thd, enum en
   @param check_purge   is set to true if rotation took place
 
   @note
+    Caller _must_ check the check_purge variable. If this is set, it means
+    that the binlog was rotated, and caller _must_ ensure that
+    do_checkpoint_request() is called later with the binlog_id of the rotated
+    binlog file. The call to do_checkpoint_request() must happen after
+    LOCK_log is released (which is why we cannot simply do it here).
+    Usually, checkpoint_and_purge() is appropriate, as it will both handle
+    the checkpointing and any needed purging of old logs.
+
+  @note
     If rotation fails, for instance the server was unable 
     to create a new log file, we still try to write an 
     incident event to the current log.
@@ -5482,7 +5631,27 @@ int MYSQL_BIN_LOG::rotate(bool force_rot
 
   if (force_rotate || (my_b_tell(&log_file) >= (my_off_t) max_size))
   {
+    ulong binlog_id= current_binlog_id;
+    /*
+      We rotate the binlog, so we need to start a commit checkpoint in all
+      supporting engines - when it finishes, we can log a new binlog checkpoint
+      event.
+
+      But we cannot start the checkpoint here - there could be a group commit
+      still in progress which needs to be included in the checkpoint, and
+      besides we do not want to do the (possibly expensive) checkpoint while
+      LOCK_log is held.
+
+      On the other hand, we must be sure that the xid_count entry for the
+      previous log does not go away until we start the checkpoint - which it
+      could do as it is no longer the most recent. So we increment xid_count
+      (to count the pending checkpoint request) - this will fix the entry in
+      place until we decrement again in do_checkpoint_request().
+    */
+    mark_xids_active(binlog_id, 1);
+
     if ((error= new_file_without_locking()))
+    {
       /** 
          Be conservative... There are possible lost events (eg, 
          failing to log the Execute_load_query_log_event
@@ -5495,7 +5664,14 @@ int MYSQL_BIN_LOG::rotate(bool force_rot
       if (!write_incident_already_locked(current_thd))
         flush_and_sync(0);
 
-    *check_purge= true;
+      /*
+        We failed to rotate - so we have to decrement the xid_count back that
+        we incremented before attempting the rotate.
+      */
+      mark_xid_done(binlog_id, false);
+    }
+    else
+      *check_purge= true;
   }
   DBUG_RETURN(error);
 }
@@ -5523,6 +5699,13 @@ void MYSQL_BIN_LOG::purge()
 #endif
 }
 
+
+void MYSQL_BIN_LOG::checkpoint_and_purge(ulong binlog_id)
+{
+  do_checkpoint_request(binlog_id);
+  purge();
+}
+
 /**
   The method is a shortcut of @c rotate() and @c purge().
   LOCK_log is acquired prior to rotate and is released after it.
@@ -5535,11 +5718,13 @@ void MYSQL_BIN_LOG::purge()
 int MYSQL_BIN_LOG::rotate_and_purge(bool force_rotate)
 {
   int error= 0;
+  ulong prev_binlog_id;
   DBUG_ENTER("MYSQL_BIN_LOG::rotate_and_purge");
   bool check_purge= false;
 
   //todo: fix the macro def and restore safe_mutex_assert_not_owner(&LOCK_log);
   mysql_mutex_lock(&LOCK_log);
+  prev_binlog_id= current_binlog_id;
   if ((error= rotate(force_rotate, &check_purge)))
     check_purge= false;
   /*
@@ -5549,7 +5734,7 @@ int MYSQL_BIN_LOG::rotate_and_purge(bool
   mysql_mutex_unlock(&LOCK_log);
 
   if (check_purge)
-    purge();
+    checkpoint_and_purge(prev_binlog_id);
 
   DBUG_RETURN(error);
 }
@@ -5880,11 +6065,13 @@ bool MYSQL_BIN_LOG::write_incident(THD *
   uint error= 0;
   my_off_t offset;
   bool check_purge= false;
+  ulong prev_binlog_id;
   DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
 
   mysql_mutex_lock(&LOCK_log);
   if (likely(is_open()))
   {
+    prev_binlog_id= current_binlog_id;
     if (!(error= write_incident_already_locked(thd)) &&
         !(error= flush_and_sync(0)))
     {
@@ -5904,7 +6091,7 @@ bool MYSQL_BIN_LOG::write_incident(THD *
     mysql_mutex_unlock(&LOCK_log);
 
     if (check_purge)
-      purge();
+      checkpoint_and_purge(prev_binlog_id);
   }
 
   DBUG_RETURN(error);
@@ -5914,6 +6101,7 @@ void
 MYSQL_BIN_LOG::write_binlog_checkpoint_event_already_locked(const char *name,
                                                             uint len)
 {
+  my_off_t offset;
   Binlog_checkpoint_log_event ev(name, len);
   /*
     Note that we must sync the binlog checkpoint to disk.
@@ -5922,22 +6110,29 @@ MYSQL_BIN_LOG::write_binlog_checkpoint_e
   */
   if (!ev.write(&log_file) && !flush_and_sync(0))
   {
-    bool check_purge= false;
     signal_update();
-    rotate(false, &check_purge);
-    if (check_purge)
-      purge();
-    return;
+  }
+  else
+  {
+    /*
+      If we fail to write the checkpoint event, something is probably really
+      bad with the binlog. We complain in the error log.
+
+      Note that failure to write binlog checkpoint does not compromise the
+      ability to do crash recovery - crash recovery will just have to scan a
+      bit more of the binlog than strictly necessary.
+    */
+    sql_print_error("Failed to write binlog checkpoint event to binary log\n");
   }
 
+  offset= my_b_tell(&log_file);
   /*
-    If we fail to write the checkpoint event, something is probably really
-    bad with the binlog. We complain in the error log.
-    Note that failure to write binlog checkpoint does not compromise the
-    ability to do crash recovery - crash recovery will just have to scan a
-    bit more of the binlog than strictly necessary.
+    Take mutex to protect against a reader seeing partial writes of 64-bit
+    offset on 32-bit CPUs.
   */
-  sql_print_error("Failed to write binlog checkpoint event to binary log\n");
+  mysql_mutex_lock(&LOCK_commit_ordered);
+  last_commit_pos_offset= offset;
+  mysql_mutex_unlock(&LOCK_commit_ordered);
 }
 
 
@@ -5973,6 +6168,7 @@ MYSQL_BIN_LOG::write_transaction_to_binl
                                            bool using_trx_cache)
 {
   group_commit_entry entry;
+  Ha_trx_info *ha_info;
   DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
 
   entry.thd= thd;
@@ -5981,6 +6177,15 @@ MYSQL_BIN_LOG::write_transaction_to_binl
   entry.all= all;
   entry.using_stmt_cache= using_stmt_cache;
   entry.using_trx_cache= using_trx_cache;
+  entry.need_unlog= false;
+  ha_info= all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
+  /* Scan all participants (not just the first); stop at the first hit. */
+  for (; ha_info && !entry.need_unlog; ha_info= ha_info->next())
+  {
+    if (ha_info->is_started() && ha_info->ht() != binlog_hton &&
+        !ha_info->ht()->commit_checkpoint_request)
+      entry.need_unlog= true;
+  }
 
   /*
     Log "BEGIN" at the beginning of every transaction.  Here, a transaction is
@@ -6069,6 +6274,18 @@ MYSQL_BIN_LOG::write_transaction_to_binl
     {
       next->thd->signal_wakeup_ready();
     }
+    else
+    {
+      /*
+        If we rotated the binlog, and if we are using the unoptimized thread
+        scheduling where every thread runs its own commit_ordered(), then we
+        must do the commit checkpoint and log purge here, after all
+        commit_ordered() calls have finished, and locks have been released.
+      */
+      if (entry->check_purge)
+        checkpoint_and_purge(entry->binlog_id);
+    }
+
   }
 
   if (likely(!entry->error))
@@ -6099,8 +6316,9 @@ MYSQL_BIN_LOG::write_transaction_to_binl
     we need to mark it as not needed for recovery (unlog() is not called
     for a transaction if log_xid() fails).
   */
-  if (entry->cache_mngr->using_xa && entry->cache_mngr->xa_xid)
-    mark_xid_done(entry->cache_mngr->cookie);
+  if (entry->cache_mngr->using_xa && entry->cache_mngr->xa_xid &&
+      entry->cache_mngr->need_unlog)
+    mark_xid_done(entry->cache_mngr->binlog_id, true);
 
   return 1;
 }
@@ -6120,10 +6338,12 @@ MYSQL_BIN_LOG::trx_group_commit_leader(g
 {
   uint xid_count= 0;
   my_off_t UNINIT_VAR(commit_offset);
-  group_commit_entry *current;
+  group_commit_entry *current, *last_in_queue;
   group_commit_entry *queue= NULL;
   bool check_purge= false;
+  ulong binlog_id;
   DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
+  LINT_INIT(binlog_id);
 
   DBUG_ASSERT(is_open());
   if (likely(is_open()))                       // Should always be true
@@ -6134,6 +6354,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(g
     */
     mysql_mutex_lock(&LOCK_log);
     DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");
+    binlog_id= current_binlog_id;
 
     mysql_mutex_lock(&LOCK_prepare_ordered);
     current= group_commit_queue;
@@ -6141,6 +6362,7 @@ MYSQL_BIN_LOG::trx_group_commit_leader(g
     mysql_mutex_unlock(&LOCK_prepare_ordered);
 
     /* As the queue is in reverse order of entering, reverse it. */
+    last_in_queue= current;
     while (current)
     {
       group_commit_entry *next= current->next;
@@ -6180,8 +6402,22 @@ MYSQL_BIN_LOG::trx_group_commit_leader(g
       cache_mngr->last_commit_pos_offset= commit_offset;
       if (cache_mngr->using_xa && cache_mngr->xa_xid)
       {
-        xid_count++;
-        cache_mngr->cookie= current_binlog_id;
+        /*
+          If all storage engines support commit_checkpoint_request(), then we
+          do not need to keep track of when this XID is durably committed.
+          Instead we will just ask the storage engine to durably commit all its
+          XIDs when we rotate a binlog file.
+        */
+        if (current->need_unlog)
+        {
+          xid_count++;
+          cache_mngr->need_unlog= true;
+          cache_mngr->binlog_id= binlog_id;
+        }
+        else
+          cache_mngr->need_unlog= false;
+
+        cache_mngr->delayed_error= false;
       }
     }
 
@@ -6232,21 +6468,27 @@ MYSQL_BIN_LOG::trx_group_commit_leader(g
     */
     if (xid_count > 0)
     {
-      mark_xids_active(current_binlog_id, xid_count);
+      mark_xids_active(binlog_id, xid_count);
     }
-    else
+
+    if (rotate(false, &check_purge))
     {
-      if (rotate(false, &check_purge))
-      {
-        /*
-          If we fail to rotate, which thread should get the error?
-          We give the error to the leader, as any my_error() thrown inside
-          rotate() will have been registered for the leader THD.
-        */
-        leader->error= ER_ERROR_ON_WRITE;
-        leader->commit_errno= errno;
-        check_purge= false;
-      }
+      /*
+        If we fail to rotate, which thread should get the error?
+        We give the error to the leader, as any my_error() thrown inside
+        rotate() will have been registered for the leader THD.
+
+        However we must not return error from here - that would cause
+        ha_commit_trans() to abort and rollback the transaction, which would
+        leave an inconsistent state with the transaction committed in the
+        binlog but rolled back in the engine.
+
+        Instead set a flag so that we can return error later, from unlog(),
+        when the transaction has been safely committed in the engine.
+      */
+      leader->cache_mngr->delayed_error= true;
+      my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, errno);
+      check_purge= false;
     }
   }
 
@@ -6278,6 +6520,15 @@ MYSQL_BIN_LOG::trx_group_commit_leader(g
       mysql_cond_wait(&COND_queue_busy, &LOCK_commit_ordered);
     group_commit_queue_busy= TRUE;
 
+    /*
+      Set these so parent can run checkpoint_and_purge() in last thread.
+      (When using optimized thread scheduling, we run checkpoint_and_purge()
+      in this function, so parent does not need to and we need not set these
+      values).
+    */
+    last_in_queue->check_purge= check_purge;
+    last_in_queue->binlog_id= binlog_id;
+
     /* Note that we return with LOCK_commit_ordered locked! */
     DBUG_VOID_RETURN;
   }
@@ -6308,9 +6559,10 @@ MYSQL_BIN_LOG::trx_group_commit_leader(g
   }
   DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
   mysql_mutex_unlock(&LOCK_commit_ordered);
+  DEBUG_SYNC(leader->thd, "commit_after_group_release_commit_ordered");
 
   if (check_purge)
-    purge();
+    checkpoint_and_purge(binlog_id);
 
   DBUG_VOID_RETURN;
 }
@@ -6470,7 +6722,7 @@ void MYSQL_BIN_LOG::close(uint exiting)
   if (log_state == LOG_OPENED)
   {
 #ifdef HAVE_REPLICATION
-    if (log_type == LOG_BIN && !no_auto_events &&
+    if (log_type == LOG_BIN &&
         (exiting & LOG_CLOSE_STOP_EVENT))
     {
       Stop_log_event s;
@@ -7104,6 +7356,8 @@ int TC_LOG_MMAP::open(const char *opt_na
   mysql_mutex_init(key_LOCK_sync, &LOCK_sync, MY_MUTEX_INIT_FAST);
   mysql_mutex_init(key_LOCK_active, &LOCK_active, MY_MUTEX_INIT_FAST);
   mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
+  mysql_mutex_init(key_LOCK_pending_checkpoint, &LOCK_pending_checkpoint,
+                   MY_MUTEX_INIT_FAST);
   mysql_cond_init(key_COND_active, &COND_active, 0);
   mysql_cond_init(key_COND_pool, &COND_pool, 0);
   mysql_cond_init(key_TC_LOG_MMAP_COND_queue_busy, &COND_queue_busy, 0);
@@ -7356,17 +7610,93 @@ int TC_LOG_MMAP::sync()
   return err;
 }
 
+static void  /* Passed to ha_commit_checkpoint_request() by unlog(). */
+mmap_do_checkpoint_callback(void *data)  /* data: a pending_cookies batch */
+{
+  TC_LOG_MMAP::pending_cookies *pending=
+    static_cast<TC_LOG_MMAP::pending_cookies *>(data);
+  ++pending->pending_count;  /* taken without LOCK_pending_checkpoint */
+}
+
+int TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
+{
+  pending_cookies *full_buffer= NULL;
+  DBUG_ASSERT(*(my_xid *)(data+cookie) == xid);
+
+  /*
+    Do not delete the entry immediately, as there may be participating storage
+    engines which implement commit_checkpoint_request(), and thus have not yet
+    flushed the commit durably to disk.
+    Instead queue the cookie in pending_checkpoint. When that batch is full we
+    request a checkpoint from all engines and delete the whole batch at once.
+    Returns 0, or 1 if the batch buffer cannot be allocated.
+  */
+  mysql_mutex_lock(&LOCK_pending_checkpoint);
+  if (pending_checkpoint == NULL)
+  {
+    uint32 size= sizeof(*pending_checkpoint);
+    if (!(pending_checkpoint=
+          (pending_cookies *)my_malloc(size, MYF(MY_ZEROFILL))))
+    {
+      my_error(ER_OUTOFMEMORY, MYF(0), size);
+      mysql_mutex_unlock(&LOCK_pending_checkpoint);
+      return 1;                       /* cookie was not queued */
+    }
+  }
+
+  pending_checkpoint->cookies[pending_checkpoint->count++]= cookie;
+  if (pending_checkpoint->count == sizeof(pending_checkpoint->cookies) /
+      sizeof(pending_checkpoint->cookies[0]))
+  {
+    full_buffer= pending_checkpoint;  /* detach the full batch ... */
+    pending_checkpoint= NULL;         /* ... next unlog() allocates a new one */
+  }
+  mysql_mutex_unlock(&LOCK_pending_checkpoint);
+
+  if (full_buffer)
+  {
+    /*
+      Take one reference of our own before asking the engines and drop it
+      with the notify below; the batch is then deleted even when no engine
+      at all supports commit_checkpoint_request.
+    */
+    ++full_buffer->pending_count;
+    ha_commit_checkpoint_request(full_buffer, mmap_do_checkpoint_callback);
+    commit_checkpoint_notify(full_buffer);
+  }
+  return 0;
+}
+
+/* Drop one reference on a full batch; the last one deletes its entries. */
+void
+TC_LOG_MMAP::commit_checkpoint_notify(void *cookie)
+{
+  uint count;
+  pending_cookies *pending= static_cast<pending_cookies *>(cookie);
+  mysql_mutex_lock(&LOCK_pending_checkpoint);
+  DBUG_ASSERT(pending->pending_count > 0);
+  count= --pending->pending_count;
+  mysql_mutex_unlock(&LOCK_pending_checkpoint);
+  if (count == 0)                     /* we held the last reference */
+  {
+    uint i;
+    for (i= 0; i < sizeof(pending->cookies)/sizeof(pending->cookies[0]); ++i)
+      delete_entry(pending->cookies[i]);
+    my_free(pending);                 /* the batch itself is released here */
+  }
+}
+
+
 /**
   erase xid from the page, update page free space counters/pointers.
   cookie points directly to the memory where xid was logged.
 */
 
-int TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
+int TC_LOG_MMAP::delete_entry(ulong cookie)
 {
   PAGE *p=pages+(cookie/tc_log_page_size);
   my_xid *x=(my_xid *)(data+cookie);
 
-  DBUG_ASSERT(*x == xid);
   DBUG_ASSERT(x >= p->start && x < p->end);
 
   mysql_mutex_lock(&p->lock);
@@ -7390,6 +7720,7 @@ void TC_LOG_MMAP::close()
     mysql_mutex_destroy(&LOCK_sync);
     mysql_mutex_destroy(&LOCK_active);
     mysql_mutex_destroy(&LOCK_pool);
+    mysql_mutex_destroy(&LOCK_pending_checkpoint);
     mysql_cond_destroy(&COND_pool);
     mysql_cond_destroy(&COND_active);
     mysql_cond_destroy(&COND_queue_busy);
@@ -7412,6 +7743,8 @@ void TC_LOG_MMAP::close()
   }
   if (inited>=5) // cannot do in the switch because of Windows
     mysql_file_delete(key_file_tclog, logname, MYF(MY_WME));
+  my_free(pending_checkpoint);        /* my_free() accepts a NULL pointer */
+  pending_checkpoint= NULL;           /* like inited: safe if closed again */
   inited=0;
 }
 
@@ -7518,7 +7852,7 @@ int TC_LOG_BINLOG::open(const char *opt_
   if (using_heuristic_recover())
   {
     /* generate a new binlog to mask a corrupted one */
-    open(opt_name, LOG_BIN, 0, WRITE_CACHE, 0, max_binlog_size, 0, TRUE);
+    open(opt_name, LOG_BIN, 0, WRITE_CACHE, max_binlog_size, 0, TRUE);
     cleanup();
     return 1;
   }
@@ -7606,9 +7940,6 @@ TC_LOG_BINLOG::log_and_order(THD *thd, m
 
   cache_mngr->using_xa= TRUE;
   cache_mngr->xa_xid= xid;
-#ifndef DBUG_OFF
-  cache_mngr->cookie= 0;
-#endif
   err= binlog_commit_flush_xid_caches(thd, cache_mngr, all, xid);
 
   DEBUG_SYNC(thd, "binlog_after_log_and_order");
@@ -7619,10 +7950,11 @@ TC_LOG_BINLOG::log_and_order(THD *thd, m
     If using explicit user XA, we will not have XID. We must still return a
     non-zero cookie (as zero cookie signals error).
   */
-  if (!xid)
-    DBUG_RETURN(BINLOG_COOKIE_DUMMY);
-  DBUG_ASSERT(cache_mngr->cookie != 0);
-  DBUG_RETURN(cache_mngr->cookie);
+  if (!xid || !cache_mngr->need_unlog)
+    DBUG_RETURN(BINLOG_COOKIE_DUMMY(cache_mngr->delayed_error));
+  else
+    DBUG_RETURN(BINLOG_COOKIE_MAKE(cache_mngr->binlog_id,
+                                   cache_mngr->delayed_error));
 }
 
 /*
@@ -7637,19 +7969,18 @@ TC_LOG_BINLOG::log_and_order(THD *thd, m
   binary log.
 */
 void
-TC_LOG_BINLOG::mark_xids_active(ulong cookie, uint xid_count)
+TC_LOG_BINLOG::mark_xids_active(ulong binlog_id, uint xid_count)
 {
   xid_count_per_binlog *b;
 
   DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
-  DBUG_PRINT("info", ("cookie=%lu xid_count=%u", cookie, xid_count));
-  DBUG_ASSERT(cookie != 0 && cookie != BINLOG_COOKIE_DUMMY);
+  DBUG_PRINT("info", ("binlog_id=%lu xid_count=%u", binlog_id, xid_count));
 
   mysql_mutex_lock(&LOCK_xid_list);
   I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
   while ((b= it++))
   {
-    if (b->binlog_id == cookie)
+    if (b->binlog_id == binlog_id)
     {
       b->xid_count += xid_count;
       break;
@@ -7675,15 +8006,13 @@ TC_LOG_BINLOG::mark_xids_active(ulong co
   checkpoint.
 */
 void
-TC_LOG_BINLOG::mark_xid_done(ulong cookie)
+TC_LOG_BINLOG::mark_xid_done(ulong binlog_id, bool write_checkpoint)
 {
   xid_count_per_binlog *b;
   bool first;
   ulong current;
 
   DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
-  if (cookie == BINLOG_COOKIE_DUMMY)
-    DBUG_VOID_RETURN;                           /* Nothing to do. */
 
   mysql_mutex_lock(&LOCK_xid_list);
   current= current_binlog_id;
@@ -7691,7 +8020,7 @@ TC_LOG_BINLOG::mark_xid_done(ulong cooki
   first= true;
   while ((b= it++))
   {
-    if (b->binlog_id == cookie)
+    if (b->binlog_id == binlog_id)
     {
       --b->xid_count;
       break;
@@ -7700,8 +8029,22 @@ TC_LOG_BINLOG::mark_xid_done(ulong cooki
   }
   /* Binlog is always found, as we do not remove until count reaches 0 */
   DBUG_ASSERT(b);
-  if (likely(cookie == current && !reset_master_pending) ||
-      b->xid_count != 0 || !first)
+  /*
+    If a RESET MASTER is pending, we are about to remove all log files, and
+    the RESET MASTER thread is waiting for all pending unlog() calls to
+    complete while holding LOCK_log. In this case we should not log a binlog
+    checkpoint event (it would be deleted immediately anyway and we would
+    deadlock on LOCK_log) but just signal the thread.
+  */
+  if (unlikely(reset_master_pending))
+  {
+    mysql_cond_signal(&COND_xid_list);
+    mysql_mutex_unlock(&LOCK_xid_list);
+    DBUG_VOID_RETURN;
+  }
+
+  if (likely(binlog_id == current) || b->xid_count != 0 || !first ||
+      !write_checkpoint)
   {
     /* No new binlog checkpoint reached yet. */
     mysql_mutex_unlock(&LOCK_xid_list);
@@ -7726,40 +8069,27 @@ TC_LOG_BINLOG::mark_xid_done(ulong cooki
     LOCK_log, then re-aquire LOCK_xid_list. If we were to take LOCK_log while
     holding LOCK_xid_list, we might deadlock with other threads that take the
     locks in the opposite order.
-
-    If a RESET MASTER is pending, we are about to remove all log files, and
-    the RESET MASTER thread is waiting for all pending unlog() calls to
-    complete while holding LOCK_log. In this case we should not log a binlog
-    checkpoint event (it would be deleted immediately anywat and we would
-    deadlock on LOCK_log) but just signal the thread.
   */
-  if (!reset_master_pending)
-  {
-    mysql_mutex_unlock(&LOCK_xid_list);
-    mysql_mutex_lock(&LOCK_log);
-    mysql_mutex_lock(&LOCK_xid_list);
-  }
+
+  mysql_mutex_unlock(&LOCK_xid_list);
+  mysql_mutex_lock(&LOCK_log);
+  mysql_mutex_lock(&LOCK_xid_list);
+  /* We need to reload current_binlog_id due to release/re-take of lock. */
+  current= current_binlog_id;
+
   for (;;)
   {
     /* Remove initial element(s) with zero count. */
     b= binlog_xid_count_list.head();
     /*
-      Normally, we must not remove all elements in the list.
-      Only if a RESET MASTER is in progress may we delete everything - RESET
-      MASTER has LOCK_log held, and will create a new initial element before
-      releasing the lock.
+      We must not remove all elements in the list - the entry for the current
+      binlog must be present always.
     */
-    DBUG_ASSERT(b || reset_master_pending);
-    if (unlikely(!b) || b->binlog_id == current || b->xid_count > 0)
+    DBUG_ASSERT(b);
+    if (b->binlog_id == current || b->xid_count > 0)
       break;
     my_free(binlog_xid_count_list.get());
   }
-  if (reset_master_pending)
-  {
-    mysql_cond_signal(&COND_xid_list);
-    mysql_mutex_unlock(&LOCK_xid_list);
-    DBUG_VOID_RETURN;
-  }
 
   mysql_mutex_unlock(&LOCK_xid_list);
   write_binlog_checkpoint_event_already_locked(b->binlog_name,
@@ -7771,10 +8101,22 @@ TC_LOG_BINLOG::mark_xid_done(ulong cooki
 int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
 {
   DBUG_ENTER("TC_LOG_BINLOG::unlog");
-  if (xid)
-    mark_xid_done(cookie);
-  /* As ::write_transaction_to_binlog() did not rotate, do it here. */
-  DBUG_RETURN(rotate_and_purge(0));
+  if (!xid)
+    DBUG_RETURN(0);  /* NOTE(review): cookie's error flag is not returned */
+
+  if (!BINLOG_COOKIE_IS_DUMMY(cookie))  /* a dummy cookie has no xid_count */
+    mark_xid_done(BINLOG_COOKIE_GET_ID(cookie), true);
+  /*
+    See comment in trx_group_commit_leader() - if rotate() gave a failure,
+    we delay the return of error code to here.
+  */
+  DBUG_RETURN(BINLOG_COOKIE_GET_ERROR_FLAG(cookie));
+}
+/* cookie is an xid_count_per_binlog; its binlog_id is marked done. */
+void
+TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
+{
+  mark_xid_done(((xid_count_per_binlog *)cookie)->binlog_id, true);
 }
 
 int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
@@ -7871,6 +8213,11 @@ int TC_LOG_BINLOG::recover(LOG_INFO *lin
       if (!binlog_checkpoint_found)
         break;
       first_round= false;
+      DBUG_EXECUTE_IF("xa_recover_expect_master_bin_000004",
+          if (0 != strcmp("./master-bin.000004", binlog_checkpoint_name) &&
+              0 != strcmp(".\\master-bin.000004", binlog_checkpoint_name))
+            DBUG_SUICIDE();
+        );
       if (find_log_pos(linfo, binlog_checkpoint_name, 1))
       {
         sql_print_error("Binlog file '%s' not found in binlog index, needed "
@@ -7983,10 +8330,13 @@ binlog_checksum_update(MYSQL_THD thd, st
 {
   ulong value=  *((ulong *)save);
   bool check_purge= false;
+  ulong prev_binlog_id;
+  LINT_INIT(prev_binlog_id);
 
   mysql_mutex_lock(mysql_bin_log.get_log_lock());
   if(mysql_bin_log.is_open())
   {
+    prev_binlog_id= mysql_bin_log.current_binlog_id;
     if (binlog_checksum_options != value)
       mysql_bin_log.checksum_alg_reset= (uint8) value;
     if (mysql_bin_log.rotate(true, &check_purge))
@@ -8000,7 +8350,7 @@ binlog_checksum_update(MYSQL_THD thd, st
   mysql_bin_log.checksum_alg_reset= BINLOG_CHECKSUM_ALG_UNDEF;
   mysql_mutex_unlock(mysql_bin_log.get_log_lock());
   if (check_purge)
-    mysql_bin_log.purge();
+    mysql_bin_log.checkpoint_and_purge(prev_binlog_id);
 }
 
 

=== modified file 'sql/log.h'
--- a/sql/log.h	2012-06-22 09:46:28 +0000
+++ b/sql/log.h	2012-09-13 12:31:29 +0000
@@ -49,6 +49,7 @@ class TC_LOG
                             bool need_prepare_ordered,
                             bool need_commit_ordered) = 0;
   virtual int unlog(ulong cookie, my_xid xid)=0;
+  virtual void commit_checkpoint_notify(void *cookie)= 0;
 
 protected:
   /*
@@ -98,8 +99,12 @@ class TC_LOG_DUMMY: public TC_LOG // use
     return 1;
   }
   int unlog(ulong cookie, my_xid xid)  { return 0; }
+  void commit_checkpoint_notify(void *cookie) { DBUG_ASSERT(0); };
 };
 
+#define TC_LOG_PAGE_SIZE   8192
+#define TC_LOG_MIN_SIZE    (3*TC_LOG_PAGE_SIZE)
+
 #ifdef HAVE_MMAP
 class TC_LOG_MMAP: public TC_LOG
 {
@@ -110,6 +115,12 @@ class TC_LOG_MMAP: public TC_LOG
     PS_DIRTY                 // new xids added since last sync
   } PAGE_STATE;
 
+  struct pending_cookies {
+    uint count;                        /* cookies[] slots filled by unlog() */
+    uint pending_count;                /* outstanding checkpoint notifies */
+    ulong cookies[TC_LOG_PAGE_SIZE];   /* batch handed to delete_entry() */
+  };
+
   private:
   typedef struct st_page {
     struct st_page *next; // page a linked in a fifo queue
@@ -141,7 +152,7 @@ class TC_LOG_MMAP: public TC_LOG
     one has to use active->lock.
     Same for LOCK_pool and LOCK_sync
   */
-  mysql_mutex_t LOCK_active, LOCK_pool, LOCK_sync;
+  mysql_mutex_t LOCK_active, LOCK_pool, LOCK_sync, LOCK_pending_checkpoint;
   mysql_cond_t COND_pool, COND_active;
   /*
     Queue of threads that need to call commit_ordered().
@@ -163,14 +174,16 @@ class TC_LOG_MMAP: public TC_LOG
   */
   mysql_cond_t COND_queue_busy;
   my_bool commit_ordered_queue_busy;
+  pending_cookies* pending_checkpoint;
 
   public:
-  TC_LOG_MMAP(): inited(0) {}
+  TC_LOG_MMAP(): inited(0), pending_checkpoint(0) {}
   int open(const char *opt_name);
   void close();
   int log_and_order(THD *thd, my_xid xid, bool all,
                     bool need_prepare_ordered, bool need_commit_ordered);
   int unlog(ulong cookie, my_xid xid);
+  void commit_checkpoint_notify(void *cookie);
   int recover();
 
   private:
@@ -178,6 +191,7 @@ class TC_LOG_MMAP: public TC_LOG
   void get_active_from_pool();
   int sync();
   int overflow();
+  int delete_entry(ulong cookie);
 };
 #else
 #define TC_LOG_MMAP TC_LOG_DUMMY
@@ -356,12 +370,32 @@ class MYSQL_QUERY_LOG: public MYSQL_LOG
 
 /*
   We assign each binlog file an internal ID, used to identify them for unlog().
-  Ids start from BINLOG_COOKIE_START; the value BINLOG_COOKIE_DUMMY is special
-  meaning "no binlog" (we cannot use zero as that is reserved for error return
-  from log_and_order).
-*/
-#define BINLOG_COOKIE_DUMMY 1
-#define BINLOG_COOKIE_START 2
+  The IDs start from 0 and increment for each new binlog created.
+
+  In unlog() we need to know the ID of the binlog file that the corresponding
+  transaction was written into. We also need a special value for a corner
+  case where there is no corresponding binlog id (since nothing was logged).
+  And we need an error flag to mark that unlog() must return failure.
+
+  We use the following macros to pack all of this information into the single
+  ulong available with log_and_order() / unlog().
+
+  Note that we cannot use the value 0 for cookie, as that is reserved as error
+  return value from log_and_order().
+  */
+#define BINLOG_COOKIE_ERROR_RETURN 0    /* reserved: log_and_order() error */
+#define BINLOG_COOKIE_DUMMY_ID 1        /* "no corresponding binlog" id */
+#define BINLOG_COOKIE_BASE 2            /* offset added to real binlog ids */
+#define BINLOG_COOKIE_DUMMY(error_flag) \
+  ( (BINLOG_COOKIE_DUMMY_ID<<1) | ((error_flag)&1) )
+#define BINLOG_COOKIE_MAKE(id, error_flag) \
+  ( (((id)+BINLOG_COOKIE_BASE)<<1) | ((error_flag)&1) )
+#define BINLOG_COOKIE_GET_ERROR_FLAG(c) ((c) & 1)  /* bit 0: error flag */
+#define BINLOG_COOKIE_GET_ID(c) ( ((ulong)(c)>>1) - BINLOG_COOKIE_BASE )
+#define BINLOG_COOKIE_IS_DUMMY(c) \
+  ( ((ulong)(c)>>1) == BINLOG_COOKIE_DUMMY_ID )
+
+void binlog_checkpoint_callback(void *cookie);
 
 class binlog_cache_mngr;
 class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
@@ -401,11 +435,25 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
     IO_CACHE *error_cache;
     /* This is the `all' parameter for ha_commit_ordered(). */
     bool all;
+    /*
+      True if we need to increment xid_count in trx_group_commit_leader() and
+      decrement in unlog() (this is needed if there is a participating engine
+      that does not implement the commit_checkpoint_request() handlerton
+      method).
+    */
+    bool need_unlog;
+    /*
+      Fields used to pass the necessary information to the last thread in a
+      group commit, only used when opt_optimize_thread_scheduling is not set.
+    */
+    bool check_purge;
+    ulong binlog_id;
   };
 
   /*
     A list of struct xid_count_per_binlog is used to keep track of how many
-    XIDs are in prepared, but not committed, state in each binlog.
+    XIDs are in prepared, but not committed, state in each binlog, and how
+    many commit_checkpoint_request() calls are still pending.
 
     When count drops to zero in a binlog after rotation, it means that there
     are no more XIDs in prepared state, so that binlog is no longer needed
@@ -418,10 +466,10 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
     char *binlog_name;
     uint binlog_name_len;
     ulong binlog_id;
+    /* Total prepared XIDs and pending checkpoint requests in this binlog. */
     long xid_count;
     xid_count_per_binlog();   /* Give link error if constructor used. */
   };
-  ulong current_binlog_id;
   I_List<xid_count_per_binlog> binlog_xid_count_list;
   /*
     When this is set, a RESET MASTER is in progress.
@@ -432,6 +480,7 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
     checkpoint arrives - when all have arrived, RESET MASTER will complete.
   */
   bool reset_master_pending;
+  friend void binlog_checkpoint_callback(void *cookie);
 
   /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
   mysql_mutex_t LOCK_index;
@@ -464,15 +513,6 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   uint file_id;
   uint open_count;                              // For replication
   int readers_count;
-  bool need_start_event;
-  /*
-    no_auto_events means we don't want any of these automatic events :
-    Start/Rotate/Stop. That is, in 4.x when we rotate a relay log, we don't
-    want a Rotate_log event to be written to the relay log. When we start a
-    relay log etc. So in 4.x this is 1 for relay logs, 0 for binlogs.
-    In 5.0 it's 0 for relay logs too!
-  */
-  bool no_auto_events;
   /* Queue of transactions queued up to participate in group commit. */
   group_commit_entry *group_commit_queue;
   /*
@@ -508,10 +548,12 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   */
   int new_file_without_locking();
   int new_file_impl(bool need_lock);
+  void do_checkpoint_request(ulong binlog_id);
+  void purge();
   int write_transaction_or_stmt(group_commit_entry *entry);
   bool write_transaction_to_binlog_events(group_commit_entry *entry);
   void trx_group_commit_leader(group_commit_entry *leader);
-  void mark_xid_done(ulong cookie);
+  void mark_xid_done(ulong cookie, bool write_checkpoint);
   void mark_xids_active(ulong cookie, uint xid_count);
 
 public:
@@ -572,6 +614,7 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   */
   char last_commit_pos_file[FN_REFLEN];
   my_off_t last_commit_pos_offset;
+  ulong current_binlog_id;
 
   MYSQL_BIN_LOG(uint *sync_period);
   /*
@@ -600,6 +643,7 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   int log_and_order(THD *thd, my_xid xid, bool all,
                     bool need_prepare_ordered, bool need_commit_ordered);
   int unlog(ulong cookie, my_xid xid);
+  void commit_checkpoint_notify(void *cookie);
   int recover(LOG_INFO *linfo, const char *last_log_name, IO_CACHE *first_log,
               Format_description_log_event *fdle);
 #if !defined(MYSQL_CLIENT)
@@ -629,15 +673,14 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   void signal_update();
   void wait_for_update_relay_log(THD* thd);
   int  wait_for_update_bin_log(THD* thd, const struct timespec * timeout);
-  void set_need_start_event() { need_start_event = 1; }
-  void init(bool no_auto_events_arg, ulong max_size);
+  void init(ulong max_size);
   void init_pthread_objects();
   void cleanup();
   bool open(const char *log_name,
             enum_log_type log_type,
             const char *new_name,
             enum cache_type io_cache_type_arg,
-            bool no_auto_events_arg, ulong max_size,
+            ulong max_size,
             bool null_created,
             bool need_mutex);
   bool open_index_file(const char *index_file_name_arg,
@@ -674,7 +717,7 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   bool can_purge_log(const char *log_file_name);
   int update_log_index(LOG_INFO* linfo, bool need_update_threads);
   int rotate(bool force_rotate, bool* check_purge);
-  void purge();
+  void checkpoint_and_purge(ulong binlog_id);
   int rotate_and_purge(bool force_rotate);
   /**
      Flush binlog cache and synchronize to disk.

=== modified file 'sql/log_event.cc'
--- a/sql/log_event.cc	2012-06-22 09:46:28 +0000
+++ b/sql/log_event.cc	2012-09-13 12:31:29 +0000
@@ -5874,7 +5874,7 @@ Rotate_log_event::do_shall_skip(Relay_lo
 **************************************************************************/
 
 #if defined(HAVE_REPLICATION) && !defined(MYSQL_CLIENT)
-void Binlog_checkpoint_log_event::pack_info(Protocol *protocol)
+void Binlog_checkpoint_log_event::pack_info(THD *thd, Protocol *protocol)
 {
   protocol->store(binlog_file_name, binlog_file_len, &my_charset_bin);
 }

=== modified file 'sql/log_event.h'
--- a/sql/log_event.h	2012-06-22 09:46:28 +0000
+++ b/sql/log_event.h	2012-09-13 12:31:29 +0000
@@ -2911,7 +2911,7 @@ class Binlog_checkpoint_log_event: publi
   Binlog_checkpoint_log_event(const char *binlog_file_name_arg,
                               uint binlog_file_len_arg);
 #ifdef HAVE_REPLICATION
-  void pack_info(Protocol *protocol);
+  void pack_info(THD *thd, Protocol *protocol);
 #endif
 #else
   void print(FILE *file, PRINT_EVENT_INFO *print_event_info);

=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc	2012-06-22 09:46:28 +0000
+++ b/sql/mysqld.cc	2012-09-13 12:31:29 +0000
@@ -715,7 +715,8 @@ char **orig_argv;
 
 #ifdef HAVE_PSI_INTERFACE
 #ifdef HAVE_MMAP
-PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active, key_LOCK_pool;
+PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active, key_LOCK_pool,
+  key_LOCK_pending_checkpoint;
 #endif /* HAVE_MMAP */
 
 #ifdef HAVE_OPENSSL
@@ -756,6 +757,7 @@ static PSI_mutex_info all_server_mutexes
   { &key_LOCK_sync, "TC_LOG_MMAP::LOCK_sync", 0},
   { &key_LOCK_active, "TC_LOG_MMAP::LOCK_active", 0},
   { &key_LOCK_pool, "TC_LOG_MMAP::LOCK_pool", 0},
+  { &key_LOCK_pending_checkpoint, "TC_LOG_MMAP::LOCK_pending_checkpoint", 0},
 #endif /* HAVE_MMAP */
 
 #ifdef HAVE_OPENSSL
@@ -4418,7 +4420,7 @@ a file name for --log-bin-index option",
   }
 
   if (opt_bin_log && mysql_bin_log.open(opt_bin_logname, LOG_BIN, 0,
-                                        WRITE_CACHE, 0, max_binlog_size, 0, TRUE))
+                                        WRITE_CACHE, max_binlog_size, 0, TRUE))
     unireg_abort(1);
 
 #ifdef HAVE_REPLICATION

=== modified file 'sql/mysqld.h'
--- a/sql/mysqld.h	2012-06-22 09:46:28 +0000
+++ b/sql/mysqld.h	2012-09-13 12:31:29 +0000
@@ -218,7 +218,7 @@ extern pthread_key(MEM_ROOT**,THR_MALLOC
 #ifdef HAVE_PSI_INTERFACE
 #ifdef HAVE_MMAP
 extern PSI_mutex_key key_PAGE_lock, key_LOCK_sync, key_LOCK_active,
-       key_LOCK_pool;
+       key_LOCK_pool, key_LOCK_pending_checkpoint;
 #endif /* HAVE_MMAP */
 
 #ifdef HAVE_OPENSSL

=== modified file 'sql/rpl_rli.cc'
--- a/sql/rpl_rli.cc	2012-08-27 16:13:17 +0000
+++ b/sql/rpl_rli.cc	2012-09-13 12:31:29 +0000
@@ -213,7 +213,7 @@ a file name for --relay-log-index option
       but a destructor will take care of that
     */
     if (rli->relay_log.open_index_file(opt_relaylog_index_name, ln, TRUE) ||
-        rli->relay_log.open(ln, LOG_BIN, 0, SEQ_READ_APPEND, 0,
+        rli->relay_log.open(ln, LOG_BIN, 0, SEQ_READ_APPEND,
                             (max_relay_log_size ? max_relay_log_size :
                             max_binlog_size), 1, TRUE))
     {

=== modified file 'sql/slave.cc'
--- a/sql/slave.cc	2012-06-22 09:40:40 +0000
+++ b/sql/slave.cc	2012-09-13 12:31:29 +0000
@@ -1757,13 +1757,12 @@ when it try to get the value of TIME_ZON
   /* Announce MariaDB slave capabilities. */
   DBUG_EXECUTE_IF("simulate_slave_capability_none", goto after_set_capability;);
   {
-    const char *q=
-      DBUG_EVALUATE_IF("simulate_slave_capability_old_53",
-                       "SET @mariadb_slave_capability="
-                           STRINGIFY_ARG(MARIA_SLAVE_CAPABILITY_ANNOTATE),
-                       "SET @mariadb_slave_capability="
-                           STRINGIFY_ARG(MARIA_SLAVE_CAPABILITY_MINE));
-    if (mysql_real_query(mysql, q, strlen(q)))
+    int rc= DBUG_EVALUATE_IF("simulate_slave_capability_old_53",
+        mysql_real_query(mysql, STRING_WITH_LEN("SET @mariadb_slave_capability="
+                         STRINGIFY_ARG(MARIA_SLAVE_CAPABILITY_ANNOTATE))),
+        mysql_real_query(mysql, STRING_WITH_LEN("SET @mariadb_slave_capability="
+                         STRINGIFY_ARG(MARIA_SLAVE_CAPABILITY_MINE))));
+    if (rc)
     {
       err_code= mysql_errno(mysql);
       if (is_network_error(err_code))

=== modified file 'sql/sql_class.h'
--- a/sql/sql_class.h	2012-09-04 16:26:30 +0000
+++ b/sql/sql_class.h	2012-09-13 12:31:29 +0000
@@ -149,9 +149,6 @@ class CSET_STRING
 };
 
 
-#define TC_LOG_PAGE_SIZE   8192
-#define TC_LOG_MIN_SIZE    (3*TC_LOG_PAGE_SIZE)
-
 #define TC_HEURISTIC_RECOVER_COMMIT   1
 #define TC_HEURISTIC_RECOVER_ROLLBACK 2
 extern ulong tc_heuristic_recover;

=== modified file 'storage/innobase/handler/ha_innodb.cc'
--- a/storage/innobase/handler/ha_innodb.cc	2012-08-31 21:54:54 +0000
+++ b/storage/innobase/handler/ha_innodb.cc	2012-09-13 12:31:29 +0000
@@ -347,6 +347,7 @@ static int innobase_rollback_to_savepoin
 static int innobase_savepoint(handlerton *hton, THD* thd, void *savepoint);
 static int innobase_release_savepoint(handlerton *hton, THD* thd,
            void *savepoint);
+static void innobase_checkpoint_request(handlerton *hton, void *cookie);
 static handler *innobase_create_handler(handlerton *hton,
                                         TABLE_SHARE *table,
                                         MEM_ROOT *mem_root);
@@ -2250,6 +2251,7 @@ innobase_init(
         innobase_hton->recover=innobase_xa_recover;
         innobase_hton->commit_by_xid=innobase_commit_by_xid;
         innobase_hton->rollback_by_xid=innobase_rollback_by_xid;
+        innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
         innobase_hton->create_cursor_read_view=innobase_create_cursor_view;
         innobase_hton->set_cursor_read_view=innobase_set_cursor_view;
         innobase_hton->close_cursor_read_view=innobase_close_cursor_view;
@@ -3007,6 +3009,19 @@ innobase_rollback_trx(
 }
 
 /*****************************************************************//**
+Handle a commit checkpoint request from server layer.
+We simply flush the redo log immediately and do the notify call.*/
+static
+void
+innobase_checkpoint_request(
+        handlerton *hton,
+        void *cookie)
+{
+        log_buffer_flush_to_disk();
+        commit_checkpoint_notify_ha(hton, cookie);
+}
+
+/*****************************************************************//**
 Rolls back a transaction to a savepoint.
 @return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
 given name */
@@ -11460,10 +11475,17 @@ static MYSQL_SYSVAR_STR(file_format_max,
 
 static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
   PLUGIN_VAR_OPCMDARG,
-  "Set to 0 (write and flush once per second),"
-  " 1 (write and flush at each commit)"
-  " or 2 (write at commit, flush once per second).",
-  NULL, NULL, 1, 0, 2, 0);
+  "Controls the durability/speed trade-off for commits."
+  " Set to 0 (write and flush redo log to disk only once per second),"
+  " 1 (flush to disk at each commit),"
+  " 2 (write to log at commit but flush to disk only once per second)"
+  " or 3 (flush to disk at prepare and at commit, slower and usually redundant)."
+  " 1 and 3 guarantees that after a crash, committed transactions will"
+  " not be lost and will be consistent with the binlog and other transactional"
+  " engines. 2 can get inconsistent and lose transactions if there is a"
+  " power failure or kernel crash but not if mysqld crashes. 0 has no"
+  " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.",
+  NULL, NULL, 1, 0, 3, 0);
 
 static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,

=== modified file 'storage/innobase/trx/trx0trx.c'
--- a/storage/innobase/trx/trx0trx.c	2012-04-07 13:58:46 +0000
+++ b/storage/innobase/trx/trx0trx.c	2012-09-13 12:31:29 +0000
@@ -1025,7 +1025,8 @@ trx_commit_off_kernel(
                         trx->must_flush_log_later = TRUE;
                 } else if (srv_flush_log_at_trx_commit == 0) {
                         /* Do nothing */
-                } else if (srv_flush_log_at_trx_commit == 1) {
+                } else if (srv_flush_log_at_trx_commit == 1 ||
+                           srv_flush_log_at_trx_commit == 3) {
                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
                                 /* Write the log but do not flush it to disk */
 
@@ -1712,7 +1713,11 @@ trx_commit_complete_for_mysql(
                 /* Do nothing */
         } else if (srv_flush_log_at_trx_commit == 0) {
                 /* Do nothing */
-        } else if (srv_flush_log_at_trx_commit == 1) {
+        } else if (srv_flush_log_at_trx_commit == 1 && trx->active_commit_ordered) {
+                /* Do nothing - we already flushed the prepare and binlog write
+                to disk, so transaction is durable (will be recovered from
+                binlog if necessary) */
+        } else if (srv_flush_log_at_trx_commit == 1 || srv_flush_log_at_trx_commit == 3) {
                 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
                         /* Write the log but do not flush it to disk */
 
@@ -1992,7 +1997,7 @@ trx_prepare_off_kernel(
 
                 if (srv_flush_log_at_trx_commit == 0) {
                         /* Do nothing */
-                } else if (srv_flush_log_at_trx_commit == 1) {
+                } else if (srv_flush_log_at_trx_commit == 1 || srv_flush_log_at_trx_commit == 3) {
                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
                                 /* Write the log but do not flush it to disk */
 

=== modified file 'storage/xtradb/handler/ha_innodb.cc'
--- a/storage/xtradb/handler/ha_innodb.cc	2012-08-31 21:54:54 +0000
+++ b/storage/xtradb/handler/ha_innodb.cc	2012-09-13 12:31:29 +0000
@@ -383,6 +383,7 @@ static int innobase_rollback_to_savepoin
 static int innobase_savepoint(handlerton *hton, THD* thd, void *savepoint);
 static int innobase_release_savepoint(handlerton *hton, THD* thd,
            void *savepoint);
+static void innobase_checkpoint_request(handlerton *hton, void *cookie);
 static handler *innobase_create_handler(handlerton *hton,
                                         TABLE_SHARE *table,
                                         MEM_ROOT *mem_root);
@@ -483,10 +484,17 @@ static MYSQL_THDVAR_ULONG(lock_wait_time
   NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
 
 static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit, PLUGIN_VAR_OPCMDARG,
-  "Set to 0 (write and flush once per second),"
-  " 1 (write and flush at each commit)"
-  " or 2 (write at commit, flush once per second).",
-  NULL, NULL, 1, 0, 2, 0);
+  "Controls the durability/speed trade-off for commits."
+  " Set to 0 (write and flush redo log to disk only once per second),"
+  " 1 (flush to disk at each commit),"
+  " 2 (write to log at commit but flush to disk only once per second)"
+  " or 3 (flush to disk at prepare and at commit, slower and usually redundant)."
+  " 1 and 3 guarantees that after a crash, committed transactions will"
+  " not be lost and will be consistent with the binlog and other transactional"
+  " engines. 2 can get inconsistent and lose transactions if there is a"
+  " power failure or kernel crash but not if mysqld crashes. 0 has no"
+  " guarantees in case of crash. 0 and 2 can be faster than 1 or 3.",
+  NULL, NULL, 1, 0, 3, 0);
 
 static MYSQL_THDVAR_BOOL(fake_changes, PLUGIN_VAR_OPCMDARG,
   "In the transaction after enabled, UPDATE, INSERT and DELETE only move the cursor to the records "
@@ -2469,6 +2477,7 @@ innobase_init(
         innobase_hton->recover=innobase_xa_recover;
         innobase_hton->commit_by_xid=innobase_commit_by_xid;
         innobase_hton->rollback_by_xid=innobase_rollback_by_xid;
+        innobase_hton->commit_checkpoint_request=innobase_checkpoint_request;
         innobase_hton->checkpoint_state= innobase_checkpoint_state;
         innobase_hton->create_cursor_read_view=innobase_create_cursor_view;
         innobase_hton->set_cursor_read_view=innobase_set_cursor_view;
@@ -3492,6 +3501,19 @@ innobase_rollback_trx(
 }
 
 /*****************************************************************//**
+Handle a commit checkpoint request from server layer.
+We simply flush the redo log immediately and do the notify call.*/
+static
+void
+innobase_checkpoint_request(
+        handlerton *hton,
+        void *cookie)
+{
+        log_buffer_flush_to_disk();
+        commit_checkpoint_notify_ha(hton, cookie);
+}
+
+/*****************************************************************//**
 Rolls back a transaction to a savepoint.
 @return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
 given name */

=== modified file 'storage/xtradb/include/trx0trx.h'
--- a/storage/xtradb/include/trx0trx.h	2012-02-15 14:37:38 +0000
+++ b/storage/xtradb/include/trx0trx.h	2012-09-13 12:31:29 +0000
@@ -494,7 +494,6 @@ struct trx_struct{
                                         this is set to 1 then registered should
                                         also be set to 1. This is used in the
                                         XA code */
-        unsigned        called_commit_ordered:1;/* 1 if innobase_commit_ordered has run. */
         /*------------------------------*/
         ulint           isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
         ulint           check_foreigns; /* normally TRUE, but if the user

=== modified file 'storage/xtradb/trx/trx0trx.c'
--- a/storage/xtradb/trx/trx0trx.c	2012-08-27 16:13:17 +0000
+++ b/storage/xtradb/trx/trx0trx.c	2012-09-13 12:31:29 +0000
@@ -1099,7 +1099,8 @@ trx_commit_off_kernel(
                         trx->must_flush_log_later = TRUE;
                 } else if (flush_log_at_trx_commit == 0) {
                         /* Do nothing */
-                } else if (flush_log_at_trx_commit == 1) {
+                } else if (flush_log_at_trx_commit == 1 ||
+                           flush_log_at_trx_commit == 3) {
                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
                                 /* Write the log but do not flush it to disk */
 
@@ -1809,7 +1810,11 @@ trx_commit_complete_for_mysql(
                 /* Do nothing */
         } else if (flush_log_at_trx_commit == 0) {
                 /* Do nothing */
-        } else if (flush_log_at_trx_commit == 1) {
+        } else if (flush_log_at_trx_commit == 1 && trx->active_commit_ordered) {
+                /* Do nothing - we already flushed the prepare and binlog write
+                to disk, so transaction is durable (will be recovered from
+                binlog if necessary) */
+        } else if (flush_log_at_trx_commit == 1 || flush_log_at_trx_commit == 3) {
                 if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
                         /* Write the log but do not flush it to disk */
 
@@ -2097,7 +2102,7 @@ trx_prepare_off_kernel(
 
                 if (flush_log_at_trx_commit == 0) {
                         /* Do nothing */
-                } else if (flush_log_at_trx_commit == 1) {
+                } else if (flush_log_at_trx_commit == 1 || flush_log_at_trx_commit == 3) {
                         if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
                                 /* Write the log but do not flush it to disk */
 



More information about the commits mailing list