[Commits] Rev 3435: MDEV-532: Async InnoDB commit checkpoint. in http://bazaar.launchpad.net/~maria-captains/maria/10.0

knielsen at knielsen-hq.org knielsen at knielsen-hq.org
Thu Sep 13 16:51:09 EEST 2012


At http://bazaar.launchpad.net/~maria-captains/maria/10.0

------------------------------------------------------------
revno: 3435
revision-id: knielsen at knielsen-hq.org-20120913135109-134ycuybjmr88nbv
parent: knielsen at knielsen-hq.org-20120913123129-kaujy4cw0jc9o08k
committer: knielsen at knielsen-hq.org
branch nick: work-10.0-mdev225-181-232
timestamp: Thu 2012-09-13 15:51:09 +0200
message:
  MDEV-532: Async InnoDB commit checkpoint.
  
  Make the commit checkpoint inside InnoDB be asynchronous.
  Implement a background thread in binlog to do the writing and flushing of
  binlog checkpoint events to disk.
=== modified file 'mysql-test/suite/binlog/r/binlog_checkpoint.result'
--- a/mysql-test/suite/binlog/r/binlog_checkpoint.result	2012-09-13 12:31:29 +0000
+++ b/mysql-test/suite/binlog/r/binlog_checkpoint.result	2012-09-13 13:51:09 +0000
@@ -70,8 +70,14 @@ show binlog events in 'master-bin.000003
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION
 master-bin.000003       #       Binlog_checkpoint       #       #       master-bin.000001
+SET DEBUG_SYNC= "RESET";
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
 SET DEBUG_SYNC= "now SIGNAL con2_continue";
 con1 is still pending, no new binlog checkpoint should have been logged.
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
+SET DEBUG_SYNC= "RESET";
 show binlog events in 'master-bin.000003' from <binlog_start>;
 Log_name        Pos     Event_type      Server_id       End_log_pos     Info
 master-bin.000003       #       Format_desc     #       #       SERVER_VERSION, BINLOG_VERSION

=== modified file 'mysql-test/suite/binlog/r/binlog_xa_recover.result'
--- a/mysql-test/suite/binlog/r/binlog_xa_recover.result	2012-09-13 12:31:29 +0000
+++ b/mysql-test/suite/binlog/r/binlog_xa_recover.result	2012-09-13 13:51:09 +0000
@@ -118,7 +118,11 @@ master-bin.00000<binlog_start>	#	Table_m
 master-bin.00000<binlog_start>  #       Write_rows      #       #       table_id: # flags: STMT_END_F
 master-bin.00000<binlog_start>  #       Xid     #       #       COMMIT /* XID */
 SET DEBUG_SYNC= "now SIGNAL con10_cont";
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
 SET DEBUG_SYNC= "now SIGNAL con12_cont";
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
 SET DEBUG_SYNC= "now SIGNAL con11_cont";
 Checking that master-bin.000004 is the last binlog checkpoint
 show binlog events in 'master-bin.00000<binlog_start>' from <binlog_start>;

=== modified file 'mysql-test/suite/binlog/t/binlog_checkpoint.test'
--- a/mysql-test/suite/binlog/t/binlog_checkpoint.test	2012-09-13 12:31:29 +0000
+++ b/mysql-test/suite/binlog/t/binlog_checkpoint.test	2012-09-13 13:51:09 +0000
@@ -71,6 +71,12 @@ SET DEBUG_SYNC= "now WAIT_FOR con2_ready
 --let $binlog_file= master-bin.000003
 --source include/show_binlog_events.inc
 
+# We need to sync the test case with the background processing of the
+# commit checkpoint, otherwise we get nondeterministic results.
+SET DEBUG_SYNC= "RESET";
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
+
 SET DEBUG_SYNC= "now SIGNAL con2_continue";
 
 connection con2;
@@ -78,6 +84,12 @@ reap;
 
 connection default;
 --echo con1 is still pending, no new binlog checkpoint should have been logged.
+# Make sure commit checkpoint is processed before we check that no checkpoint
+# event has been binlogged.
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
+SET DEBUG_SYNC= "RESET";
+
 --let $binlog_file= master-bin.000003
 --source include/show_binlog_events.inc
 

=== modified file 'mysql-test/suite/binlog/t/binlog_xa_recover.test'
--- a/mysql-test/suite/binlog/t/binlog_xa_recover.test	2012-09-13 12:31:29 +0000
+++ b/mysql-test/suite/binlog/t/binlog_xa_recover.test	2012-09-13 13:51:09 +0000
@@ -14,8 +14,24 @@ CREATE TABLE t1 (a INT PRIMARY KEY, b ME
 # Insert some data to force a couple binlog rotations (3), so we get some
 # normal binlog checkpoints before starting the test.
 INSERT INTO t1 VALUES (100, REPEAT("x", 4100));
+# Wait for the master-bin.000002 binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000002"
+--let $field= Info
+--let $condition= = "master-bin.000002"
+--source include/wait_show_condition.inc
 INSERT INTO t1 VALUES (101, REPEAT("x", 4100));
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
+--let $field= Info
+--let $condition= = "master-bin.000003"
+--source include/wait_show_condition.inc
 INSERT INTO t1 VALUES (102, REPEAT("x", 4100));
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
+--let $field= Info
+--let $condition= = "master-bin.000004"
+--source include/wait_show_condition.inc
 
 # Now start a bunch of transactions that span multiple binlog
 # files. Leave then in the state prepared-but-not-committed in the engine
@@ -153,10 +169,19 @@ SET DEBUG_SYNC= "now SIGNAL con10_cont";
 connection con10;
 reap;
 connection default;
+
+# We need to sync the test case with the background processing of the
+# commit checkpoint, otherwise we get nondeterministic results.
+SET @old_dbug= @@global.DEBUG_DBUG;
+SET GLOBAL debug_dbug="+d,binlog_background_checkpoint_processed";
+
 SET DEBUG_SYNC= "now SIGNAL con12_cont";
 connection con12;
 reap;
 connection default;
+SET DEBUG_SYNC= "now WAIT_FOR binlog_background_checkpoint_processed";
+SET GLOBAL debug_dbug= @old_dbug;
+
 SET DEBUG_SYNC= "now SIGNAL con11_cont";
 connection con11;
 reap;
@@ -210,7 +235,20 @@ RESET MASTER;
 # crash recovery fails due to the error insert used for previous test.
 INSERT INTO t1 VALUES (21, REPEAT("x", 4100));
 INSERT INTO t1 VALUES (22, REPEAT("x", 4100));
+# Wait for the master-bin.000003 binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000003"
+--let $field= Info
+--let $condition= = "master-bin.000003"
+--source include/wait_show_condition.inc
 INSERT INTO t1 VALUES (23, REPEAT("x", 4100));
+# Wait for the last (master-bin.000004) binlog checkpoint to appear.
+--let $wait_for_all= 0
+--let $show_statement= SHOW BINLOG EVENTS IN "master-bin.000004"
+--let $field= Info
+--let $condition= = "master-bin.000004"
+--source include/wait_show_condition.inc
+
 --write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
 wait-binlog_xa_recover.test
 EOF

=== modified file 'mysql-test/suite/perfschema/r/all_instances.result'
--- a/mysql-test/suite/perfschema/r/all_instances.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/perfschema/r/all_instances.result	2012-09-13 13:51:09 +0000
@@ -76,6 +76,7 @@ wait/synch/mutex/sql/Master_info::run_lo
 wait/synch/mutex/sql/Master_info::sleep_lock
 wait/synch/mutex/sql/MDL_map::mutex
 wait/synch/mutex/sql/MDL_wait::LOCK_wait_status
+wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread
 wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index
 wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list
 wait/synch/mutex/sql/MYSQL_RELAY_LOG::LOCK_index
@@ -129,6 +130,8 @@ wait/synch/cond/sql/Master_info::sleep_c
 wait/synch/cond/sql/Master_info::start_cond
 wait/synch/cond/sql/Master_info::stop_cond
 wait/synch/cond/sql/MDL_context::COND_wait_status
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end
 wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy
 wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list
 wait/synch/cond/sql/MYSQL_BIN_LOG::update_cond

=== modified file 'mysql-test/suite/perfschema/r/relaylog.result'
--- a/mysql-test/suite/perfschema/r/relaylog.result	2012-06-22 09:46:28 +0000
+++ b/mysql-test/suite/perfschema/r/relaylog.result	2012-09-13 13:51:09 +0000
@@ -56,8 +56,11 @@ where event_name like "%MYSQL_BIN_LOG%"
   and event_name not like "%MYSQL_BIN_LOG::update_cond"
   order by event_name;
 EVENT_NAME      COUNT_STAR
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread   NONE
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end       NONE
 wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy      NONE
 wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list        NONE
+wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread  MANY
 wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index  MANY
 wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list       MANY
 "Expect no slave relay log"
@@ -131,8 +134,11 @@ where event_name like "%MYSQL_BIN_LOG%"
   and event_name not like "%MYSQL_BIN_LOG::update_cond"
   order by event_name;
 EVENT_NAME      COUNT_STAR
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread   MANY
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_binlog_thread_end       NONE
 wait/synch/cond/sql/MYSQL_BIN_LOG::COND_queue_busy      NONE
-wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list        NONE
+wait/synch/cond/sql/MYSQL_BIN_LOG::COND_xid_list        MANY
+wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_binlog_thread  MANY
 wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_index  MANY
 wait/synch/mutex/sql/MYSQL_BIN_LOG::LOCK_xid_list       MANY
 "Expect a slave relay log"

=== modified file 'sql/debug_sync.cc'
--- a/sql/debug_sync.cc	2012-03-28 17:26:00 +0000
+++ b/sql/debug_sync.cc	2012-09-13 13:51:09 +0000
@@ -984,6 +984,7 @@ static bool debug_sync_eval_action(THD *
   DBUG_ENTER("debug_sync_eval_action");
   DBUG_ASSERT(thd);
   DBUG_ASSERT(action_str);
+  DBUG_PRINT("debug_sync", ("action_str='%s'", action_str));
 
   /*
     Get debug sync point name. Or a special command.

=== modified file 'sql/log.cc'
--- a/sql/log.cc	2012-09-13 12:31:29 +0000
+++ b/sql/log.cc	2012-09-13 13:51:09 +0000
@@ -53,6 +53,7 @@
 #include "rpl_handler.h"
 #include "debug_sync.h"
 #include "sql_show.h"
+#include "my_pthread.h"
 
 /* max size of the log message */
 #define MAX_LOG_BUFFER_SIZE 1024
@@ -106,6 +107,14 @@ static SHOW_VAR binlog_status_vars_detai
   {NullS, NullS, SHOW_LONG}
 };
 
+/* Variables for the binlog background thread. */
+static bool binlog_thread_started= false;
+static bool binlog_background_thread_stop= false;
+static MYSQL_BIN_LOG::xid_count_per_binlog *
+    binlog_background_thread_queue= NULL;
+
+static bool start_binlog_background_thread();
+
 
 /**
    purge logs, master and slave sides both, related error code
@@ -2957,12 +2966,27 @@ void MYSQL_BIN_LOG::cleanup()
       my_free(b);
     }
 
+    /* Wait for the binlog thread to stop. */
+    if (log_type == LOG_BIN && binlog_thread_started)
+    {
+      mysql_mutex_lock(&LOCK_binlog_thread);
+      binlog_background_thread_stop= true;
+      mysql_cond_signal(&COND_binlog_thread);
+      while (binlog_background_thread_stop)
+        mysql_cond_wait(&COND_binlog_thread_end, &LOCK_binlog_thread);
+      mysql_mutex_unlock(&LOCK_binlog_thread);
+      binlog_thread_started= false;
+    }
+
     mysql_mutex_destroy(&LOCK_log);
     mysql_mutex_destroy(&LOCK_index);
     mysql_mutex_destroy(&LOCK_xid_list);
+    mysql_mutex_destroy(&LOCK_binlog_thread);
     mysql_cond_destroy(&update_cond);
     mysql_cond_destroy(&COND_queue_busy);
     mysql_cond_destroy(&COND_xid_list);
+    mysql_cond_destroy(&COND_binlog_thread);
+    mysql_cond_destroy(&COND_binlog_thread_end);
   }
   DBUG_VOID_RETURN;
 }
@@ -2988,6 +3012,11 @@ void MYSQL_BIN_LOG::init_pthread_objects
   mysql_cond_init(m_key_update_cond, &update_cond, 0);
   mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
   mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0);
+
+  mysql_mutex_init(key_BINLOG_LOCK_binlog_thread,
+                   &LOCK_binlog_thread, MY_MUTEX_INIT_FAST);
+  mysql_cond_init(key_BINLOG_COND_binlog_thread, &COND_binlog_thread, 0);
+  mysql_cond_init(key_BINLOG_COND_binlog_thread_end, &COND_binlog_thread_end, 0);
 }
 
 
@@ -3085,6 +3114,10 @@ bool MYSQL_BIN_LOG::open(const char *log
   DBUG_ENTER("MYSQL_BIN_LOG::open");
   DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg));
 
+  if (log_type_arg == LOG_BIN && !binlog_thread_started &&
+      start_binlog_background_thread())
+    DBUG_RETURN(1);
+
   if (init_and_set_log_file_name(log_name, new_name, log_type_arg,
                                  io_cache_type_arg))
   {
@@ -5540,11 +5573,7 @@ bool general_log_write(THD *thd, enum en
 }
 
 
-/*
-  I would like to make this function static, but this causes compiler warnings
-  when it is declared as friend function in log.h.
-*/
-void
+static void
 binlog_checkpoint_callback(void *cookie)
 {
   MYSQL_BIN_LOG::xid_count_per_binlog *entry=
@@ -8116,9 +8145,128 @@ int TC_LOG_BINLOG::unlog(ulong cookie, m
 void
 TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
 {
-  mark_xid_done(((xid_count_per_binlog *)cookie)->binlog_id, true);
+  xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
+  mysql_mutex_lock(&LOCK_binlog_thread);
+  entry->next_in_queue= binlog_background_thread_queue;
+  binlog_background_thread_queue= entry;
+  mysql_cond_signal(&COND_binlog_thread);
+  mysql_mutex_unlock(&LOCK_binlog_thread);
 }
 
+/*
+  Binlog service thread.
+
+  This thread is used to log binlog checkpoints in the background, rather than
+  in the context of random storage engine threads that happen to call
+  commit_checkpoint_notify_ha() and may not like the delays while syncing
+  binlog to disk or may not be set up with all my_thread_init() and other
+  necessary stuff.
+
+  In the future, this thread could also be used to do log rotation in the
+  background, which could eliminate all stalls around binlog rotations.
+*/
+pthread_handler_t
+binlog_background_thread(void *arg __attribute__((unused)))
+{
+  bool stop;
+  MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
+  THD *thd;
+
+  my_thread_init();
+  thd= new THD;
+  thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND;
+  my_pthread_setspecific_ptr(THR_THD, thd);
+  mysql_mutex_lock(&LOCK_thread_count);
+  thd->thread_id= thread_id++;
+  mysql_mutex_unlock(&LOCK_thread_count);
+
+  for (;;)
+  {
+    /*
+      Wait until there is something in the queue to process, or we are asked
+      to shut down.
+    */
+    thd_proc_info(thd, "Waiting for background binlog tasks");
+    mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_thread);
+    for (;;)
+    {
+      stop= binlog_background_thread_stop;
+      queue= binlog_background_thread_queue;
+      if (stop || queue)
+        break;
+      mysql_cond_wait(&mysql_bin_log.COND_binlog_thread,
+                      &mysql_bin_log.LOCK_binlog_thread);
+    }
+    /* Grab the queue, if any. */
+    binlog_background_thread_queue= NULL;
+    mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_thread);
+
+    /* Process any incoming commit_checkpoint_notify() calls. */
+    while (queue)
+    {
+      thd_proc_info(thd, "Processing binlog checkpoint notification");
+      /* Grab next pointer first, as mark_xid_done() may free the element. */
+      next= queue->next_in_queue;
+      mysql_bin_log.mark_xid_done(queue->binlog_id, true);
+      queue= next;
+
+      DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
+        DBUG_ASSERT(!debug_sync_set_action(
+          thd,
+          STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
+        );
+    }
+
+    if (stop)
+      break;
+  }
+
+  thd_proc_info(thd, "Stopping binlog background thread");
+
+  mysql_mutex_lock(&LOCK_thread_count);
+  delete thd;
+  mysql_mutex_unlock(&LOCK_thread_count);
+
+  my_thread_end();
+
+  /* Signal that we are (almost) stopped. */
+  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_thread);
+  binlog_background_thread_stop= false;
+  mysql_cond_signal(&mysql_bin_log.COND_binlog_thread_end);
+  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_thread);
+
+  return 0;
+}
+
+#ifdef HAVE_PSI_INTERFACE
+static PSI_thread_key key_thread_binlog;
+
+static PSI_thread_info all_binlog_threads[]=
+{
+  { &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL},
+};
+#endif /* HAVE_PSI_INTERFACE */
+
+static bool
+start_binlog_background_thread()
+{
+  pthread_t th;
+
+#ifdef HAVE_PSI_INTERFACE
+  if (PSI_server)
+    PSI_server->register_thread("sql", all_binlog_threads,
+                                array_elements(all_binlog_threads));
+#endif
+
+  if (mysql_thread_create(key_thread_binlog, &th, NULL,
+                          binlog_background_thread, NULL))
+    return 1;
+
+  binlog_thread_started= true;
+  return 0;
+}
+
+
 int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
                            IO_CACHE *first_log,
                            Format_description_log_event *fdle)

=== modified file 'sql/log.h'
--- a/sql/log.h	2012-09-13 12:31:29 +0000
+++ b/sql/log.h	2012-09-13 13:51:09 +0000
@@ -395,8 +395,6 @@ class MYSQL_QUERY_LOG: public MYSQL_LOG
 #define BINLOG_COOKIE_IS_DUMMY(c) \
   ( ((ulong)(c)>>1) == BINLOG_COOKIE_DUMMY_ID )
 
-void binlog_checkpoint_callback(void *cookie);
-
 class binlog_cache_mngr;
 class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
 {
@@ -451,27 +449,6 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   };
 
   /*
-    A list of struct xid_count_per_binlog is used to keep track of how many
-    XIDs are in prepared, but not committed, state in each binlog. And how
-    many commit_checkpoint_request()'s are pending.
-
-    When count drops to zero in a binlog after rotation, it means that there
-    are no more XIDs in prepared state, so that binlog is no longer needed
-    for XA crash recovery, and we can log a new binlog checkpoint event.
-
-    The list is protected against simultaneous access from multiple
-    threads by LOCK_xid_list.
-  */
-  struct xid_count_per_binlog : public ilink {
-    char *binlog_name;
-    uint binlog_name_len;
-    ulong binlog_id;
-    /* Total prepared XIDs and pending checkpoint requests in this binlog. */
-    long xid_count;
-    xid_count_per_binlog();   /* Give link error if constructor used. */
-  };
-  I_List<xid_count_per_binlog> binlog_xid_count_list;
-  /*
     When this is set, a RESET MASTER is in progress.
 
     Then we should not write any binlog checkpoints into the binlog (that
@@ -480,7 +457,6 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
     checkpoint arrives - when all have arrived, RESET MASTER will complete.
   */
   bool reset_master_pending;
-  friend void binlog_checkpoint_callback(void *cookie);
 
   /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
   mysql_mutex_t LOCK_index;
@@ -553,10 +529,35 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   int write_transaction_or_stmt(group_commit_entry *entry);
   bool write_transaction_to_binlog_events(group_commit_entry *entry);
   void trx_group_commit_leader(group_commit_entry *leader);
-  void mark_xid_done(ulong cookie, bool write_checkpoint);
-  void mark_xids_active(ulong cookie, uint xid_count);
 
 public:
+  /*
+    A list of struct xid_count_per_binlog is used to keep track of how many
+    XIDs are in prepared, but not committed, state in each binlog. And how
+    many commit_checkpoint_request()'s are pending.
+
+    When count drops to zero in a binlog after rotation, it means that there
+    are no more XIDs in prepared state, so that binlog is no longer needed
+    for XA crash recovery, and we can log a new binlog checkpoint event.
+
+    The list is protected against simultaneous access from multiple
+    threads by LOCK_xid_list.
+  */
+  struct xid_count_per_binlog : public ilink {
+    char *binlog_name;
+    uint binlog_name_len;
+    ulong binlog_id;
+    /* Total prepared XIDs and pending checkpoint requests in this binlog. */
+    long xid_count;
+    /* For linking in requests to the binlog background thread. */
+    xid_count_per_binlog *next_in_queue;
+    xid_count_per_binlog();   /* Give link error if constructor used. */
+  };
+  I_List<xid_count_per_binlog> binlog_xid_count_list;
+  mysql_mutex_t LOCK_binlog_thread;
+  mysql_cond_t COND_binlog_thread;
+  mysql_cond_t COND_binlog_thread_end;
+
   using MYSQL_LOG::generate_name;
   using MYSQL_LOG::is_open;
 
@@ -712,6 +713,8 @@ class MYSQL_BIN_LOG: public TC_LOG, priv
   bool appendv(const char* buf,uint len,...);
   bool append(Log_event* ev);
 
+  void mark_xids_active(ulong cookie, uint xid_count);
+  void mark_xid_done(ulong cookie, bool write_checkpoint);
   void make_log_name(char* buf, const char* log_ident);
   bool is_active(const char* log_file_name);
   bool can_purge_log(const char *log_file_name);

=== modified file 'sql/mysqld.cc'
--- a/sql/mysqld.cc	2012-09-13 12:31:29 +0000
+++ b/sql/mysqld.cc	2012-09-13 13:51:09 +0000
@@ -724,6 +724,7 @@ PSI_mutex_key key_LOCK_des_key_file;
 #endif /* HAVE_OPENSSL */
 
 PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
+  key_BINLOG_LOCK_binlog_thread,
   key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
   key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
   key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
@@ -766,6 +767,7 @@ static PSI_mutex_info all_server_mutexes
 
   { &key_BINLOG_LOCK_index, "MYSQL_BIN_LOG::LOCK_index", 0},
   { &key_BINLOG_LOCK_xid_list, "MYSQL_BIN_LOG::LOCK_xid_list", 0},
+  { &key_BINLOG_LOCK_binlog_thread, "MYSQL_BIN_LOG::LOCK_binlog_thread", 0},
   { &key_RELAYLOG_LOCK_index, "MYSQL_RELAY_LOG::LOCK_index", 0},
   { &key_delayed_insert_mutex, "Delayed_insert::mutex", 0},
   { &key_hash_filo_lock, "hash_filo::lock", 0},
@@ -834,6 +836,7 @@ PSI_cond_key key_PAGE_cond, key_COND_act
 #endif /* HAVE_MMAP */
 
 PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
+  key_BINLOG_COND_binlog_thread, key_BINLOG_COND_binlog_thread_end,
   key_COND_cache_status_changed, key_COND_manager,
   key_COND_rpl_status, key_COND_server_started,
   key_delayed_insert_cond, key_delayed_insert_cond_client,
@@ -863,6 +866,8 @@ static PSI_cond_info all_server_conds[]=
 #endif /* HAVE_MMAP */
   { &key_BINLOG_COND_xid_list, "MYSQL_BIN_LOG::COND_xid_list", 0},
   { &key_BINLOG_update_cond, "MYSQL_BIN_LOG::update_cond", 0},
+  { &key_BINLOG_COND_binlog_thread, "MYSQL_BIN_LOG::COND_binlog_thread", 0},
+  { &key_BINLOG_COND_binlog_thread_end, "MYSQL_BIN_LOG::COND_binlog_thread_end", 0},
   { &key_BINLOG_COND_queue_busy, "MYSQL_BIN_LOG::COND_queue_busy", 0},
   { &key_RELAYLOG_update_cond, "MYSQL_RELAY_LOG::update_cond", 0},
   { &key_RELAYLOG_COND_queue_busy, "MYSQL_RELAY_LOG::COND_queue_busy", 0},

=== modified file 'sql/mysqld.h'
--- a/sql/mysqld.h	2012-09-13 12:31:29 +0000
+++ b/sql/mysqld.h	2012-09-13 13:51:09 +0000
@@ -226,6 +226,7 @@ extern PSI_mutex_key key_LOCK_des_key_fi
 #endif
 
 extern PSI_mutex_key key_BINLOG_LOCK_index, key_BINLOG_LOCK_xid_list,
+  key_BINLOG_LOCK_binlog_thread,
   key_delayed_insert_mutex, key_hash_filo_lock, key_LOCK_active_mi,
   key_LOCK_connection_count, key_LOCK_crypt, key_LOCK_delayed_create,
   key_LOCK_delayed_insert, key_LOCK_delayed_status, key_LOCK_error_log,
@@ -257,6 +258,7 @@ extern PSI_cond_key key_PAGE_cond, key_C
 #endif /* HAVE_MMAP */
 
 extern PSI_cond_key key_BINLOG_COND_xid_list, key_BINLOG_update_cond,
+  key_BINLOG_COND_binlog_thread, key_BINLOG_COND_binlog_thread_end,
   key_COND_cache_status_changed, key_COND_manager,
   key_COND_rpl_status, key_COND_server_started,
   key_delayed_insert_cond, key_delayed_insert_cond_client,

=== modified file 'sql/sql_class.h'
--- a/sql/sql_class.h	2012-09-13 12:31:29 +0000
+++ b/sql/sql_class.h	2012-09-13 13:51:09 +0000
@@ -1244,7 +1244,8 @@ enum enum_thread_type
   SYSTEM_THREAD_SLAVE_SQL= 4,
   SYSTEM_THREAD_NDBCLUSTER_BINLOG= 8,
   SYSTEM_THREAD_EVENT_SCHEDULER= 16,
-  SYSTEM_THREAD_EVENT_WORKER= 32
+  SYSTEM_THREAD_EVENT_WORKER= 32,
+  SYSTEM_THREAD_BINLOG_BACKGROUND= 64
 };
 
 inline char const *

=== modified file 'storage/innobase/handler/ha_innodb.cc'
--- a/storage/innobase/handler/ha_innodb.cc	2012-09-13 12:31:29 +0000
+++ b/storage/innobase/handler/ha_innodb.cc	2012-09-13 13:51:09 +0000
@@ -106,6 +106,7 @@ static ulong commit_threads = 0;
 static mysql_mutex_t commit_threads_m;
 static mysql_cond_t commit_cond;
 static mysql_mutex_t commit_cond_m;
+static mysql_mutex_t pending_checkpoint_mutex;
 static bool innodb_inited = 0;
 
 #define INSIDE_HA_INNOBASE_CC
@@ -222,11 +223,13 @@ static mysql_pfs_key_t	innobase_share_mu
 static mysql_pfs_key_t  commit_threads_m_key;
 static mysql_pfs_key_t  commit_cond_mutex_key;
 static mysql_pfs_key_t  commit_cond_key;
+static mysql_pfs_key_t  pending_checkpoint_mutex_key;
 
 static PSI_mutex_info   all_pthread_mutexes[] = {
         {&commit_threads_m_key, "commit_threads_m", 0},
         {&commit_cond_mutex_key, "commit_cond_mutex", 0},
-        {&innobase_share_mutex_key, "innobase_share_mutex", 0}
+        {&innobase_share_mutex_key, "innobase_share_mutex", 0},
+        {&pending_checkpoint_mutex_key, "pending_checkpoint_mutex", 0}
 };
 
 static PSI_cond_info    all_innodb_conds[] = {
@@ -2592,6 +2595,9 @@ innobase_init(
         mysql_mutex_init(commit_cond_mutex_key,
                          &commit_cond_m, MY_MUTEX_INIT_FAST);
         mysql_cond_init(commit_cond_key, &commit_cond, NULL);
+        mysql_mutex_init(pending_checkpoint_mutex_key,
+                         &pending_checkpoint_mutex,
+                         MY_MUTEX_INIT_FAST);
         innodb_inited= 1;
 #ifdef MYSQL_DYNAMIC_PLUGIN
         if (innobase_hton != p) {
@@ -2639,6 +2645,7 @@ innobase_end(
                 mysql_mutex_destroy(&commit_threads_m);
                 mysql_mutex_destroy(&commit_cond_m);
                 mysql_cond_destroy(&commit_cond);
+                mysql_mutex_destroy(&pending_checkpoint_mutex);
         }
 
         DBUG_RETURN(err);
@@ -3008,6 +3015,16 @@ innobase_rollback_trx(
         DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
 }
 
+
+struct pending_checkpoint {
+        struct pending_checkpoint *next;
+        handlerton *hton;
+        void *cookie;
+        ib_uint64_t lsn;
+};
+static struct pending_checkpoint *pending_checkpoint_list;
+static struct pending_checkpoint *pending_checkpoint_list_end;
+
 /*****************************************************************//**
 Handle a commit checkpoint request from server layer.
 We simply flush the redo log immediately and do the notify call.*/
@@ -3017,8 +3034,113 @@ innobase_checkpoint_request(
         handlerton *hton,
         void *cookie)
 {
-        log_buffer_flush_to_disk();
-        commit_checkpoint_notify_ha(hton, cookie);
+        ib_uint64_t                     lsn;
+        ib_uint64_t                     flush_lsn;
+        struct pending_checkpoint *     entry;
+
+        /* Do the allocation outside of lock to reduce contention. The normal
+        case is that not everything is flushed, so we will need to enqueue. */
+        entry = static_cast<struct pending_checkpoint *>
+                (my_malloc(sizeof(*entry), MYF(MY_WME)));
+        if (!entry) {
+                sql_print_error("Failed to allocate %u bytes."
+                                " Commit checkpoint will be skipped.",
+                                static_cast<unsigned>(sizeof(*entry)));
+                return;
+        }
+
+        entry->next = NULL;
+        entry->hton = hton;
+        entry->cookie = cookie;
+
+        mysql_mutex_lock(&pending_checkpoint_mutex);
+        lsn = log_get_lsn();
+        flush_lsn = log_get_flush_lsn();
+        if (lsn > flush_lsn) {
+                /* Put the request in queue.
+                When the log gets flushed past the lsn, we will remove the
+                entry from the queue and notify the upper layer. */
+                entry->lsn = lsn;
+                if (pending_checkpoint_list_end) {
+                        pending_checkpoint_list_end->next = entry;
+                } else {
+                        pending_checkpoint_list = entry;
+                }
+                pending_checkpoint_list_end = entry;
+                entry = NULL;
+        }
+        mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+        if (entry) {
+                /* We are already flushed. Notify the checkpoint immediately. */
+                commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+                my_free(entry);
+        }
+}
+
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+extern "C" UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+        ib_uint64_t     write_lsn,      /*!< in: LSN written to log file */
+        ib_uint64_t     flush_lsn)      /*!< in: LSN flushed to disk */
+{
+        struct pending_checkpoint *     pending;
+        struct pending_checkpoint *     entry;
+        struct pending_checkpoint *     last_ready;
+
+        /* It is safe to do a quick check for NULL first without lock.
+        Even if we should race, we will at most skip one checkpoint and
+        take the next one, which is harmless. */
+        if (!pending_checkpoint_list)
+                return;
+
+        mysql_mutex_lock(&pending_checkpoint_mutex);
+        pending = pending_checkpoint_list;
+        if (!pending)
+        {
+                mysql_mutex_unlock(&pending_checkpoint_mutex);
+                return;
+        }
+
+        last_ready = NULL;
+        for (entry = pending; entry != NULL; entry = entry -> next)
+        {
+                if (entry->lsn > flush_lsn)
+                        break;
+                last_ready = entry;
+        }
+
+        if (last_ready)
+        {
+                /* We found some pending checkpoints that are now flushed to
+                disk. So remove them from the list. */
+                pending_checkpoint_list = entry;
+                if (!entry)
+                        pending_checkpoint_list_end = NULL;
+        }
+
+        mysql_mutex_unlock(&pending_checkpoint_mutex);
+
+        if (!last_ready)
+                return;
+
+        /* Now that we have released the lock, notify upper layer about all
+        commit checkpoints that have now completed. */
+        for (;;) {
+                entry = pending;
+                pending = pending->next;
+
+                commit_checkpoint_notify_ha(entry->hton, entry->cookie);
+
+                my_free(entry);
+                if (entry == last_ready)
+                        break;
+        }
 }
 
 /*****************************************************************//**

=== modified file 'storage/innobase/include/ha_prototypes.h'
--- a/storage/innobase/include/ha_prototypes.h	2011-04-26 17:55:52 +0000
+++ b/storage/innobase/include/ha_prototypes.h	2012-09-13 13:51:09 +0000
@@ -136,6 +136,17 @@ innobase_mysql_print_thd(
         uint    max_query_len); /*!< in: max query length to print, or 0 to
                                    use the default max length */
 
+/*****************************************************************//**
+Log code calls this whenever log has been written and/or flushed up
+to a new position. We use this to notify upper layer of a new commit
+checkpoint when necessary.*/
+UNIV_INTERN
+void
+innobase_mysql_log_notify(
+/*===============*/
+        ib_uint64_t     write_lsn,      /*!< in: LSN written to log file */
+        ib_uint64_t     flush_lsn);     /*!< in: LSN flushed to disk */
+
 /**************************************************************//**
 Converts a MySQL type to an InnoDB type. Note that this function returns
 the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1

=== modified file 'storage/innobase/include/log0log.h'
--- a/storage/innobase/include/log0log.h	2012-06-07 13:44:26 +0000
+++ b/storage/innobase/include/log0log.h	2012-09-13 13:51:09 +0000
@@ -151,6 +151,13 @@ UNIV_INLINE
 ib_uint64_t
 log_get_lsn(void);
 /*=============*/
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void);
+/*=============*/
 /****************************************************************
 Gets the log group capacity. It is OK to read the value without
 holding log_sys->mutex because it is constant.

=== modified file 'storage/innobase/include/log0log.ic'
--- a/storage/innobase/include/log0log.ic	2011-04-05 07:18:43 +0000
+++ b/storage/innobase/include/log0log.ic	2012-09-13 13:51:09 +0000
@@ -411,6 +411,25 @@ log_get_lsn(void)
         return(lsn);
 }
 
+/************************************************************//**
+Gets the last lsn that is fully flushed to disk.
+@return last flushed lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_flush_lsn(void)
+/*=============*/
+{
+        ib_uint64_t     lsn;
+
+        mutex_enter(&(log_sys->mutex));
+
+        lsn = log_sys->flushed_to_disk_lsn;
+
+        mutex_exit(&(log_sys->mutex));
+
+        return(lsn);
+}
+
 /****************************************************************
 Gets the log group capacity. It is OK to read the value without
 holding log_sys->mutex because it is constant.

=== modified file 'storage/innobase/log/log0log.c'
--- a/storage/innobase/log/log0log.c	2012-03-21 03:48:12 +0000
+++ b/storage/innobase/log/log0log.c	2012-09-13 13:51:09 +0000
@@ -1353,6 +1353,8 @@ log_write_up_to(
         ulint           loop_count      = 0;
 #endif /* UNIV_DEBUG */
         ulint           unlock;
+        ib_uint64_t     write_lsn;
+        ib_uint64_t     flush_lsn;
 
         if (recv_no_ibuf_operations) {
                 /* Recovery is running and no operations on the log files are
@@ -1530,8 +1532,13 @@ log_write_up_to(
 
         log_flush_do_unlocks(unlock);
 
+        write_lsn = log_sys->write_lsn;
+        flush_lsn = log_sys->flushed_to_disk_lsn;
+
         mutex_exit(&(log_sys->mutex));
 
+        innobase_mysql_log_notify(write_lsn, flush_lsn);
+
         return;
 
 do_waits:



More information about the commits mailing list