[Commits] Rev 3571: MDEV-381: fdatasync() does not correctly flush growing binlog file. in http://bazaar.launchpad.net/~maria-captains/maria/5.3

knielsen at knielsen-hq.org knielsen at knielsen-hq.org
Thu Aug 30 11:55:28 EEST 2012


At http://bazaar.launchpad.net/~maria-captains/maria/5.3

------------------------------------------------------------
revno: 3571
revision-id: knielsen at knielsen-hq.org-20120830085349-3fso9kv80rel9lcs
parent: sergii at pisem.net-20120830070527-td90cio50bmntrkh
committer: knielsen at knielsen-hq.org
branch nick: work-5.3-mdev381
timestamp: Thu 2012-08-30 10:53:49 +0200
message:
  MDEV-381: fdatasync() does not correctly flush growing binlog file.
  
  When we append data to the binlog file, we use fdatasync() to ensure
  the data gets to disk so that crash recovery can work.
  
  Unfortunately there seems to be a bug in ext3/ext4 on Linux, so that
  fdatasync() does not correctly sync all data when the size of a file
  is increased. This causes crash recovery to not work correctly (it
  loses transactions from the binlog).
  
  As a work-around, use fsync() for the binlog, not fdatasync(). Since
  we are increasing the file size, a correctly working fdatasync() would
  have to flush the inode metadata as well, so it will most likely not
  be faster than fsync() on any file system, and fsync() does work
  correctly on ext3/ext4. This avoids the need to try to detect if we
  are running on buggy ext3/ext4.
=== modified file 'include/my_sys.h'
--- a/include/my_sys.h	2012-08-24 12:02:32 +0000
+++ b/include/my_sys.h	2012-08-30 08:53:49 +0000
@@ -70,6 +70,7 @@ extern int NEAR my_errno;		/* Last error
 #define MY_THREADSAFE 2048      /* my_seek(): lock fd mutex */
 #define MY_SYNC       4096      /* my_copy(): sync dst file */
 #define MY_SYNC_DIR   32768     /* my_create/delete/rename: sync directory */
+#define MY_SYNC_FILESIZE 65536  /* my_sync(): safe sync when file is extended */
 
 #define MY_CHECK_ERROR  1       /* Params to my_end; Check open-close */
 #define MY_GIVE_INFO    2       /* Give time info about process*/

=== modified file 'mysys/my_sync.c'
--- a/mysys/my_sync.c	2011-12-11 09:34:44 +0000
+++ b/mysys/my_sync.c	2012-08-30 08:53:49 +0000
@@ -39,6 +39,13 @@ ulong my_sync_count;
     (which is correct behaviour, if we know that the other thread synced the
     file before closing)
 
+    MY_SYNC_FILESIZE is useful when syncing a file after it has been extended.
+    On Linux, fdatasync() on ext3/ext4 file systems does not properly flush
+    to disk the inode data required to preserve the added data across a crash
+    (this looks to be a bug). But when a file is extended, inode data will most
+    likely need flushing in any case, so passing MY_SYNC_FILESIZE as flags
+    is not likely to be any slower, and will be crash safe on Linux ext3/ext4.
+
   RETURN
     0 ok
     -1 error
@@ -67,8 +74,12 @@ int my_sync(File fd, myf my_flags)
     DBUG_PRINT("info",("fcntl(F_FULLFSYNC) failed, falling back"));
 #endif
 #if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC
-    res= fdatasync(fd);
-#elif defined(HAVE_FSYNC)
+    if (!(my_flags & MY_SYNC_FILESIZE))
+      res= fdatasync(fd);
+    else
+    {
+#endif
+#if defined(HAVE_FSYNC)
     res= fsync(fd);
     if (res == -1 && errno == ENOLCK)
       res= 0;                                   /* Result Bug in Old FreeBSD */
@@ -78,6 +89,9 @@ int my_sync(File fd, myf my_flags)
 #error Cannot find a way to sync a file, durability in danger
     res= 0;                                     /* No sync (strange OS) */
 #endif
+#if defined(HAVE_FDATASYNC) && HAVE_DECL_FDATASYNC
+    }
+#endif
   } while (res == -1 && errno == EINTR);
 
   if (res)

=== modified file 'sql/log.cc'
--- a/sql/log.cc	2012-08-24 13:39:34 +0000
+++ b/sql/log.cc	2012-08-30 08:53:49 +0000
@@ -2838,7 +2838,7 @@ bool MYSQL_BIN_LOG::open(const char *log
       bytes_written+= description_event_for_queue->data_written;
     }
     if (flush_io_cache(&log_file) ||
-        my_sync(log_file.file, MYF(MY_WME)))
+        my_sync(log_file.file, MYF(MY_WME|MY_SYNC_FILESIZE)))
       goto err;
     pthread_mutex_lock(&LOCK_commit_ordered);
     strmake(last_commit_pos_file, log_file_name,
@@ -2864,7 +2864,7 @@ bool MYSQL_BIN_LOG::open(const char *log
                      strlen(log_file_name)) ||
           my_b_write(&index_file, (uchar*) "\n", 1) ||
           flush_io_cache(&index_file) ||
-          my_sync(index_file.file, MYF(MY_WME)))
+          my_sync(index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE)))
         goto err;
 
 #ifdef HAVE_REPLICATION
@@ -2956,7 +2956,7 @@ static bool copy_up_file_and_fill(IO_CAC
   }
   /* The following will either truncate the file or fill the end with \n' */
   if (my_chsize(file, offset - init_offset, '\n', MYF(MY_WME)) ||
-      my_sync(file, MYF(MY_WME)))
+      my_sync(file, MYF(MY_WME|MY_SYNC_FILESIZE)))
     goto err;
 
   /* Reset data in old index cache */
@@ -3549,7 +3549,7 @@ int MYSQL_BIN_LOG::sync_purge_index_file
   DBUG_ENTER("MYSQL_BIN_LOG::sync_purge_index_file");
 
   if ((error= flush_io_cache(&purge_index_file)) ||
-      (error= my_sync(purge_index_file.file, MYF(MY_WME))))
+      (error= my_sync(purge_index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE))))
     DBUG_RETURN(error);
 
   DBUG_RETURN(error);
@@ -4139,7 +4139,7 @@ bool MYSQL_BIN_LOG::flush_and_sync()
   if (++sync_binlog_counter >= sync_binlog_period && sync_binlog_period)
   {
     sync_binlog_counter= 0;
-    err=my_sync(fd, MYF(MY_WME));
+    err=my_sync(fd, MYF(MY_WME|MY_SYNC_FILESIZE));
 #ifndef DBUG_OFF
     if (opt_binlog_dbug_fsync_sleep > 0)
       my_sleep(opt_binlog_dbug_fsync_sleep);



More information about the commits mailing list