[Commits] 616b4f9: Fix issue #14: Doubling space by primary key

Sergei Petrunia psergey at askmonty.org
Tue Feb 10 00:32:45 EET 2015


revision-id: 616b4f93dbd9ae668ee713ac3583b1320e6fd187
parent(s): 722973086891807cc2a22d165cc63e2644ba62c0
committer: Sergei Petrunia
branch nick: mysql-5.6-rocksdb-issue14-try2
timestamp: 2015-02-10 01:32:45 +0300
message:

Fix issue #14: Doubling space by primary key

Summary:
SQL table records are stored as (pk_keyval -> record_val) key-value
pairs. When PK column datatype allows to decode PK column value from
its mem-comparable form in pk_keyval, we don't need to store the
column in record_val.

Test Plan: ./mtr t/rocks*test && ./mtr --suite=rocksdb && ./mtr --suite=rocksdb_rpl

Reviewers: hermanlee4, jonahcohen, maykov, yoshinorim

Reviewed By: yoshinorim

Differential Revision: https://reviews.facebook.net/D33117

---
 storage/rocksdb/ha_rocksdb.cc  |   55 +++++++++++++++++++++++++++++++++++-----
 storage/rocksdb/ha_rocksdb.h   |    3 +++
 storage/rocksdb/rdb_datadic.cc |    6 +++++
 storage/rocksdb/rdb_datadic.h  |   13 +++++++++-
 4 files changed, 70 insertions(+), 7 deletions(-)

diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index fd9a49b..d98643f 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -691,7 +691,7 @@ public:
 
   /* The following is not needed by RocksDB, but conceptually should be here: */
   static ulong get_hashnr(const char *key, size_t key_len);
-  const char* Name() const { return "RocksDB_SE_v3.1"; }
+  const char* Name() const { return "RocksDB_SE_v3.2"; }
 
   //TODO: advanced funcs:
   // - FindShortestSeparator
@@ -708,7 +708,7 @@ class Reverse_comparator : public rocksdb::Comparator
   {
     return -bytewise_compare(a,b);
   }
-  const char* Name() const { return "rev:RocksDB_SE_v3.1"; }
+  const char* Name() const { return "rev:RocksDB_SE_v3.2"; }
   void FindShortestSeparator(std::string* start, const rocksdb::Slice& limit) const {}
   void FindShortSuccessor(std::string* key) const {}
 };
@@ -1536,6 +1536,10 @@ void ha_rocksdb::convert_record_to_storage_format(rocksdb::Slice *packed_rec)
   
   for (uint i=0; i < table->s->fields; i++)
   {
+    /* Don't pack decode-able PK key parts */
+    if (field_enc[i].skip)
+      continue;
+
     Field *field= table->field[i];
     if (field_enc[i].maybe_null())
     {
@@ -1604,7 +1608,9 @@ int ha_rocksdb::convert_record_from_storage_format(rocksdb::Slice *slice,
 
 /*
   Unpack the record in this->retrieved_record from storage format into 
-  buf (which can be table->record[0] or table->record[1])
+  buf (which can be table->record[0] or table->record[1]).
+
+  psergey-todo: this->last_rowkey has a part of the data.
 
   If the table has blobs, the unpacked data in buf may keep pointers to the
   data in this->retrieved_record.
@@ -1619,12 +1625,26 @@ int ha_rocksdb::convert_record_from_storage_format(uchar * buf)
   Stream_reader reader(retrieved_record);
   my_ptrdiff_t ptr_diff= buf - table->record[0];
   
-  const char *null_bytes;
-  if (!(null_bytes= reader.read(null_bytes_in_rec)))
-    return HA_ERR_INTERNAL_ERROR;
+  /*
+    Decode PK fields from the key
+  */
+  rocksdb::Slice rowkey_slice(last_rowkey.ptr(), last_rowkey.length());
 
+  DBUG_ASSERT(pk_descr->get_unpack_data_len() == 0);
+  rocksdb::Slice pk_unpack_info_slice;
+  if (pk_descr->unpack_record(table, buf, &rowkey_slice, &pk_unpack_info_slice))
+    return HA_ERR_INTERNAL_ERROR;
+  
+  /* Other fields are decoded from the value */
+  const char * UNINIT_VAR(null_bytes);
+  if (null_bytes_in_rec && !(null_bytes= reader.read(null_bytes_in_rec)))
+    return HA_ERR_INTERNAL_ERROR;
+  
   for (uint i=0; i < table->s->fields; i++)
   {
+    if (field_enc[i].skip)
+      continue;
+
     Field *field= table->field[i];
 
     int isNull = field_enc[i].maybe_null() &&
@@ -1745,6 +1765,29 @@ void ha_rocksdb::setup_field_converters()
   for (i= 0; i < table->s->fields; i++)
   {
     Field *field= table->field[i];
+    field_enc[i].skip= false;
+
+    /*
+      Check if this field is 
+      - a part of primary key, and
+      - it can be decoded back from its key image.
+      If both hold, we don't need to store this field in the value part of
+      RocksDB's key-value pair.
+    */
+    if (field->part_of_key.is_set(table->s->primary_key))
+    {
+      KEY *pk_info= &table->key_info[table->s->primary_key];
+      for (uint kp= 0; kp < pk_info->user_defined_key_parts; kp++)
+      {
+        if (field->field_index + 1 == pk_info->key_part[kp].fieldnr)
+        {
+          if (pk_descr->can_unpack(kp))
+            field_enc[i].skip= true; /* Don't store */
+          break;
+        }
+      }
+    }
+
     field_enc[i].field_type= field->real_type();
 
     if (field->real_maybe_null())
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index 73770b0..0019e4e 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -192,6 +192,9 @@ class ha_rocksdb: public handler
   */
   typedef struct st_field_encoder
   {
+    /* skip=true means this is decodeable part of PK and so not stored */
+    bool skip;
+
     uint null_offset;
     uchar null_mask;  // 0 means the field cannot be null
     
diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc
index d40cc69..f46fba6 100644
--- a/storage/rocksdb/rdb_datadic.cc
+++ b/storage/rocksdb/rdb_datadic.cc
@@ -485,6 +485,12 @@ int RDBSE_KEYDEF::unpack_record(TABLE *table, uchar *buf,
   return 0;
 }
 
+
+bool RDBSE_KEYDEF::can_unpack(uint kp) const
+{
+  return (pack_info[kp].unpack_func != NULL);
+}
+
 ///////////////////////////////////////////////////////////////////////////////////////////
 // Field_pack_info
 ///////////////////////////////////////////////////////////////////////////////////////////
diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h
index 6160e16..f8a780e 100644
--- a/storage/rocksdb/rdb_datadic.h
+++ b/storage/rocksdb/rdb_datadic.h
@@ -45,8 +45,11 @@ class Stream_reader
 public:
   Stream_reader(const std::string &str)
   {
-    ptr= &str.at(0);
     len= str.length();
+    if (len)
+      ptr= &str.at(0);
+    else
+      ptr= NULL;
   }
 
   Stream_reader(const rocksdb::Slice *slice)
@@ -247,6 +250,14 @@ public:
 
   rocksdb::ColumnFamilyHandle *get_cf() { return cf_handle; }
   
+  /* Check if keypart #kp can be unpacked from index tuple */
+  bool can_unpack(uint kp) const;
+  
+  /*
+    Current code assumes that unpack_data occupies fixed length regardless of
+    the value that is stored.
+  */
+  bool get_unpack_data_len() { return unpack_data_len; }
 private:
   
   /* Global number of this index (used as prefix in StorageFormat) */


More information about the commits mailing list