[Commits] 2f2bb86: Issue#16: Point lookup by reverse column family doesn't work

Sergei Petrunia psergey at askmonty.org
Tue Feb 24 00:24:32 EET 2015


revision-id: 2f2bb86c3a88bb5555b8e3ef5ef98f702ec58f49
parent(s): 43c740d78ca08b4be04cb5770dc44f9314793da6
committer: Sergei Petrunia
branch nick: mysql-5.6-rocksdb-issue16-r10
timestamp: 2015-02-24 01:24:32 +0300
message:

Issue#16: Point lookup by reverse column family doesn't work

Summary:
The code that converts MySQL's ha_rkey_function index navigation
commands into RocksDB's iterator commands had several bugs.
Fixed them all, and added documentation.

Test Plan: Run the included .test file

Reviewers: maykov, hermanlee4, yoshinorim, jonahcohen

Subscribers: jtolmer

---
 mysql-test/r/rocksdb_range.result        |  193 +++++++++++++++++
 mysql-test/t/rocksdb_range.test          |   94 +++++++++
 storage/rocksdb/ha_rocksdb.cc            |  258 ++++++++++++++---------
 storage/rocksdb/rocksdb-range-access.txt |  337 ++++++++++++++++++++++++++++++
 4 files changed, 783 insertions(+), 99 deletions(-)

diff --git a/mysql-test/r/rocksdb_range.result b/mysql-test/r/rocksdb_range.result
new file mode 100644
index 0000000..42fd9c8
--- /dev/null
+++ b/mysql-test/r/rocksdb_range.result
@@ -0,0 +1,193 @@
+select * from information_schema.engines where engine = 'rocksdb';
+ENGINE	SUPPORT	COMMENT	TRANSACTIONS	XA	SAVEPOINTS
+ROCKSDB	YES	RocksDB storage engine	YES	YES	NO
+drop table if exists t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+create table t1 (
+pk int not null,
+a  int not null,
+b  int not null,
+primary key(pk),
+key(a) comment 'rev:cf1'
+) engine=rocksdb;
+insert into t1 select A.a, FLOOR(A.a/10), A.a from one_k A;
+create table t10 (
+pk int not null,
+a  int not null,
+b  int not null,
+primary key(pk),
+key(a) comment 'rev:empty-cf'
+) engine=rocksdb;
+#
+# HA_READ_KEY_EXACT tests
+#
+explain select * from t1 force index (a) where a=3 and pk=33;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	const	a	a	8	const,const	10	NULL
+select * from t1 force index (a) where a=3 and pk=33;
+pk	a	b
+33	3	33
+select * from t1 ignore index (a) where a=3 and pk=33;
+pk	a	b
+33	3	33
+# Look at non-existing value
+select * from t1 force index (a) where a=99 and pk=99;
+pk	a	b
+# Look at non-existing value
+select * from t1 force index (a) where a=-1 and pk=99;
+pk	a	b
+select * from t1 force index (a) where a=0;
+pk	a	b
+0	0	0
+1	0	1
+2	0	2
+3	0	3
+4	0	4
+5	0	5
+6	0	6
+7	0	7
+8	0	8
+9	0	9
+select * from t1 ignore index (a) where a=0;
+pk	a	b
+0	0	0
+1	0	1
+2	0	2
+3	0	3
+4	0	4
+5	0	5
+6	0	6
+7	0	7
+8	0	8
+9	0	9
+select * from t10 force index (a) where a=1 and pk=1;
+pk	a	b
+#
+# HA_READ_KEY_OR_NEXT tests
+#
+explain 
+select * from t1 force index (a) where a between 3 and 4;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	4	NULL	10	Using where
+select * from t1 force index (a) where a between 3 and 4;
+pk	a	b
+30	3	30
+31	3	31
+32	3	32
+33	3	33
+34	3	34
+35	3	35
+36	3	36
+37	3	37
+38	3	38
+39	3	39
+40	4	40
+41	4	41
+42	4	42
+43	4	43
+44	4	44
+45	4	45
+46	4	46
+47	4	47
+48	4	48
+49	4	49
+select * from t1 ignore index (a) where a between 3 and 4;
+pk	a	b
+30	3	30
+31	3	31
+32	3	32
+33	3	33
+34	3	34
+35	3	35
+36	3	36
+37	3	37
+38	3	38
+39	3	39
+40	4	40
+41	4	41
+42	4	42
+43	4	43
+44	4	44
+45	4	45
+46	4	46
+47	4	47
+48	4	48
+49	4	49
+explain 
+select * from t1 force index (a) where a=5 and pk between 54 and 57;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	8	NULL	10	Using where
+select * from t1 force index (a) where a=5 and pk between 54 and 57;
+pk	a	b
+54	5	54
+55	5	55
+56	5	56
+57	5	57
+select * from t1 ignore index (a) where a=5 and pk between 54 and 57;
+pk	a	b
+54	5	54
+55	5	55
+56	5	56
+57	5	57
+select * from t10 force index (a) where a>=1;
+pk	a	b
+select * from t10 force index (a) where a=1 and pk>=1;
+pk	a	b
+#
+# HA_READ_AFTER_KEY tests
+#
+explain 
+select * from t1 force index (a) where a>2 and a<4;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	4	NULL	10	Using where
+select * from t1 force index (a) where a>2 and a<4;
+pk	a	b
+30	3	30
+31	3	31
+32	3	32
+33	3	33
+34	3	34
+35	3	35
+36	3	36
+37	3	37
+38	3	38
+39	3	39
+explain 
+select * from t1 force index (a) where a=5 and pk > 54 and pk < 57;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	8	NULL	10	Using where
+select * from t1 force index (a) where a=5 and pk > 54 and pk < 57;
+pk	a	b
+55	5	55
+56	5	56
+select * from t10 force index (a) where a>1;
+pk	a	b
+select * from t10 force index (a) where a=1 and pk>1;
+pk	a	b
+# 
+# HA_READ_BEFORE_KEY test
+# 
+explain 
+select * from t1 force index (a) where a>2 and a<4 order by a desc;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	range	a	a	4	NULL	10	Using where
+select * from t1 force index (a) where a>2 and a<4 order by a desc;
+pk	a	b
+39	3	39
+38	3	38
+37	3	37
+36	3	36
+35	3	35
+34	3	34
+33	3	33
+32	3	32
+31	3	31
+30	3	30
+select * from t10 force index (a) where a>1 order by a desc;
+pk	a	b
+select * from t10 force index (a) where a=1 and pk>1 order by a desc;
+pk	a	b
+drop table t0,t1,t10, one_k;
diff --git a/mysql-test/t/rocksdb_range.test b/mysql-test/t/rocksdb_range.test
new file mode 100644
index 0000000..d11f9cc
--- /dev/null
+++ b/mysql-test/t/rocksdb_range.test
@@ -0,0 +1,94 @@
+#
+# RocksDB Storage Engine tests
+#
+select * from information_schema.engines where engine = 'rocksdb';
+
+--disable_warnings
+drop table if exists t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10;
+--enable_warnings
+create table t0 (a int);
+insert into t0 values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from t0 A, t0 B, t0 C;
+
+create table t1 (
+  pk int not null,
+  a  int not null,
+  b  int not null,
+  primary key(pk),
+  key(a) comment 'rev:cf1'
+) engine=rocksdb;
+
+# 10 pk values for each value of a...
+insert into t1 select A.a, FLOOR(A.a/10), A.a from one_k A;
+
+create table t10 (
+  pk int not null,
+  a  int not null,
+  b  int not null,
+  primary key(pk),
+  key(a) comment 'rev:empty-cf'
+) engine=rocksdb;
+
+
+--echo #
+--echo # HA_READ_KEY_EXACT tests
+--echo #
+explain select * from t1 force index (a) where a=3 and pk=33;
+select * from t1 force index (a) where a=3 and pk=33;
+select * from t1 ignore index (a) where a=3 and pk=33;
+
+--echo # Look at non-existing value
+select * from t1 force index (a) where a=99 and pk=99;
+
+--echo # Look at non-existing value
+select * from t1 force index (a) where a=-1 and pk=99;
+
+select * from t1 force index (a) where a=0;
+select * from t1 ignore index (a) where a=0;
+
+select * from t10 force index (a) where a=1 and pk=1;
+
+--echo #
+--echo # HA_READ_KEY_OR_NEXT tests
+--echo #
+explain 
+select * from t1 force index (a) where a between 3 and 4;
+select * from t1 force index (a) where a between 3 and 4;
+select * from t1 ignore index (a) where a between 3 and 4;
+
+explain 
+select * from t1 force index (a) where a=5 and pk between 54 and 57;
+select * from t1 force index (a) where a=5 and pk between 54 and 57;
+select * from t1 ignore index (a) where a=5 and pk between 54 and 57;
+
+select * from t10 force index (a) where a>=1;
+select * from t10 force index (a) where a=1 and pk>=1;
+
+--echo #
+--echo # HA_READ_AFTER_KEY tests
+--echo #
+explain 
+select * from t1 force index (a) where a>2 and a<4;
+select * from t1 force index (a) where a>2 and a<4;
+
+explain 
+select * from t1 force index (a) where a=5 and pk > 54 and pk < 57;
+select * from t1 force index (a) where a=5 and pk > 54 and pk < 57;
+
+select * from t10 force index (a) where a>1;
+select * from t10 force index (a) where a=1 and pk>1;
+
+--echo # 
+--echo # HA_READ_BEFORE_KEY test
+--echo # 
+explain 
+select * from t1 force index (a) where a>2 and a<4 order by a desc;
+select * from t1 force index (a) where a>2 and a<4 order by a desc;
+
+select * from t10 force index (a) where a>1 order by a desc;
+select * from t10 force index (a) where a=1 and pk>1 order by a desc;
+
+drop table t0,t1,t10, one_k;
+
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 3cb3334..8e2cda0 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -2442,7 +2442,10 @@ int ha_rocksdb::secondary_index_read(int keyno, uchar *buf)
 }
 
 
-//void store_key_image_to_rec(Field *field, uchar *ptr, uint len);
+/*
+   See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
+   index navigation commands are converted into RocksDB lookup commands.
+*/
 
 int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
                                key_part_map keypart_map,
@@ -2451,16 +2454,17 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
   int rc= 0;
   DBUG_ENTER("ha_rocksdb::index_read_map");
 
-  uint key_parts= table->key_info[active_index].user_defined_key_parts;
   RDBSE_KEYDEF *kd= key_descr[active_index];
+  uint actual_key_parts= table->key_info[active_index].actual_key_parts;
+  bool using_full_key= (keypart_map == HA_WHOLE_KEY) ||
+                       (keypart_map == ((key_part_map(1) << actual_key_parts)
+                                         - 1));
 
   stats.rows_requested++;
   io_perf_start();
 
-  if (active_index == table->s->primary_key && 
-      find_flag == HA_READ_KEY_EXACT &&
-      ((keypart_map == ((key_part_map(1) << key_parts) - 1) ||
-        keypart_map == HA_WHOLE_KEY)))
+  if (active_index == table->s->primary_key &&
+      find_flag == HA_READ_KEY_EXACT && using_full_key)
   {
     /* 
       Equality lookup over primary key, using full tuple. 
@@ -2477,17 +2481,17 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
     io_perf_end_and_record();
     DBUG_RETURN(rc);
   }
-
+  //TODO: how does this work with HA_WHOLE_KEY ??
   uint n_used_parts= my_count_bits(keypart_map);
 
   uint packed_size; 
-
   packed_size= kd->pack_index_tuple(table, pack_buffer, 
                                     sec_key_packed_tuple, key, 
                                     keypart_map);
 
   if (find_flag == HA_READ_PREFIX_LAST_OR_PREV || 
-      find_flag == HA_READ_PREFIX_LAST)
+      find_flag == HA_READ_PREFIX_LAST ||
+      find_flag == HA_READ_AFTER_KEY)
   {
     /* See below */
     kd->successor(sec_key_packed_tuple, packed_size);
@@ -2506,12 +2510,26 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
     greater than the lookup tuple.
   */
   setup_index_scan(kd, &slice, use_all_keys, is_ascending(kd, find_flag));
-  bool move_forward= true;
-
 
   switch (find_flag) {
   case HA_READ_KEY_EXACT:
   {
+    /* 
+      We are looking for the first record such that 
+        index_tuple= lookup_tuple.
+      lookup_tuple may be a prefix of the index.
+    */
+    if (key_descr[active_index]->is_reverse_cf)
+    {
+      if (!using_full_key)
+      {
+        if (!scan_it->Valid())
+          scan_it->SeekToLast();
+        else
+          scan_it->Prev();
+      }
+    }
+
     if (!scan_it->Valid())
       rc= HA_ERR_KEY_NOT_FOUND;
     else
@@ -2522,8 +2540,8 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
                              slice.data(), slice.size(), n_used_parts))
       {
         /* 
-          The record we've got is not from this index, or is not equal to the
-          lookup table
+          Got a record that is not equal to the lookup value, or even a record
+          from another table.index.
         */
         rc= HA_ERR_KEY_NOT_FOUND;
       }
@@ -2532,6 +2550,33 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
   }
   case HA_READ_KEY_OR_NEXT:
   {
+    /* 
+      We are looking for the first record such that 
+        index_tuple >= lookup_tuple.
+    */
+    if (key_descr[active_index]->is_reverse_cf)
+    {
+      if (!scan_it->Valid())
+      {
+        scan_it->SeekToLast();
+      }
+      else
+      {
+        /*
+          When using full extended key: we should not step back if 
+          we've found an exact match.
+        */
+        rkey= scan_it->key();
+        if (!using_full_key || 
+            (!kd->covers_key(rkey.data(), rkey.size()) ||
+             kd->cmp_full_keys(rkey.data(), rkey.size(), 
+                               slice.data(), slice.size(), n_used_parts)))
+        {
+          scan_it->Prev();
+        }
+      }
+    }
+
     if (!scan_it->Valid())
       rc= HA_ERR_KEY_NOT_FOUND;
     else
@@ -2545,108 +2590,120 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
     }
     break;
   }
-  case HA_READ_BEFORE_KEY:
+  case HA_READ_AFTER_KEY:
   {
-    move_forward= false;
-    /* We want to read the record that's right *before* the given key.  */
-    if (!scan_it->Valid())
+    /* 
+      We are looking for the first record such that 
+        index_tuple > lookup_tuple.
+    */
+    if (key_descr[active_index]->is_reverse_cf)
     {
-      /*
-        All the values in the database are smaller than our key. Two cases
-         - our index is the last in db. Its last value is a match 
-         - our index has no records (in that case we will get a record from 
-           our index and detect it below)
-      */
-      scan_it->SeekToLast();
+      if (!scan_it->Valid())
+        scan_it->SeekToLast();
+      else
+      {
+        /*
+          We should step back
+           - when not using full extended key
+           - when using full extended key and when we've got an exact match
+        */
+        rkey= scan_it->key();
+        if (!using_full_key ||
+            !(kd->covers_key(rkey.data(), rkey.size()) &&
+              !kd->cmp_full_keys(rkey.data(), rkey.size(), 
+                                 slice.data(), slice.size(), n_used_parts)))
+        {
+          scan_it->Prev();
+        }
+      }
     }
+
+    if (!scan_it->Valid())
+      rc= HA_ERR_KEY_NOT_FOUND;
     else
     {
-      /*
-        RocksDB iterator is positioned at "the first key in the source that 
-        at or past target".
-        We need to step one key back, so that we're at the last key that is
-        before the target.
-        If the passed key is greater than the max. value that is found in the
-        table, then iterator is pointing at the *first* record in subsequent
-        table/index.
-      */
-      scan_it->Prev();
+      rkey= scan_it->key();
+      if (!kd->covers_key(rkey.data(), rkey.size()))
+      {
+        /* The record we've got is not from this index */
+        rc= HA_ERR_KEY_NOT_FOUND;
+      }
     }
-    /* fall through */
+    break;
   }
-  case HA_READ_AFTER_KEY:
+  case HA_READ_KEY_OR_PREV:
   {
-    bool in_key;
-    bool have_row;
-    /* 
-      Walk forward until we've found a record that is not equal to the lookup
-      tuple, but still belongs to this index.
+    rc= HA_ERR_UNSUPPORTED;
+    break;
+  }
+  case HA_READ_BEFORE_KEY:
+  {
+    /*
+      We are looking for the last record such that t.key < lookup_tuple.
     */
-    while ((have_row= scan_it->Valid()))
+    if (key_descr[active_index]->is_reverse_cf)
     {
-      rkey= scan_it->key();
-      if (!(in_key= kd->covers_key(rkey.data(), rkey.size())) ||
-          kd->cmp_full_keys(rkey.data(), rkey.size(), 
-                            slice.data(), slice.size(),
-                            n_used_parts))
-        break;
-
-      if (move_forward) 
-        scan_it->Next();
-      else
+      if (using_full_key && scan_it->Valid() &&
+          (kd->covers_key(rkey.data(), rkey.size()) &&
+            !kd->cmp_full_keys(rkey.data(), rkey.size(), 
+                               slice.data(), slice.size(), n_used_parts)))
+      {
+        /* We are using full key and we've hit an exact match */
         scan_it->Prev();
+      }
     }
-    if (!have_row || !in_key)
-      rc= HA_ERR_END_OF_FILE;
-    break;
-  }
-  case HA_READ_KEY_OR_PREV:
-  {
-    if (!scan_it->Valid())
+    else
     {
-      /*
-        We're after the last value in the database. It could be we needed the
-        last one.
-      */
-      scan_it->SeekToLast();
+      if (scan_it->Valid())
+        scan_it->Prev();
     }
-    /* We should see a key that is less-or-equal than specified */
-    bool in_key;
-    bool have_row;
-    while ((have_row= scan_it->Valid()))
+    //a copy
+    if (!scan_it->Valid())
+      rc= HA_ERR_KEY_NOT_FOUND;
+    else
     {
       rkey= scan_it->key();
-      if (!(in_key= kd->covers_key(rkey.data(), rkey.size())) ||
-           kd->cmp_full_keys(rkey.data(), rkey.size(), 
-                             slice.data(), slice.size(),
-                             n_used_parts) <= 0)
-        break;
-      scan_it->Prev();
+      if (!kd->covers_key(rkey.data(), rkey.size()))
+      {
+        /* The record we've got is not from this index */
+        rc= HA_ERR_KEY_NOT_FOUND;
+      }
     }
-    if (!have_row || !in_key)
-      rc= HA_ERR_END_OF_FILE;
     break;
   }
   case HA_READ_PREFIX_LAST:
   case HA_READ_PREFIX_LAST_OR_PREV:
   {
-    /* 
-      Given a prefix of (VAL1,VAL2), get the last record that has
-      (kp1,kp2)=(VAL1,VAL2).  This cannot be translated directly to RocksDB
-      Iterator command.
-
-      We navigate to (VAL1,VAL2+1) and then step one record back. 
+    /*
+      Find the last record with the specified index prefix lookup_tuple.
+      - HA_READ_PREFIX_LAST requires that the record has the 
+        prefix=lookup_tuple (if there are no such records,
+        HA_ERR_KEY_NOT_FOUND should be returned).
+      - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no 
+        records with prefix=lookup_tuple, we should return the last record
+        before that.
     */
-    if (!scan_it->Valid())
+
+    if (key_descr[active_index]->is_reverse_cf)
     {
-      /*
-        We're after the last value in the database. It could be we needed the
-        last one.
-      */
-      scan_it->SeekToLast();
+      if (using_full_key && scan_it->Valid())
+      {
+        rkey= scan_it->key();
+        if ((kd->covers_key(rkey.data(), rkey.size()) &&
+            !kd->cmp_full_keys(rkey.data(), rkey.size(), 
+                               slice.data(), slice.size(), n_used_parts)))
+        {
+          scan_it->Next();
+        }
+      }
     }
     else
-      scan_it->Prev();
+    {
+      if (!scan_it->Valid())
+        scan_it->SeekToLast();
+      else
+        scan_it->Prev();
+    }
 
     if (scan_it->Valid())
     {
@@ -2658,18 +2715,21 @@ int ha_rocksdb::index_read_map(uchar *buf, const uchar *key,
         rc= HA_ERR_KEY_NOT_FOUND;
       }
       else
-      if (find_flag == HA_READ_PREFIX_LAST) 
       {
-        packed_size= kd->pack_index_tuple(table, pack_buffer,
-                                          sec_key_packed_tuple, key, 
-                                          keypart_map);
         /*
-          Check if the record has the same search prefix.
+          HA_READ_PREFIX_LAST means we should only return records 
+          that match the search prefix.
         */
-        if (kd->cmp_full_keys(rkey.data(), rkey.size(), 
-                              (const char*)sec_key_packed_tuple, packed_size,
-                              n_used_parts))
-          rc= HA_ERR_END_OF_FILE;
+        if (find_flag == HA_READ_PREFIX_LAST) 
+        {
+          packed_size= kd->pack_index_tuple(table, pack_buffer,
+                                            sec_key_packed_tuple, key, 
+                                            keypart_map);
+          if (kd->cmp_full_keys(rkey.data(), rkey.size(), 
+                                (const char*)sec_key_packed_tuple, packed_size,
+                                n_used_parts))
+            rc= HA_ERR_END_OF_FILE;
+        }
       }
     }
     else
@@ -3123,7 +3183,7 @@ int ha_rocksdb::index_first_intern(uchar *buf)
      (n+1)-smth, or no value at all
 
    RocksDB's Iterator::Seek($val) seeks to "at $val or first value that's
-   greater". We can't see to "(n)-ccc" directly, because we don't know what
+   greater". We can't seek to "(n)-ccc" directly, because we don't know what
    is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
    to "(n+1)", which is the least possible value that's greater than any value 
    in index #n. Then we step one record back.
diff --git a/storage/rocksdb/rocksdb-range-access.txt b/storage/rocksdb/rocksdb-range-access.txt
new file mode 100644
index 0000000..ef8f797
--- /dev/null
+++ b/storage/rocksdb/rocksdb-range-access.txt
@@ -0,0 +1,337 @@
+
+This file describes how MySQL index navigation commands are translated into 
+RocksDB index navigation commands.
+
+Index tuples are shown as
+
+  ( kv )-aaa-pkN
+
+here 
+ * '(kv)' is the 4-byte index number. 
+ * '-' is just for readability
+ * everything that follows the '-' is mem-comparable form of the key.
+   In ascii encoding,  aaa < bbb < ccc < xxx.
+
+Tuples that start with '#' do not exist in the database. They are only shown
+to demonstrate where Seek() calls end up with.
+
+== HA_READ_KEY_EXACT, forward CF ==
+
+  (kv-1)-xxx-pk
+# ( kv )-aaa      <-- "kv-aaa" doesn't exist in the database, but it would be
+                       here.
+  ( kv )-aaa-pk   <--- Seek("kv-aaa") will put us here on the next record.
+  ( kv )-aaa-pk2       
+  ( kv )-bbb-...
+
+RocksDB calls:
+
+  it->Seek(kv); 
+  if (it->Valid() && kd->covers_key(..) && kd->cmp_full_keys(...))
+    return record.
+
+== HA_READ_KEY_EXACT, backward CF ==
+
+When we need to seek to a tuple that is a prefix of a full key:
+
+  (kv+1)-xxx-pk
+  ( kv )-ccc-pk 
+  ( kv )-bbb-pk3
+  ( kv )-bbb-pk2
+  ( kv )-bbb-pk1 < -- We need to be here
+# ( kv )-bbb         <---we call Seek(kv-bbb)
+  ( kv )-aaa-pk      ... and end up here.  Should call it->Prev().  
+
+There is a special case when (kv)-bbb-pk1 is the last record in the CF, and 
+we get invalid iterator. Then, we need to call SeekToLast().
+
+Another kind of special case is when we need to seek to the full value. 
+Suppose, the lookup tuple is kv-bbb-pk1:
+
+  (kv+1)-xxx-pk
+  ( kv )-ccc-pk 
+  ( kv )-bbb-pk3
+  ( kv )-bbb-pk2
+  ( kv )-bbb-pk1 < -- Seek(kv-bbb-pk1)
+  ( kv )-bbb-pk0  
+
+Then, Seek(kv-bbb-pk1) will position us exactly the tuple we need, and we
+won't need to call it->Prev().  If we get an invalid iterator, there is no
+need to call SeekToLast().
+
+RocksDB calls:
+
+  it->Seek(tuple);
+
+  if (!using_full_key)
+  {
+    if (!it->Valid())
+      it->SeekToLast();
+    else
+      it->Prev();
+  }
+
+  if (it->Valid() && kd->covers_key(..) && kd->cmp_full_keys(...))
+    return record.
+
+== HA_READ_KEY_OR_NEXT, forward CF ==
+
+This is finding min(key) such that key >= lookup_tuple.
+
+If lookup tuple is kv-bbb:
+
+  ( kv )-aaa-pk
+# ( kv )-bbb      <-- "kv-bbb" doesn't exist in the database, but it would be
+                       here.
+  ( kv )-bbb-pk1  <--- Seek("kv-bbb") will put us here on the next record.
+  ( kv )-bbb-pk2       
+  ( kv )-bbb-...
+
+RocksDB calls:
+
+  Seek(kv);
+  if (it->Valid() && kd->covers_key(..) && kd->cmp_full_keys(...))
+    return record.
+
+== HA_READ_KEY_OR_NEXT, backward CF ==
+
+When specified key tuple is a key prefix:
+
+  (kv+1)-xxx-pk
+  ( kv )-ccc-pk 
+  ( kv )-bbb-pk3
+  ( kv )-bbb-pk2
+  ( kv )-bbb-pk1 < -- We need to be here (or above)
+# ( kv )-bbb         <---we call Seek(kv-bbb)
+  ( kv )-aaa-pk      ... and end up here.  Should call it->Prev().  
+
+There is a special case when (kv)-bbb-pk1 is the last record in the CF, and 
+we get invalid iterator. Then, we need to call SeekToLast().
+
+Another kind of special case is when we need to seek to the full value. 
+Suppose, the lookup tuple is kv-bbb-pk1:
+
+  (kv+1)-xxx-pk
+  ( kv )-ccc-pk 
+  ( kv )-bbb-pk3
+  ( kv )-bbb-pk2
+  ( kv )-bbb-pk1 < -- Seek(kv-bbb-pk1)
+  ( kv )-bbb-pk0  
+
+Then, Seek(kv-bbb-pk1) may position us exactly at the tuple we need, and we
+won't need to call it->Prev().
+If we kv-bbb-pk1 is not present in the database, we will be positioned on
+kv-bbb-pk0, and we will need to call it->Prev().
+If we get an invalid iterator, we DO need to call SeekToLast().
+
+RocksDB calls:
+ 
+  Seek(...);
+
+  if (!it->Valid())
+    it->SeekToLast();
+  else
+  {
+    if (!using_full_key ||
+        !(kd->covers_key(...) || kd->cmp_full_keys(...))
+      it->Prev();
+  }
+
+  if (it->Valid() && kd->covers_key(..))
+    return record.
+
+== HA_READ_AFTER_KEY, forward CF ==
+
+This is finding min(key) such that key > lookup_key.
+
+// TODO: mention in github that scans on "key > lookup_key" were made faster.
+
+Suppose index lookup tuple is kv-bbb
+
+  ( kv )-aaa-pk
+# ( kv )-bbb    
+  ( kv )-bbb-pk1  <--- Seek("kv-bbb") will put us here. We need to 
+  ( kv )-bbb-pk2       get to the value that is next after 'bbb'.
+  ( kv )-bbb-pk3       
+  ( kv )-bbb-pk4       
+  ( kv )-bbb-pk5
+  ( kv )-ccc-pkN  <-- That is, we need to be here.
+
+However, we don't know that the next value is kv-ccc. Instead, we seek to the
+first value that is not 'kv-bbb'. It is Successor(kv-bbb).  
+
+It doesn't matter if we're using a full extended key or not.
+
+RocksDB calls:
+
+  Seek(Succcessor(kv-bbb));
+  if (it->Valid() && kd->covers_key(..))
+    return record.
+
+== HA_READ_AFTER_KEY, backward CF ==
+
+Suppose, the lookup key is 'kv-bbb':
+
+  (kv+1)-xxx-pk
+  ( kv )-ccc-pk7
+  ( kv )-ccc-pk6      <-- We need to be here. 
+#  Successor(kv-bbb)  <-- We get here when we call Seek(Successor(kv-bbb)
+  ( kv )-bbb-pk5          and we will need to call Prev();
+  ( kv )-bbb-pk4
+  ( kv )-bbb-pk3
+  ( kv )-bbb-pk2
+  ( kv )-bbb-pk1
+# ( kv )-bbb     
+  ( kv )-aaa-pk  
+
+It doesn't matter if we're using a full extended key or not.
+
+RocksDB calls:
+
+  Seek(Succcessor(kv-bbb));
+  if (it->Valid())
+    it->Prev();
+
+  if (it->Valid() && kd->covers_key(..))
+    return record.
+
+== HA_READ_KEY_OR_PREV ==
+
+HA_READ_KEY_OR_PREV is only used by HANDLER command. RocksDB-SE doesn't
+support it (but we could do it).
+
+== HA_READ_BEFORE_KEY, forward CF ==
+
+This is finding max(key) such that key < lookup_tuple.
+
+Suppose, lookup_tuple=kv-bbb.
+
+  ( kv )-aaa-pk1
+  ( kv )-aaa-pk2
+  ( kv )-aaa-pk3  <-- Need to be here.
+# ( kv )-bbb     
+  ( kv )-bbb-pk4  <--- Seek("kv-bbb") will put us here. 
+  ( kv )-bbb-pk5        
+  ( kv )-bbb-pk6       
+
+1. Seek(kv-bbb) will put us at kv-bbb-pk4.
+2. We will need to call Prev() to get back. (if there is no record
+   before kv-bbb, then we can't find a record).
+
+It doesn't matter if we're using a full extended key or not.
+
+RocksDB calls:
+
+  Seek(kv-bbb);
+  if (it->Valid())
+    it->Prev();
+
+  if (it->Valid() && kd->covers_key(..))
+    return record.
+
+
+== HA_READ_BEFORE_KEY, backward CF ==
+(TODO: put into the code)
+
+This is finding max(key) such that key < lookup_tuple.
+Suppose, lookup_tuple=kv-bbb, a prefix of the full key.
+
+  ( kv )-bbb-pk6
+  ( kv )-bbb-pk5
+  ( kv )-bbb-pk4
+# ( kv )-bbb     
+  ( kv )-aaa-pk3 <-- Need to be here.  Seek("kv-bbb") will put is
+  ( kv )-aaa-pk2                       right here.
+  ( kv )-aaa-pk1
+
+If the lookup tuple is a full key (e.g. kv-bbb-pk4), and the key is present in
+the database, we will hit the key.  We will need to call Prev() to get the
+previous key.
+
+RocksDB calls:
+
+  Seek(kv-bbb);
+
+  if (it->Valid() && using_full_key &&
+      (kd->covers_key(...) && !kd->cmp_full_keys(...)))
+  {
+    /* We are using full key and we've hit an exact match */
+    it->Prev();
+  }
+
+  if (it->Valid() && kd->covers_key(..))
+    return record;
+
+== HA_READ_PREFIX_LAST, forward CF ==
+
+Find the last record with the specified index prefix lookup_tuple.
+
+Suppose, lookup_tuple='kv-bbb'
+
+  ( kv )-aaa-pk2
+  ( kv )-aaa-pk3
+# ( kv )-bbb
+  ( kv )-bbb-pk4
+  ( kv )-bbb-pk5
+  ( kv )-bbb-pk6
+  ( kv )-bbb-pk7 <--- Need to be here.
+# ( kv )-ccc     
+  ( kv )-ccc-pk8 <-- Seek(Successor(kv-bbb)) will get us here. will need 
+  ( kv )-ccc-pk9                               to call Prev().
+
+RocksDB calls:
+
+  Seek(Successor(kv-bbb));
+  if (!it->Valid())
+    it->SeekToLast();
+  else
+    it->Prev();
+
+  if (it->Valid() && kd->covers_key(..))
+  {
+    if (!cmp_full_keys(lookup_tuple))
+    {
+      // the record's prefix matches lookup_tuple.
+      return record;
+    }
+  }
+  
+== HA_READ_PREFIX_LAST, backward CF ==
+
+Suppose, lookup_tuple='kv-bbb'
+
+  ( kv )-ccc-pk9
+  ( kv )-ccc-pk8
+# ( kv )-ccc     <--  2. Seek(Successor(kv-bbb)) will point here
+                         and it will fall down to the next row.
+  ( kv )-bbb-pk7 <--- 1. Need to be here.
+  ( kv )-bbb-pk6
+  ( kv )-bbb-pk5
+  ( kv )-bbb-pk4
+# ( kv )-bbb
+  ( kv )-aaa-pk3
+  ( kv )-aaa-pk2
+
+
+RocksDB calls: 
+
+  it->Seek(Successor(kv-bbb));
+
+  if (using_full_key && it->Valid() && !cmp_full_keys(Sucessor(lookup_key)))
+    it->Next();
+  
+  if (it->Valid() && kd->covers_key(..))
+  {
+    if (!cmp_full_keys(...))
+    {
+      // the record's prefix matches lookup_tuple.
+      return record;
+    }
+  }
+
+== HA_READ_PREFIX_LAST_OR_PREV, forward or backward CF ==
+
+This is just like HA_READ_PREFIX_LAST but we don't need 
+to check that the key we've got is in the search prefix.
+
+


More information about the commits mailing list