[Commits] 2de94d8de7a: Poor optimization of JOIN and ORDER BY LIMIT

Varun varunraiko1803 at gmail.com
Fri Jul 26 21:18:07 EEST 2019


revision-id: 2de94d8de7a48a4a169dae8cd2a915dcd73de22f (mariadb-10.4.4-154-g2de94d8de7a)
parent(s): c51f615bf575480b13ee542fdb3b7fe644e21098
author: Varun Gupta
committer: Varun Gupta
timestamp: 2019-07-26 23:43:04 +0530
message:

Poor optimization of JOIN and ORDER BY LIMIT

Iteration one

---
 mysql-test/main/order_by_limit.result | 200 +++++++++++
 mysql-test/main/order_by_limit.test   |  23 ++
 mysql-test/main/sort_nest.result      | 524 +++++++++++++++++++++++++++
 mysql-test/main/sort_nest.test        | 145 ++++++++
 sql/item.cc                           |  41 ++-
 sql/item.h                            |  40 +++
 sql/item_cmpfunc.cc                   |  16 +
 sql/item_cmpfunc.h                    |   1 +
 sql/item_func.h                       |   8 +
 sql/item_row.h                        |   5 +
 sql/opt_split.cc                      |   1 -
 sql/opt_subselect.cc                  |  30 ++
 sql/opt_subselect.h                   |   1 +
 sql/opt_trace.cc                      |  20 ++
 sql/opt_trace.h                       |   1 +
 sql/sql_class.h                       |  14 +
 sql/sql_const.h                       |   8 +
 sql/sql_select.cc                     | 660 +++++++++++++++++++++++++++++++---
 sql/sql_select.h                      |  40 ++-
 19 files changed, 1729 insertions(+), 49 deletions(-)

diff --git a/mysql-test/main/order_by_limit.result b/mysql-test/main/order_by_limit.result
new file mode 100644
index 00000000000..5199cc88519
--- /dev/null
+++ b/mysql-test/main/order_by_limit.result
@@ -0,0 +1,200 @@
+create table ten(a int);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+create table t1(a int, b int);
+insert into t1 select A.a + B.a* 10, A.a + B.a* 10 from ten A, ten B,ten C;
+create table t2(a int, b int);
+insert into t2(a,b) values (1,1), (2,1);
+insert into t2 select A.a + B.a* 10, -1 from ten A, ten B;
+create table t3(a int, b int);
+insert into t3 select A.a + B.a* 10 + C.a * 100, A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+set optimizer_trace=1;
+the best plan would be having an order nest on (t2,t1)
+explain
+select * from t1,t2,t3 where t1.a=t2.b order by t1.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using temporary; Using filesort
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	1000	Using where; Using join buffer (flat, BNL join)
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	Using join buffer (incremental, BNL join)
+select JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans'))
+[
+    
+    [
+        
+        {
+            "plan_prefix": 
+            [
+            ],
+            "table": "t2",
+            "best_access_path": 
+            {
+                "considered_access_paths": 
+                [
+                    
+                    {
+                        "access_type": "scan",
+                        "resulting_rows": 102,
+                        "cost": 2.2241,
+                        "chosen": true
+                    }
+                ]
+            },
+            "rest_of_plan": 
+            [
+                
+                {
+                    "plan_prefix": 
+                    [
+                        "t2"
+                    ],
+                    "table": "t1",
+                    "best_access_path": 
+                    {
+                        "considered_access_paths": 
+                        [
+                            
+                            {
+                                "access_type": "scan",
+                                "resulting_rows": 1000,
+                                "cost": 4.1973,
+                                "chosen": true
+                            }
+                        ]
+                    },
+                    "rest_of_plan": 
+                    [
+                        
+                        {
+                            "plan_prefix": 
+                            [
+                                "t2",
+                                "t1"
+                            ],
+                            "table": "t3",
+                            "best_access_path": 
+                            {
+                                "considered_access_paths": 
+                                [
+                                    
+                                    {
+                                        "access_type": "scan",
+                                        "resulting_rows": 1000,
+                                        "cost": 33.578,
+                                        "chosen": true
+                                    }
+                                ]
+                            },
+                            "cardinality": 1.02e8,
+                            "cost_of_plan": 1.22e8
+                        },
+                        
+                        {
+                            "plan_prefix": 
+                            [
+                                "t2",
+                                "t1"
+                            ],
+                            "table": "t3",
+                            "order_by_nest": 
+                            [
+                                "t2",
+                                "t1"
+                            ],
+                            "best_access_path": 
+                            {
+                                "considered_access_paths": 
+                                [
+                                    
+                                    {
+                                        "access_type": "scan",
+                                        "resulting_rows": 1000,
+                                        "cost": 428121,
+                                        "chosen": true
+                                    }
+                                ]
+                            },
+                            "cost_of_plan": 2.1e7
+                        }
+                    ]
+                },
+                
+                {
+                    "plan_prefix": 
+                    [
+                        "t2"
+                    ],
+                    "table": "t3",
+                    "best_access_path": 
+                    {
+                        "considered_access_paths": 
+                        [
+                            
+                            {
+                                "access_type": "scan",
+                                "resulting_rows": 1000,
+                                "cost": 4.1973,
+                                "chosen": true
+                            }
+                        ]
+                    },
+                    "pruned_by_heuristic": true
+                }
+            ]
+        },
+        
+        {
+            "plan_prefix": 
+            [
+            ],
+            "table": "t1",
+            "best_access_path": 
+            {
+                "considered_access_paths": 
+                [
+                    
+                    {
+                        "access_type": "scan",
+                        "resulting_rows": 1000,
+                        "cost": 4.1973,
+                        "chosen": true,
+                        "use_tmp_table": true
+                    }
+                ]
+            },
+            "pruned_by_heuristic": true
+        },
+        
+        {
+            "plan_prefix": 
+            [
+            ],
+            "table": "t3",
+            "best_access_path": 
+            {
+                "considered_access_paths": 
+                [
+                    
+                    {
+                        "access_type": "scan",
+                        "resulting_rows": 1000,
+                        "cost": 4.1973,
+                        "chosen": true
+                    }
+                ]
+            },
+            "pruned_by_heuristic": true
+        }
+    ]
+]
+select JSON_DETAILED(JSON_EXTRACT(trace, '$**.order_nest')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+JSON_DETAILED(JSON_EXTRACT(trace, '$**.order_nest'))
+[
+    
+    [
+        "t2",
+        "t1"
+    ]
+]
+drop table t1,t2,t3,ten,one_k;
diff --git a/mysql-test/main/order_by_limit.test b/mysql-test/main/order_by_limit.test
new file mode 100644
index 00000000000..b717e84a1a8
--- /dev/null
+++ b/mysql-test/main/order_by_limit.test
@@ -0,0 +1,23 @@
+create table ten(a int);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+create table t1(a int, b int);
+insert into t1 select A.a + B.a* 10, A.a + B.a* 10 from ten A, ten B,ten C;
+create table t2(a int, b int);
+insert into t2(a,b) values (1,1), (2,1);
+insert into t2 select A.a + B.a* 10, -1 from ten A, ten B;
+create table t3(a int, b int);
+insert into t3 select A.a + B.a* 10 + C.a * 100, A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+set optimizer_trace=1;
+
+--echo the best plan would be having an order nest on (t2,t1)
+
+explain
+select * from t1,t2,t3 where t1.a=t2.b order by t1.b;
+select JSON_DETAILED(JSON_EXTRACT(trace, '$**.considered_execution_plans')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+select JSON_DETAILED(JSON_EXTRACT(trace, '$**.order_nest')) from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+drop table t1,t2,t3,ten,one_k;
diff --git a/mysql-test/main/sort_nest.result b/mysql-test/main/sort_nest.result
new file mode 100644
index 00000000000..f3e10597536
--- /dev/null
+++ b/mysql-test/main/sort_nest.result
@@ -0,0 +1,524 @@
+create table ten(a int);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 from ten A, ten B;
+create table t1 (a int, b int);
+insert into t1 select a,a from one_k;
+create table t2 (a int, b int);
+insert into t2 select a,a from one_k;
+explain
+select t1.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	100	Using filesort
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+select t1.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+b
+0
+1
+analyze
+select t1.a,t2.a from t1,t2 where t1.a=t2.a  order by t2.b limit 10;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	r_rows	filtered	r_filtered	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	100	10.00	100.00	100.00	Using filesort
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	91.00	100.00	1.10	Using where
+select t1.a,t2.a from t1,t2 where t1.a=t2.a  order by t2.b limit 10;
+a	a
+0	0
+1	1
+2	2
+3	3
+4	4
+5	5
+6	6
+7	7
+8	8
+9	9
+alter table t1 add key (a);
+ref should be func
+explain
+select t1.b,t2.b from t1,t2 where t1.a=t2.a+1  order by t2.b limit 2;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	100	Using filesort
+1	SIMPLE	t1	ref	a	a	5	func	1	Using index condition
+select t1.b,t2.b from t1,t2 where t1.a=t2.a+1  order by t2.b limit 2;
+b	b
+1	0
+2	1
+ref should be order-nest.a
+explain
+select t1.b,t2.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	100	Using where; Using filesort
+1	SIMPLE	t1	ref	a	a	5	test.t2.a	1	
+select t1.b,t2.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+b	b
+0	0
+1	1
+drop table t1,t2,ten,one_k;
+create table ten(a int);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+create table t1(a int, b int);
+insert into t1 select A.a + B.a* 10, A.a + B.a* 10 from ten A, ten B;
+create table t2(a int, b int);
+insert into t2(a,b) values (1,1), (2,2);
+insert into t2 select A.a + B.a* 10, A.a+B.a*10 from ten A, ten B;
+create table t3(a int, b int);
+insert into t3 select A.a + B.a* 10 + C.a * 100, A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+create function f1(a int) returns int
+begin
+declare b int default 0;
+return a+b;
+end|
+Covering 3 table joins
+# {t1,t2} part of the nest
+# t1.a > 95 would be attached to table t1
+# t1.b=t2.a would be attached to table t2;
+explain select * from t1,t2,t3 where t1.a > 95 and  t1.a=t2.a and t1.b = t3.a order by t2.b;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using where; Using join buffer (flat, BNL join)
+1	SIMPLE	<order-nest>	ALL	NULL	NULL	NULL	NULL	10200	Using filesort
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	Using where
+explain format=json select * from t1,t2,t3 where t1.a > 95 and  t1.a=t2.a and t1.b = t3.a order by t2.b;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "rows": 100,
+      "filtered": 100,
+      "attached_condition": "t1.a > 95"
+    },
+    "block-nl-join": {
+      "table": {
+        "table_name": "t2",
+        "access_type": "ALL",
+        "rows": 102,
+        "filtered": 100
+      },
+      "buffer_type": "flat",
+      "buffer_size": "1Kb",
+      "join_type": "BNL",
+      "attached_condition": "t2.a = t1.a"
+    },
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "`order-nest`.b",
+        "table": {
+          "table_name": "<order-nest>",
+          "access_type": "ALL",
+          "rows": 10200,
+          "filtered": 100
+        }
+      }
+    },
+    "table": {
+      "table_name": "t3",
+      "access_type": "ALL",
+      "rows": 1000,
+      "filtered": 100,
+      "attached_condition": "t3.a = `order-nest`.b"
+    }
+  }
+}
+select * from t1,t2,t3 where t1.a > 95 and  t1.a=t2.a and t1.b = t3.a order by t2.b;
+a	b	a	b	a	b
+96	96	96	96	96	96
+97	97	97	97	97	97
+98	98	98	98	98	98
+99	99	99	99	99	99
+# {t1,t2} part of the sort nest
+# (t2.a < 2 or t1.b > 98) would be attached to table t2
+explain select * from t1,t2,t3 where (t3.a<2 and  t2.a <2) or (t1.b > 98 and t3.b > 98)
+order by t1.a, t2.b limit 5;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using where; Using join buffer (flat, BNL join)
+1	SIMPLE	<order-nest>	ALL	NULL	NULL	NULL	NULL	10200	Using filesort
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	Using where
+explain format=json select * from t1,t2,t3 where (t3.a<2 and  t2.a <2) or (t1.b > 98 and t3.b > 98)
+order by t1.a, t2.b limit 5;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "rows": 100,
+      "filtered": 100
+    },
+    "block-nl-join": {
+      "table": {
+        "table_name": "t2",
+        "access_type": "ALL",
+        "rows": 102,
+        "filtered": 100
+      },
+      "buffer_type": "flat",
+      "buffer_size": "1Kb",
+      "join_type": "BNL",
+      "attached_condition": "t2.a < 2 or t1.b > 98"
+    },
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "`order-nest`.a, `order-nest`.b",
+        "table": {
+          "table_name": "<order-nest>",
+          "access_type": "ALL",
+          "rows": 10200,
+          "filtered": 100
+        }
+      }
+    },
+    "table": {
+      "table_name": "t3",
+      "access_type": "ALL",
+      "rows": 1000,
+      "filtered": 100,
+      "attached_condition": "t3.a < 2 and `order-nest`.a < 2 or `order-nest`.b > 98 and t3.b > 98"
+    }
+  }
+}
+select * from t1,t2,t3 where (t3.a<2 and  t2.a <2) or (t1.b > 98 and t3.b > 98)
+order by t1.a, t2.b limit 5;
+a	b	a	b	a	b
+0	0	1	1	0	0
+0	0	1	1	1	1
+1	1	1	1	0	0
+1	1	1	1	1	1
+2	2	1	1	0	0
+# {t1,t2} part of the nest
+# t2.a < 2 or f1(t1.b) attached to table t2
+# t1.b=t2.a would be attached to table t2;
+explain select * from t1,t2,t3 where (t3.a<2  and t2.a <2) or (f1(t1.b) > 98 and t3.b > 98)
+order by t1.a,t2.b limit 5;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using join buffer (flat, BNL join)
+1	SIMPLE	<order-nest>	ALL	NULL	NULL	NULL	NULL	10200	Using filesort
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	Using where
+explain format=json select * from t1,t2,t3 where (t3.a<2  and t2.a <2) or (f1(t1.b) > 98 and t3.b > 98)
+order by t1.a,t2.b limit 5;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "rows": 100,
+      "filtered": 100
+    },
+    "block-nl-join": {
+      "table": {
+        "table_name": "t2",
+        "access_type": "ALL",
+        "rows": 102,
+        "filtered": 100
+      },
+      "buffer_type": "flat",
+      "buffer_size": "1Kb",
+      "join_type": "BNL"
+    },
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "`order-nest`.a, `order-nest`.b",
+        "table": {
+          "table_name": "<order-nest>",
+          "access_type": "ALL",
+          "rows": 10200,
+          "filtered": 100
+        }
+      }
+    },
+    "table": {
+      "table_name": "t3",
+      "access_type": "ALL",
+      "rows": 1000,
+      "filtered": 100,
+      "attached_condition": "t3.a < 2 and `order-nest`.a < 2 or f1(`order-nest`.b) > 98 and t3.b > 98"
+    }
+  }
+}
+select * from t1,t2,t3 where (t3.a<2  and t2.a <2) or (f1(t1.b) > 98 and t3.b > 98)
+order by t1.a,t2.b limit 5;
+a	b	a	b	a	b
+0	0	0	0	0	0
+0	0	0	0	1	1
+0	0	1	1	0	0
+0	0	1	1	1	1
+0	0	1	1	0	0
+#
+# Removing constant from the order by clause
+#
+explain select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by t2.a limit 4;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where; Using filesort
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using where
+explain format=json select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by t2.a limit 4;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "t2.a",
+        "table": {
+          "table_name": "t1",
+          "access_type": "ALL",
+          "rows": 100,
+          "filtered": 100,
+          "attached_condition": "t1.a > 95"
+        }
+      }
+    },
+    "table": {
+      "table_name": "t2",
+      "access_type": "ALL",
+      "rows": 102,
+      "filtered": 100,
+      "attached_condition": "t2.a = t1.a"
+    }
+  }
+}
+select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by t2.a limit 4;
+a	b	a	b
+96	96	96	96
+97	97	97	97
+98	98	98	98
+99	99	99	99
+explain select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by 1+2,t2.a limit 4;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	Using where; Using filesort
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using where
+explain format=json select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by 1+2,t2.a limit 4;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "t2.a",
+        "table": {
+          "table_name": "t1",
+          "access_type": "ALL",
+          "rows": 100,
+          "filtered": 100,
+          "attached_condition": "t1.a > 95"
+        }
+      }
+    },
+    "table": {
+      "table_name": "t2",
+      "access_type": "ALL",
+      "rows": 102,
+      "filtered": 100,
+      "attached_condition": "t2.a = t1.a"
+    }
+  }
+}
+select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by 1+2,t2.a limit 4;
+a	b	a	b
+96	96	96	96
+97	97	97	97
+98	98	98	98
+99	99	99	99
+#
+# Equality propagation, both the queries should use a sort nest on {t1,t2}
+#
+explain select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t1.b desc, t2.a desc limit 3;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using join buffer (flat, BNL join)
+1	SIMPLE	<order-nest>	ALL	NULL	NULL	NULL	NULL	10200	Using filesort
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	Using where
+explain format=json select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t1.b desc, t2.a desc limit 3;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "rows": 100,
+      "filtered": 100
+    },
+    "block-nl-join": {
+      "table": {
+        "table_name": "t2",
+        "access_type": "ALL",
+        "rows": 102,
+        "filtered": 100
+      },
+      "buffer_type": "flat",
+      "buffer_size": "1Kb",
+      "join_type": "BNL"
+    },
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "`order-nest`.b desc, `order-nest`.a desc",
+        "table": {
+          "table_name": "<order-nest>",
+          "access_type": "ALL",
+          "rows": 10200,
+          "filtered": 100
+        }
+      }
+    },
+    "table": {
+      "table_name": "t3",
+      "access_type": "ALL",
+      "rows": 1000,
+      "filtered": 100,
+      "attached_condition": "t3.b = `order-nest`.b"
+    }
+  }
+}
+select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t1.b desc, t2.a desc limit 3;
+b	a	b	a
+99	99	99	99
+99	98	99	99
+99	97	99	99
+explain select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b desc, t2.a desc limit 3;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using join buffer (flat, BNL join)
+1	SIMPLE	<order-nest>	ALL	NULL	NULL	NULL	NULL	10200	Using filesort
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	Using where
+select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+QUERY	TRACE	MISSING_BYTES_BEYOND_MAX_MEM_SIZE	INSUFFICIENT_PRIVILEGES
+explain format=json select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b desc, t2.a desc limit 3;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "rows": 100,
+      "filtered": 100
+    },
+    "block-nl-join": {
+      "table": {
+        "table_name": "t2",
+        "access_type": "ALL",
+        "rows": 102,
+        "filtered": 100
+      },
+      "buffer_type": "flat",
+      "buffer_size": "1Kb",
+      "join_type": "BNL"
+    },
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "`order-nest`.b desc, `order-nest`.a desc",
+        "table": {
+          "table_name": "<order-nest>",
+          "access_type": "ALL",
+          "rows": 10200,
+          "filtered": 100
+        }
+      }
+    },
+    "table": {
+      "table_name": "t3",
+      "access_type": "ALL",
+      "rows": 1000,
+      "filtered": 100,
+      "attached_condition": "t3.b = `order-nest`.b"
+    }
+  }
+}
+select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b desc, t2.a desc limit 3;
+b	a	b	a
+99	99	99	99
+99	98	99	99
+99	97	99	99
+#
+# Equality propagation also for arguments of expressions, 
+# the plan should use a sort nest on {t1,t2}
+#
+explain select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b+1 desc, t2.a desc limit 3;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
+1	SIMPLE	t1	ALL	NULL	NULL	NULL	NULL	100	
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	Using join buffer (flat, BNL join)
+1	SIMPLE	<order-nest>	ALL	NULL	NULL	NULL	NULL	10200	Using filesort
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	Using where
+explain format=json select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b+1 desc, t2.a desc limit 3;
+EXPLAIN
+{
+  "query_block": {
+    "select_id": 1,
+    "table": {
+      "table_name": "t1",
+      "access_type": "ALL",
+      "rows": 100,
+      "filtered": 100
+    },
+    "block-nl-join": {
+      "table": {
+        "table_name": "t2",
+        "access_type": "ALL",
+        "rows": 102,
+        "filtered": 100
+      },
+      "buffer_type": "flat",
+      "buffer_size": "1Kb",
+      "join_type": "BNL"
+    },
+    "read_sorted_file": {
+      "filesort": {
+        "sort_key": "tmp_field desc, `order-nest`.a desc",
+        "table": {
+          "table_name": "<order-nest>",
+          "access_type": "ALL",
+          "rows": 10200,
+          "filtered": 100
+        }
+      }
+    },
+    "table": {
+      "table_name": "t3",
+      "access_type": "ALL",
+      "rows": 1000,
+      "filtered": 100,
+      "attached_condition": "t3.b = `order-nest`.b"
+    }
+  }
+}
+select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b+1 desc, t2.a desc limit 3;
+b	a	b	a
+99	99	99	99
+99	98	99	99
+99	97	99	99
+#
+# Rows for the sort-nest should be the cardinality of the join of inner tables
+# of the sort-nest
+#
+# Rows for sort nest would be 9894 here
+alter table t1 add key(a);
+explain extended select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.a > 5 and t1.b=t3.b
+order by t1.b desc, t2.a desc limit 3;
+id	select_type	table	type	possible_keys	key	key_len	ref	rows	filtered	Extra
+1	SIMPLE	t1	ALL	a	NULL	NULL	NULL	100	97.00	Using where
+1	SIMPLE	t2	ALL	NULL	NULL	NULL	NULL	102	100.00	Using join buffer (flat, BNL join)
+1	SIMPLE	<order-nest>	ALL	NULL	NULL	NULL	NULL	9894	100.00	Using filesort
+1	SIMPLE	t3	ALL	NULL	NULL	NULL	NULL	1000	100.00	Using where
+Warnings:
+Note	1003	select `test`.`t3`.`b` AS `b`,`order-nest`.`a` AS `a`,`order-nest`.`b` AS `b`,`order-nest`.`a` AS `a` from `test`.`t1` join `test`.`t2` join `test`.`t3` where `test`.`t3`.`b` = `order-nest`.`b` order by `order-nest`.`b` desc,`order-nest`.`a` desc limit 3
+alter table t1 drop key a;
+drop table t1,t2,t3,ten,one_k;
+drop function f1;
diff --git a/mysql-test/main/sort_nest.test b/mysql-test/main/sort_nest.test
new file mode 100644
index 00000000000..729a6cd9eaa
--- /dev/null
+++ b/mysql-test/main/sort_nest.test
@@ -0,0 +1,145 @@
+create table ten(a int);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 from ten A, ten B;
+create table t1 (a int, b int);
+insert into t1 select a,a from one_k;
+
+create table t2 (a int, b int);
+insert into t2 select a,a from one_k;
+explain
+select t1.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+select t1.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+
+analyze
+select t1.a,t2.a from t1,t2 where t1.a=t2.a  order by t2.b limit 10;
+select t1.a,t2.a from t1,t2 where t1.a=t2.a  order by t2.b limit 10;
+
+alter table t1 add key (a);
+
+--echo ref should be func
+explain
+select t1.b,t2.b from t1,t2 where t1.a=t2.a+1  order by t2.b limit 2;
+select t1.b,t2.b from t1,t2 where t1.a=t2.a+1  order by t2.b limit 2;
+
+--echo ref should be order-nest.a
+explain
+select t1.b,t2.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+select t1.b,t2.b from t1,t2 where t1.a=t2.a  order by t2.b limit 2;
+
+drop table t1,t2,ten,one_k;
+
+create table ten(a int);
+insert into ten values (0),(1),(2),(3),(4),(5),(6),(7),(8),(9);
+
+create table one_k(a int);
+insert into one_k select A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+create table t1(a int, b int);
+insert into t1 select A.a + B.a* 10, A.a + B.a* 10 from ten A, ten B;
+create table t2(a int, b int);
+insert into t2(a,b) values (1,1), (2,2);
+insert into t2 select A.a + B.a* 10, A.a+B.a*10 from ten A, ten B;
+create table t3(a int, b int);
+insert into t3 select A.a + B.a* 10 + C.a * 100, A.a + B.a* 10 + C.a * 100 from ten A, ten B, ten C;
+
+delimiter |;
+create function f1(a int) returns int
+begin
+  declare b int default 0;
+  return a+b;
+end|
+delimiter ;|
+
+--echo Covering 3 table joins
+
+--echo # {t1,t2} part of the nest
+--echo # t1.a > 95 would be attached to table t1
+--echo # t1.b=t2.a would be attached to table t2;
+
+let $query= 
+select * from t1,t2,t3 where t1.a > 95 and  t1.a=t2.a and t1.b = t3.a order by t2.b;
+eval explain $query;
+eval explain format=json $query;
+eval $query;
+
+--echo # {t1,t2} part of the sort nest
+--echo # (t2.a < 2 or t1.b > 98) would be attached to table t2
+
+let $query= 
+select * from t1,t2,t3 where (t3.a<2 and  t2.a <2) or (t1.b > 98 and t3.b > 98)
+order by t1.a, t2.b limit 5;
+eval explain $query;
+eval explain format=json $query;
+eval $query;
+
+--echo # {t1,t2} part of the nest
+--echo # t2.a < 2 or f1(t1.b) attached to table t2
+--echo # t1.b=t2.a would be attached to table t2;
+
+let $query= 
+select * from t1,t2,t3 where (t3.a<2  and t2.a <2) or (f1(t1.b) > 98 and t3.b > 98)
+order by t1.a,t2.b limit 5;
+eval explain $query;
+eval explain format=json $query;
+eval $query;
+
+--echo #
+--echo # Removing constant from the order by clause
+--echo #
+
+let $query=
+select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by t2.a limit 4;
+eval explain $query;
+eval explain format=json $query;
+eval $query;
+
+let $query=
+select * from t1,t2 where t1.a > 95  and t1.a=t2.a order by 1+2,t2.a limit 4;
+eval explain $query;
+eval explain format=json $query;
+eval $query;
+
+
+--echo #
+--echo # Equality propagation, both the queries should use a sort nest on {t1,t2}
+--echo #
+
+let $query=select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t1.b desc, t2.a desc limit 3;
+eval explain $query;
+eval explain format=json $query;
+eval $query;
+
+let $query=select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b desc, t2.a desc limit 3;
+
+eval explain $query;
+select * from INFORMATION_SCHEMA.OPTIMIZER_TRACE;
+eval explain format=json $query;
+eval $query;
+
+--echo #
+--echo # Equality propagation also for arguments of expressions, 
+--echo # the plan should use a sort nest on {t1,t2}
+--echo #
+
+let $query=select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.b=t3.b
+order by t3.b+1 desc, t2.a desc limit 3;
+eval explain $query;
+eval explain format=json $query;
+eval $query;
+
+--echo #
+--echo # Rows for the sort-nest should be the cardinality of the join of inner tables
+--echo # of the sort-nest
+--echo #
+
+--echo # Rows for sort nest would be 9894 here
+alter table t1 add key(a);
+let $query=select t3.b, t2.a , t1.b , t1.a  from t1,t2,t3 where t1.a > 5 and t1.b=t3.b
+order by t1.b desc, t2.a desc limit 3;
+eval explain extended $query;
+alter table t1 drop key a;
+drop table t1,t2,t3,ten,one_k;
+drop function f1;
diff --git a/sql/item.cc b/sql/item.cc
index 84fd4dc6fb7..cf71a31be10 100644
--- a/sql/item.cc
+++ b/sql/item.cc
@@ -6158,6 +6158,28 @@ Item *Item_field::replace_equal_field(THD *thd, uchar *arg)
 }
 
 
+Item *Item_field::replace_with_nest_items(THD *thd, uchar *arg)
+{
+  REPLACE_NEST_FIELD_ARG* param= (REPLACE_NEST_FIELD_ARG*)arg;
+  JOIN *join= param->join;
+  SORT_NEST_INFO *sort_nest_info= join->sort_nest_info;
+  if (!(used_tables() & sort_nest_info->nest_tables_map))
+    return this;
+
+  List_iterator_fast<Item> li(sort_nest_info->nest_base_table_cols);
+  uint index= 0;
+  Item *item;
+  while((item= li++))
+  {
+    Item *field_item= item->real_item();
+    if (field->eq(((Item_field*)field_item)->field))
+      return sort_nest_info->nest_temp_table_cols.elem(index);
+    index++;
+  }
+  return this;
+}
+
+
 void Item::init_make_send_field(Send_field *tmp_field,
                                 const Type_handler *h)
 {
@@ -9061,11 +9083,17 @@ Item *Item_direct_view_ref::replace_equal_field(THD *thd, uchar *arg)
 
 bool Item_field::excl_dep_on_table(table_map tab_map)
 {
-  return used_tables() == tab_map ||
+  return !(used_tables() & ~tab_map) ||
          (item_equal && (item_equal->used_tables() & tab_map));
 }
 
 
+bool Item_field::excl_dep_on_nest(table_map tab_map)
+{
+  return !(used_tables() & ~tab_map);
+}
+
+
 bool
 Item_field::excl_dep_on_grouping_fields(st_select_lex *sel)
 {
@@ -9089,6 +9117,17 @@ bool Item_direct_view_ref::excl_dep_on_table(table_map tab_map)
 }
 
 
+bool Item_direct_view_ref::excl_dep_on_nest(table_map tab_map)
+{
+  table_map used= used_tables();
+  if (used & OUTER_REF_TABLE_BIT)
+    return false;
+  if (!(used & ~tab_map))
+    return true;
+  return (*ref)->excl_dep_on_nest(tab_map);
+}
+
+
 bool Item_direct_view_ref::excl_dep_on_grouping_fields(st_select_lex *sel)
 {
   if (item_equal)
diff --git a/sql/item.h b/sql/item.h
index 3b889b08e7c..8c4d07ca5b8 100644
--- a/sql/item.h
+++ b/sql/item.h
@@ -445,6 +445,11 @@ typedef struct replace_equal_field_arg
   struct st_join_table *context_tab;
 } REPLACE_EQUAL_FIELD_ARG;
 
+typedef struct replace_nest_field_arg
+{
+  JOIN *join;
+} REPLACE_NEST_FIELD_ARG;
+
 class Settable_routine_parameter
 {
 public:
@@ -1887,6 +1892,12 @@ class Item: public Value_source,
     Not to be used for AND/OR formulas.
   */
   virtual bool excl_dep_on_table(table_map tab_map) { return false; }
+
+  /*
+    TRUE if the expression depends only on the table indicated by tab_map
+    Not to be used for AND/OR formulas.
+  */
+  virtual bool excl_dep_on_nest(table_map tab_map) { return false; }
   /*
     TRUE if the expression depends only on grouping fields of sel
     or can be converted to such an expression using equalities.
@@ -2109,6 +2120,8 @@ class Item: public Value_source,
   { return this; }
   virtual Item *multiple_equality_transformer(THD *thd, uchar *arg)
   { return this; }
+  virtual Item *replace_with_nest_items(THD *thd, uchar *arg)
+  { return this; }
   virtual bool expr_cache_is_needed(THD *) { return FALSE; }
   virtual Item *safe_charset_converter(THD *thd, CHARSET_INFO *tocs);
   bool needs_charset_converter(uint32 length, CHARSET_INFO *tocs) const
@@ -2316,6 +2329,10 @@ class Item: public Value_source,
   {
     return excl_dep_on_table(*((table_map *)arg));
   }
+  bool pushable_cond_checker_for_nest(uchar *arg)
+  {
+    return excl_dep_on_nest(*((table_map *)arg));
+  }
   bool pushable_cond_checker_for_subquery(uchar *arg)
   {
     return excl_dep_on_in_subq_left_part((Item_in_subselect *)arg);
@@ -2511,6 +2528,17 @@ class Item_args
     }
     return true;
   }
+  bool excl_dep_on_nest(table_map tab_map)
+  {
+    for (uint i= 0; i < arg_count; i++)
+    {
+      if (args[i]->const_item())
+        continue;
+      if (!args[i]->excl_dep_on_nest(tab_map))
+        return false;
+    }
+    return true;
+  }
   bool excl_dep_on_grouping_fields(st_select_lex *sel);
   bool eq(const Item_args *other, bool binary_cmp) const
   {
@@ -3451,6 +3479,7 @@ class Item_field :public Item_ident,
   Item *in_subq_field_transformer_for_having(THD *thd, uchar *arg);
   virtual void print(String *str, enum_query_type query_type);
   bool excl_dep_on_table(table_map tab_map);
+  bool excl_dep_on_nest(table_map tab_map);
   bool excl_dep_on_grouping_fields(st_select_lex *sel);
   bool excl_dep_on_in_subq_left_part(Item_in_subselect *subq_pred);
   bool cleanup_excluding_fields_processor(void *arg)
@@ -3471,6 +3500,7 @@ class Item_field :public Item_ident,
     return field->get_geometry_type();
   }
   bool check_index_dependence(void *arg);
+  Item *replace_with_nest_items(THD *thd, uchar *arg);
   friend class Item_default_value;
   friend class Item_insert_value;
   friend class st_select_lex_unit;
@@ -5306,6 +5336,15 @@ class Item_ref :public Item_ident,
       return false;
     return (used == tab_map) || (*ref)->excl_dep_on_table(tab_map);
   }
+
+  bool excl_dep_on_nest(table_map tab_map)
+  {
+    table_map used= used_tables();
+    if (used & OUTER_REF_TABLE_BIT)
+      return false;
+    return (!(used & ~tab_map) || (*ref)->excl_dep_on_nest(tab_map));
+  }
+
   bool excl_dep_on_grouping_fields(st_select_lex *sel)
   { return (*ref)->excl_dep_on_grouping_fields(sel); }
   bool excl_dep_on_in_subq_left_part(Item_in_subselect *subq_pred)
@@ -5631,6 +5670,7 @@ class Item_direct_view_ref :public Item_direct_ref
     return 0;
   }
   bool excl_dep_on_table(table_map tab_map);
+  bool excl_dep_on_nest(table_map tab_map);
   bool excl_dep_on_grouping_fields(st_select_lex *sel);
   bool excl_dep_on_in_subq_left_part(Item_in_subselect *subq_pred);
   Item *derived_field_transformer_for_having(THD *thd, uchar *arg);
diff --git a/sql/item_cmpfunc.cc b/sql/item_cmpfunc.cc
index 02fc7719fbc..48434f66f64 100644
--- a/sql/item_cmpfunc.cc
+++ b/sql/item_cmpfunc.cc
@@ -5225,6 +5225,22 @@ bool Item_cond::excl_dep_on_table(table_map tab_map)
   return true;
 }
 
+bool Item_cond::excl_dep_on_nest(table_map tab_map)
+{
+  if (used_tables() & OUTER_REF_TABLE_BIT)
+    return false;
+  if (!(used_tables() & ~tab_map))
+    return true;
+  List_iterator_fast<Item> li(list);
+  Item *item;
+  while ((item= li++))
+  {
+    if (!item->excl_dep_on_nest(tab_map))
+      return false;
+  }
+  return true;
+}
+
 
 bool Item_cond::excl_dep_on_grouping_fields(st_select_lex *sel)
 {
diff --git a/sql/item_cmpfunc.h b/sql/item_cmpfunc.h
index 2af7ebdf231..b992ee1b676 100644
--- a/sql/item_cmpfunc.h
+++ b/sql/item_cmpfunc.h
@@ -3011,6 +3011,7 @@ class Item_cond :public Item_bool_func
   bool eval_not_null_tables(void *opt_arg);
   Item *build_clone(THD *thd);
   bool excl_dep_on_table(table_map tab_map);
+  bool excl_dep_on_nest(table_map tab_map);
   bool excl_dep_on_grouping_fields(st_select_lex *sel);
 };
 
diff --git a/sql/item_func.h b/sql/item_func.h
index 610adb4bb46..350fb937957 100644
--- a/sql/item_func.h
+++ b/sql/item_func.h
@@ -340,6 +340,14 @@ class Item_func :public Item_func_or_sum,
            Item_args::excl_dep_on_table(tab_map);
   }
 
+  bool excl_dep_on_nest(table_map tab_map)
+  {
+    if (used_tables() & OUTER_REF_TABLE_BIT)
+      return false;
+    return !(used_tables() & ~tab_map) ||
+           Item_args::excl_dep_on_nest(tab_map);
+  }
+
   bool excl_dep_on_grouping_fields(st_select_lex *sel)
   {
     if (has_rand_bit() || with_subquery())
diff --git a/sql/item_row.h b/sql/item_row.h
index ea5a0f21d8b..4698dcd8150 100644
--- a/sql/item_row.h
+++ b/sql/item_row.h
@@ -140,6 +140,11 @@ class Item_row: public Item_fixed_hybrid,
     return Item_args::excl_dep_on_table(tab_map);
   }
 
+  bool excl_dep_on_nest(table_map tab_map)
+  {
+    return Item_args::excl_dep_on_nest(tab_map);
+  }
+
   bool excl_dep_on_grouping_fields(st_select_lex *sel)
   {
     return Item_args::excl_dep_on_grouping_fields(sel);
diff --git a/sql/opt_split.cc b/sql/opt_split.cc
index cfac0c93544..0dc8006029b 100644
--- a/sql/opt_split.cc
+++ b/sql/opt_split.cc
@@ -651,7 +651,6 @@ add_ext_keyuses_for_splitting_field(Dynamic_array<KEYUSE_EXT> *ext_keyuses,
   @brief
     Cost of the post join operation used in specification of splittable table
 */
-
 static
 double spl_postjoin_oper_cost(THD *thd, double join_record_count, uint rec_len)
 {
diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc
index 02d221d13b0..ef1f6f2c6ff 100644
--- a/sql/opt_subselect.cc
+++ b/sql/opt_subselect.cc
@@ -4777,6 +4777,11 @@ int setup_semijoin_loosescan(JOIN *join)
   for (i= join->const_tables ; i < join->top_join_tab_count; )
   {
     JOIN_TAB *tab=join->join_tab + i;
+    if (tab->is_sort_nest)
+    {
+      i++;
+      continue;
+    }
     switch (pos->sj_strategy) {
       case SJ_OPT_MATERIALIZE:
       case SJ_OPT_MATERIALIZE_SCAN:
@@ -5517,6 +5522,31 @@ enum_nested_loop_state join_tab_execution_startup(JOIN_TAB *tab)
       sjm->materialized= TRUE;
     }
   }
+  /*
+    This should be the place to add the sub_select call for the
+    prefix of the join order
+  */
+
+  else if (tab->is_sort_nest)
+  {
+    enum_nested_loop_state rc;
+    JOIN *join= tab->join;
+    SORT_NEST_INFO *nest_info= join->sort_nest_info;
+
+    if (!nest_info->materialized)
+    {
+      JOIN_TAB *join_tab= join->join_tab + join->const_tables;
+      JOIN_TAB *save_return_tab= join->return_tab;
+      if ((rc= sub_select(join, join_tab, FALSE)) < 0 ||
+          (rc= sub_select(join, join_tab, TRUE)) < 0)
+      {
+        join->return_tab= save_return_tab;
+        DBUG_RETURN(rc);
+      }
+      join->return_tab= save_return_tab;
+      nest_info->materialized= TRUE;
+    }
+  }
 
   DBUG_RETURN(NESTED_LOOP_OK);
 }
diff --git a/sql/opt_subselect.h b/sql/opt_subselect.h
index 65131f6bc89..dad63c5d1db 100644
--- a/sql/opt_subselect.h
+++ b/sql/opt_subselect.h
@@ -298,6 +298,7 @@ class Loose_scan_opt
       pos->loosescan_picker.loosescan_key=   best_loose_scan_key;
       pos->loosescan_picker.loosescan_parts= best_max_loose_keypart + 1;
       pos->use_join_buffer= FALSE;
+      pos->sort_nest_operation_here= FALSE;
       pos->table=           tab;
       pos->range_rowid_filter_info= tab->range_rowid_filter_info;
       // todo need ref_depend_map ?
diff --git a/sql/opt_trace.cc b/sql/opt_trace.cc
index befc7934a3a..ca75b697ec3 100644
--- a/sql/opt_trace.cc
+++ b/sql/opt_trace.cc
@@ -606,6 +606,13 @@ void Json_writer::add_table_name(const JOIN_TAB *tab)
                            ctab->emb_sj_nest->sj_subq_pred->get_identifier());
       add_str(table_name_buffer, len);
     }
+    else if (tab->is_sort_nest)
+    {
+      size_t len= my_snprintf(table_name_buffer,
+                           sizeof(table_name_buffer)-1,
+                           "<order-nest>");
+      add_str(table_name_buffer, len);
+    }
     else
     {
       TABLE_LIST *real_table= tab->table->pos_in_table_list;
@@ -630,6 +637,19 @@ void add_table_scan_values_to_trace(THD *thd, JOIN_TAB *tab)
   table_rec.add("rows", tab->found_records)
            .add("cost", tab->read_time);
 }
+
+void add_sort_nest_tables_to_trace(JOIN *join)
+{
+  JOIN_TAB *end_tab, *tab;
+  THD *thd= join->thd;
+  SORT_NEST_INFO *sort_nest_info= join->sort_nest_info;
+  end_tab= sort_nest_info->nest_tab;
+  Json_writer_object trace_wrapper(thd);
+  Json_writer_array sort_nest(thd, "sort_nest");
+  for (tab= join->join_tab + join->const_tables; tab < end_tab; tab++)
+    sort_nest.add_table_name(tab);
+}
+
 /*
   Introduce enum_query_type flags parameter, maybe also allow
   EXPLAIN also use this function.
diff --git a/sql/opt_trace.h b/sql/opt_trace.h
index 52318bc6b7f..b2dd3dff924 100644
--- a/sql/opt_trace.h
+++ b/sql/opt_trace.h
@@ -105,6 +105,7 @@ void opt_trace_print_expanded_query(THD *thd, SELECT_LEX *select_lex,
                                     Json_writer_object *trace_object);
 
 void add_table_scan_values_to_trace(THD *thd, JOIN_TAB *tab);
+void add_sort_nest_tables_to_trace(JOIN *join);
 
 /*
   Security related (need to add a proper comment here)
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 63f964e96ce..73f97ba727b 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -6037,6 +6037,20 @@ class SJ_MATERIALIZATION_INFO : public Sql_alloc
   Copy_field *copy_field; /* Needed for SJ_Materialization scan */
 };
 
+class SORT_NEST_INFO : public Sql_alloc
+{
+public:
+  TMP_TABLE_PARAM tmp_table_param;
+  List<Item> nest_base_table_cols;
+  List<Item> nest_temp_table_cols;
+  TABLE *table;
+  st_join_table *nest_tab;
+  uint n_tables;
+  bool materialized; /* TRUE <=> materialization already performed */
+  table_map nest_tables_map;
+  Item *nest_cond;
+};
+
 
 /* Structs used when sorting */
 struct SORT_FIELD_ATTR
diff --git a/sql/sql_const.h b/sql/sql_const.h
index 7aa4249f5ad..96779e4dbeb 100644
--- a/sql/sql_const.h
+++ b/sql/sql_const.h
@@ -296,6 +296,14 @@
 */
 #define MAX_TIME_ZONE_NAME_LENGTH       (NAME_LEN + 1)
 
+
+/**
+  Average record length is the number of bytes for the record, it is just a rough guess, needs
+  this to calculate cost of filling and reading the temp table
+
+*/
+#define AVG_REC_LEN  50
+
 #if defined(__WIN__)
 
 #define INTERRUPT_PRIOR -2
diff --git a/sql/sql_select.cc b/sql/sql_select.cc
index e97abf3d981..2346a600c6f 100644
--- a/sql/sql_select.cc
+++ b/sql/sql_select.cc
@@ -120,8 +120,12 @@ static bool best_extension_by_limited_search(JOIN *join,
                                              uint idx, double record_count,
                                              double read_time, uint depth,
                                              uint prune_level,
-                                             uint use_cond_selectivity);
+                                             uint use_cond_selectivity,
+                                             table_map previous_tables,
+                                             bool nest_created,
+                                             double *cardinality);
 void trace_plan_prefix(JOIN *join, uint idx, table_map remaining_tables);
+void trace_order_by_nest(JOIN *join, uint idx, table_map remaining_tables);
 static uint determine_search_depth(JOIN* join);
 C_MODE_START
 static int join_tab_cmp(const void *dummy, const void* ptr1, const void* ptr2);
@@ -189,6 +193,8 @@ static enum_nested_loop_state
 end_update(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
 static enum_nested_loop_state
 end_unique_update(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
+enum_nested_loop_state
+end_nest_materialization(JOIN *join, JOIN_TAB *join_tab, bool end_of_records);
 
 static int join_read_const_table(THD *thd, JOIN_TAB *tab, POSITION *pos);
 static int join_read_system(JOIN_TAB *tab);
@@ -304,6 +310,11 @@ void set_postjoin_aggr_write_func(JOIN_TAB *tab);
 
 static Item **get_sargable_cond(JOIN *join, TABLE *table);
 
+void substitute_base_to_nest_items(JOIN *join);
+void substitute_base_to_nest_items2(JOIN *join, Item **cond);
+void check_cond_extraction_for_nest(THD *thd, Item *cond,
+                                    Pushdown_checker checker, uchar* arg);
+
 #ifndef DBUG_OFF
 
 /*
@@ -2463,6 +2474,10 @@ int JOIN::optimize_stage2()
   if (setup_semijoin_loosescan(this))
     DBUG_RETURN(1);
 
+  if (setup_sort_nest(this))
+    DBUG_RETURN(1);
+  substitute_base_to_nest_items(this);
+
   if (make_join_select(this, select, conds))
   {
     zero_result_cause=
@@ -2474,22 +2489,9 @@ int JOIN::optimize_stage2()
   error= -1;					/* if goto err */
 
   /* Optimize distinct away if possible */
-  {
-    ORDER *org_order= order;
-    order=remove_const(this, order,conds,1, &simple_order);
-    if (unlikely(thd->is_error()))
-    {
-      error= 1;
-      DBUG_RETURN(1);
-    }
+  if (remove_const_from_order_by())
+    DBUG_RETURN(TRUE);
 
-    /*
-      If we are using ORDER BY NULL or ORDER BY const_expression,
-      return result in any order (even if we are using a GROUP BY)
-    */
-    if (!order && org_order)
-      skip_sort_order= 1;
-  }
   /*
      Check if we can optimize away GROUP BY/DISTINCT.
      We can do that if there are no aggregate functions, the
@@ -2710,7 +2712,9 @@ int JOIN::optimize_stage2()
     Yet the current implementation of FORCE INDEX hints does not
     allow us to do it in a clean manner.
   */
-  no_jbuf_after= 1 ? table_count : make_join_orderinfo(this);
+  no_jbuf_after= 1 ? (sort_nest_info ? const_tables + sort_nest_info->n_tables
+                                      : table_count)
+                   : make_join_orderinfo(this);
 
   // Don't use join buffering when we use MATCH
   select_opts_for_readinfo=
@@ -3583,7 +3587,9 @@ bool JOIN::make_aggr_tables_info()
         curr_tab->type != JT_EQ_REF) // Don't sort 1 row
     {
       // Sort either first non-const table or the last tmp table
-      JOIN_TAB *sort_tab= curr_tab;
+      JOIN_TAB *sort_tab= sort_nest_info ?
+                          sort_nest_info->nest_tab :
+                          curr_tab;
 
       if (add_sorting_to_table(sort_tab, order_arg))
         DBUG_RETURN(true);
@@ -4421,7 +4427,7 @@ JOIN::destroy()
                                          WITH_CONST_TABLES);
          tab; tab= next_linear_tab(this, tab, WITH_BUSH_ROOTS))
     {
-      if (tab->aggr)
+      if (tab->aggr || tab->is_sort_nest)
       {
         free_tmp_table(thd, tab->table);
         delete tab->tmp_table_param;
@@ -4755,6 +4761,156 @@ static Item **get_sargable_cond(JOIN *join, TABLE *table)
   return retval;
 }
 
+void substitute_base_to_nest_items(JOIN *join)
+{
+  if (!join->sort_nest_needed())
+    return;
+  SORT_NEST_INFO *sort_nest_info= join->sort_nest_info;
+  REPLACE_NEST_FIELD_ARG arg= {join};
+
+  List_iterator<Item> it(join->fields_list);
+  Item *item, *new_item;
+  while ((item= it++))
+  {
+    if ((new_item= item->transform(join->thd,
+                                   &Item::replace_with_nest_items,
+                                  (uchar *) &arg)) != item)
+      it.replace(new_item);
+  }
+  JOIN_TAB *end_tab= sort_nest_info->nest_tab;
+  uint i, j;
+  for (i= join->const_tables + sort_nest_info->n_tables, j=0;
+       i < join->top_join_tab_count; i++, j++)
+  {
+    JOIN_TAB *tab= end_tab + j;
+    if (tab->type == JT_REF || tab->type == JT_EQ_REF ||
+        tab->type == JT_REF_OR_NULL)
+    {
+      for (uint keypart= 0; keypart < tab->ref.key_parts; keypart++)
+      {
+        item= tab->ref.items[keypart]->transform(join->thd,
+                                                 &Item::replace_with_nest_items,
+                                                 (uchar *) &arg);
+        if (item != tab->ref.items[keypart])
+        {
+          tab->ref.items[keypart]= item;
+          Item *real_item= item->real_item();
+          store_key *key_copy= tab->ref.key_copy[keypart];
+          if (key_copy->type() == store_key::FIELD_STORE_KEY)
+          {
+            store_key_field *field_copy= ((store_key_field *)key_copy);
+            DBUG_ASSERT(real_item->type() == Item::FIELD_ITEM);
+            field_copy->change_source_field((Item_field *) real_item);
+          }
+        }
+      }
+    }
+  }
+  substitute_base_to_nest_items2(join, &join->conds);
+}
+
+void substitute_base_to_nest_items2(JOIN *join, Item **cond)
+{
+  SORT_NEST_INFO *sort_nest_info= join->sort_nest_info;
+  Item *orig_cond= *cond;
+  if (!sort_nest_info)
+    return;
+  THD *thd= join->thd;
+  Item *extracted_cond;
+  SELECT_LEX* sl= join->select_lex;
+
+  /*
+    check_cond_extraction_for_nest would set NO_EXTRACTION_FL for
+    all the items that cannot be added to the inner tables of the nest
+  */
+  check_cond_extraction_for_nest(thd, orig_cond,
+                                 &Item::pushable_cond_checker_for_nest,
+                                 (uchar *)(&sort_nest_info->nest_tables_map));
+  /*
+    build_cond_for_grouping_fields would create the entire
+    condition that would be added to the tables inside the nest.
+    This may clone some items too.
+  */
+  extracted_cond= sl->build_cond_for_grouping_fields(thd, orig_cond, TRUE);
+
+  if (extracted_cond)
+  {
+    if (extracted_cond->fix_fields_if_needed(thd, 0))
+      return;
+    /*
+      Remove from the WHERE clause all the conditions that were added
+      to the inner tables of the sort nest
+    */
+    orig_cond= remove_pushed_top_conjuncts(thd, orig_cond);
+    sort_nest_info->nest_cond= extracted_cond;
+  }
+
+  REPLACE_NEST_FIELD_ARG arg= {join};
+  if (orig_cond)
+  {
+    orig_cond= orig_cond->transform(join->thd, &Item::replace_with_nest_items,
+                                    (uchar *) &arg);
+    orig_cond->update_used_tables();
+  }
+  *cond= orig_cond;
+}
+
+/*
+  Add a transformer to this call so that we dont have both
+  check_cond_extraction_for_nest and check_cond_extraction_for_grouping_fields
+*/
+
+void
+check_cond_extraction_for_nest(THD *thd, Item *cond,
+                               Pushdown_checker checker, uchar* arg)
+{
+  if (cond->get_extraction_flag() == NO_EXTRACTION_FL)
+    return;
+  cond->clear_extraction_flag();
+  if (cond->type() == Item::COND_ITEM)
+  {
+    Item_cond_and *and_cond=
+      (((Item_cond*) cond)->functype() == Item_func::COND_AND_FUNC) ?
+      ((Item_cond_and*) cond) : 0;
+
+    List<Item> *arg_list=  ((Item_cond*) cond)->argument_list();
+    List_iterator<Item> li(*arg_list);
+    uint count= 0;         // to count items not containing NO_EXTRACTION_FL
+    uint count_full= 0;    // to count items with FULL_EXTRACTION_FL
+    Item *item;
+    while ((item=li++))
+    {
+      check_cond_extraction_for_nest(thd, item, checker, arg);
+      if (item->get_extraction_flag() !=  NO_EXTRACTION_FL)
+      {
+        count++;
+        if (item->get_extraction_flag() == FULL_EXTRACTION_FL)
+          count_full++;
+      }
+      else if (!and_cond)
+        break;
+    }
+    if ((and_cond && count == 0) || item)
+      cond->set_extraction_flag(NO_EXTRACTION_FL);
+    if (count_full == arg_list->elements)
+    {
+      cond->set_extraction_flag(FULL_EXTRACTION_FL);
+    }
+    if (cond->get_extraction_flag() != 0)
+    {
+      li.rewind();
+      while ((item=li++))
+        item->clear_extraction_flag();
+    }
+  }
+  else
+  {
+    int fl= (cond->*checker)(arg) ?
+      FULL_EXTRACTION_FL : NO_EXTRACTION_FL;
+    cond->set_extraction_flag(fl);
+  }
+}
+
 
 /**
   Calculate the best possible join and initialize the join structure.
@@ -5210,6 +5366,7 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
   join->sort_by_table= get_sort_by_table(join->order, join->group_list,
                                          join->select_lex->leaf_tables,
                                          join->const_table_map);
+
   /* 
     Update info on indexes that can be used for search lookups as
     reading const tables may has added new sargable predicates. 
@@ -5444,12 +5601,22 @@ make_join_statistics(JOIN *join, List<TABLE_LIST> &tables_list,
   if (pull_out_semijoin_tables(join))
     DBUG_RETURN(TRUE);
 
-  join->join_tab=stat;
   join->top_join_tab_count= table_count;
-  join->map2table=stat_ref;
-  join->table= table_vector;
   join->const_tables=const_count;
   join->found_const_table_map=found_const_table_map;
+  /*
+    Here a call is made to remove the constant from the order by clause,
+    this call would only remove the basic constants. This is done for
+    the ORDER BY LIMIT optimization.
+  */
+  if (join->remove_const_from_order_by())
+    DBUG_RETURN(TRUE);
+
+  (void)propagate_equal_field_for_orderby(join, join->order);
+
+  join->join_tab=stat;
+  join->map2table=stat_ref;
+  join->table= table_vector;
 
   if (join->const_tables != join->table_count)
     optimize_keyuse(join, keyuse_array);
@@ -7075,6 +7242,7 @@ void set_position(JOIN *join,uint idx,JOIN_TAB *table,KEYUSE *key)
   join->positions[idx].sj_strategy= SJ_OPT_NONE;
   join->positions[idx].use_join_buffer= FALSE;
   join->positions[idx].range_rowid_filter_info= 0;
+  join->positions[idx].sort_nest_operation_here= FALSE;
 
   /* Move the const table as down as possible in best_ref */
   JOIN_TAB **pos=join->best_ref+idx+1;
@@ -7922,6 +8090,7 @@ best_access_path(JOIN      *join,
   pos->use_join_buffer= best_uses_jbuf;
   pos->spl_plan= spl_plan;
   pos->range_rowid_filter_info= best_filter;
+  pos->sort_nest_operation_here= FALSE;
    
   loose_scan_opt.save_to_position(s, loose_scan_pos);
 
@@ -8135,6 +8304,7 @@ choose_plan(JOIN *join, table_map join_tables)
                       use_cond_selectivity))
       DBUG_RETURN(TRUE);
   }
+  trace_plan.end();
 
   /* 
     Store the cost of this query into a user variable
@@ -8440,6 +8610,7 @@ optimize_straight_join(JOIN *join, table_map join_tables)
       pushdown_cond_selectivity= table_cond_selectivity(join, idx, s,
                                                         join_tables);
     position->cond_selectivity= pushdown_cond_selectivity;
+    record_count= record_count*pushdown_cond_selectivity;
     ++idx;
   }
 
@@ -8552,6 +8723,7 @@ greedy_search(JOIN      *join,
   JOIN_TAB  *best_table; // the next plan node to be added to the curr QEP
   // ==join->tables or # tables in the sj-mat nest we're optimizing
   uint      n_tables __attribute__((unused));
+  double cardinality= DBL_MAX;
   DBUG_ENTER("greedy_search");
 
   /* number of tables that remain to be optimized */
@@ -8565,9 +8737,11 @@ greedy_search(JOIN      *join,
   do {
     /* Find the extension of the current QEP with the lowest cost */
     join->best_read= DBL_MAX;
+    table_map previous_tables= 0;
     if (best_extension_by_limited_search(join, remaining_tables, idx, record_count,
                                          read_time, search_depth, prune_level,
-                                         use_cond_selectivity))
+                                         use_cond_selectivity,
+                                         previous_tables, FALSE, &cardinality))
       DBUG_RETURN(TRUE);
     /*
       'best_read < DBL_MAX' means that optimizer managed to find
@@ -9166,6 +9340,19 @@ void trace_plan_prefix(JOIN *join, uint idx, table_map remaining_tables)
   }
 }
 
+void trace_order_by_nest(JOIN *join, uint idx, table_map remaining_tables)
+{
+  THD *const thd= join->thd;
+  Json_writer_array plan_prefix(thd, "order_by_nest");
+  for (uint i= 0; i < idx; i++)
+  {
+    TABLE *tr= join->positions[i].table->table;
+    if (tr->map & remaining_tables)
+      plan_prefix.add_table_name(join->positions[i].table);
+  }
+
+}
+
 /**
   Find a good, possibly optimal, query execution plan (QEP) by a possibly
   exhaustive search.
@@ -9293,7 +9480,10 @@ best_extension_by_limited_search(JOIN      *join,
                                  double    read_time,
                                  uint      search_depth,
                                  uint      prune_level,
-                                 uint      use_cond_selectivity)
+                                 uint      use_cond_selectivity,
+                                 table_map previous_tables,
+                                 bool nest_created,
+                                 double *cardinality)
 {
   DBUG_ENTER("best_extension_by_limited_search");
 
@@ -9319,7 +9509,16 @@ best_extension_by_limited_search(JOIN      *join,
   JOIN_TAB *s;
   double best_record_count= DBL_MAX;
   double best_read_time=    DBL_MAX;
-  bool disable_jbuf= join->thd->variables.join_cache_level == 0;
+  bool disable_jbuf= (join->thd->variables.join_cache_level == 0) || nest_created;
+  double fraction_output;
+
+  if (nest_created)
+  {
+    fraction_output= join->select_limit < (*cardinality) ?
+                     (join->select_limit/(*cardinality)) : 1.0;
+  }
+  else
+    fraction_output= 1.0;
 
   DBUG_EXECUTE("opt", print_plan(join, idx, record_count, read_time, read_time,
                                 "part_plan"););
@@ -9348,6 +9547,8 @@ best_extension_by_limited_search(JOIN      *join,
       {
         trace_plan_prefix(join, idx, remaining_tables);
         trace_one_table.add_table_name(s);
+        if (nest_created)
+          trace_order_by_nest(join, idx, previous_tables);
       }
 
       /* Find the best access method from 's' to the current partial plan */
@@ -9361,10 +9562,12 @@ best_extension_by_limited_search(JOIN      *join,
         ? position->range_rowid_filter_info->get_cmp_gain(current_record_count)
         : 0;
       current_read_time=COST_ADD(read_time,
-                                 COST_ADD(position->read_time -
-                                          filter_cmp_gain,
-                                          current_record_count /
-                                          (double) TIME_FOR_COMPARE));
+                                 COST_MULT(
+                                    COST_ADD(position->read_time -
+                                             filter_cmp_gain,
+                                             current_record_count /
+                                             (double) TIME_FOR_COMPARE), fraction_output));
+      current_record_count= COST_MULT(current_record_count, fraction_output);
 
       advance_sj_state(join, remaining_tables, idx, &current_record_count,
                        &current_read_time, &loose_scan_pos);
@@ -9427,9 +9630,12 @@ best_extension_by_limited_search(JOIN      *join,
       double partial_join_cardinality= current_record_count *
                                         pushdown_cond_selectivity;
       if ( (search_depth > 1) && (remaining_tables & ~real_table_bit) & allowed_tables )
-      { /* Recursively expand the current partial plan */
+      {
+        /* Recursively expand the current partial plan */
         swap_variables(JOIN_TAB*, join->best_ref[idx], *pos);
         Json_writer_array trace_rest(thd, "rest_of_plan");
+        bool nest_allow= (join->cur_sj_inner_tables == 0 &&
+                          join->cur_embedding_map == 0);
         if (best_extension_by_limited_search(join,
                                              remaining_tables & ~real_table_bit,
                                              idx + 1,
@@ -9437,24 +9643,58 @@ best_extension_by_limited_search(JOIN      *join,
                                              current_read_time,
                                              search_depth - 1,
                                              prune_level,
-                                             use_cond_selectivity))
+                                             use_cond_selectivity,
+                                             nest_created ? previous_tables :
+                                             previous_tables | real_table_bit,
+                                             nest_created, cardinality))
           DBUG_RETURN(TRUE);
+
+        if (!nest_created && !join->emb_sjm_nest && nest_allow && !join->need_order_nest() &&
+            check_join_prefix_contains_ordering(join, s, previous_tables))
+        {
+          join->positions[idx].sort_nest_operation_here= TRUE;
+          double cost= postjoin_oper_cost(thd, partial_join_cardinality, AVG_REC_LEN, idx);
+          current_read_time= COST_ADD(current_read_time, cost);
+          if (best_extension_by_limited_search(join,
+                                               remaining_tables & ~real_table_bit,
+                                               idx + 1,
+                                               partial_join_cardinality,
+                                               current_read_time,
+                                               search_depth - 1,
+                                               0,
+                                               use_cond_selectivity,
+                                               previous_tables | real_table_bit,
+                                               TRUE, cardinality))
+            DBUG_RETURN(TRUE);
+          join->positions[idx].sort_nest_operation_here= FALSE;
+        }
         swap_variables(JOIN_TAB*, join->best_ref[idx], *pos);
       }
       else
-      { /*
+      {
+        /*
           'join' is either the best partial QEP with 'search_depth' relations,
           or the best complete QEP so far, whichever is smaller.
         */
         if (join->sort_by_table &&
             join->sort_by_table !=
-            join->positions[join->const_tables].table->table)
+            join->positions[join->const_tables].table->table
+            && !nest_created)
+        {
           /*
              We may have to make a temp table, note that this is only a
              heuristic since we cannot know for sure at this point.
              Hence it may be wrong.
           */
-          current_read_time= COST_ADD(current_read_time, current_record_count);
+          double cost= postjoin_oper_cost(thd, partial_join_cardinality, AVG_REC_LEN, idx);
+          current_read_time= COST_ADD(current_read_time, cost);
+        }
+        if (!nest_created)
+        {
+          *cardinality= partial_join_cardinality;
+          trace_one_table.add("cardinality", partial_join_cardinality);
+        }
+        trace_one_table.add("cost_of_plan", current_read_time);
         if (current_read_time < join->best_read)
         {
           memcpy((uchar*) join->best_positions, (uchar*) join->positions,
@@ -10145,7 +10385,7 @@ bool JOIN::get_best_combination()
   JOIN_TAB_RANGE *root_range;
   if (!(root_range= new (thd->mem_root) JOIN_TAB_RANGE))
     DBUG_RETURN(TRUE);
-   root_range->start= join_tab;
+  root_range->start= join_tab;
   /* root_range->end will be set later */
   join_tab_ranges.empty();
 
@@ -10220,7 +10460,7 @@ bool JOIN::get_best_combination()
       j->type=JT_ALL;
       if (best_positions[tablenr].use_join_buffer &&
           tablenr != const_tables)
-	full_join= 1;
+      full_join= 1;
     }
 
     /*if (best_positions[tablenr].sj_strategy == SJ_OPT_LOOSE_SCAN)
@@ -10253,12 +10493,45 @@ bool JOIN::get_best_combination()
       sjm_nest_root= NULL;
       sjm_nest_end= NULL;
     }
+    if (cur_pos->sort_nest_operation_here)
+    {
+      /*
+        Ok, we've entered an ORDERING nest
+        1. Put into main join order a JOIN_TAB that represents a scan
+           in the temptable.
+      */
+      JOIN_TAB *prev= j;
+      if ((prev - (join_tab + const_tables) > 0))
+      {
+        j= j+1;
+        bzero((void*)j, sizeof(JOIN_TAB));
+
+        j->join= this;
+        j->table= NULL; //temporary way to tell SJM tables from others.
+        j->ref.key = -1;
+        j->on_expr_ref= (Item**) &null_ptr;
+        j->is_sort_nest= TRUE;
+        uint tables= prev - (join_tab + const_tables)+1;
+        j->records_read= calculate_record_count_for_sort_nest(this, tables);
+        j->records= (ha_rows) j->records_read;
+        j->cond_selectivity= 1.0;
+      }
+      SORT_NEST_INFO *sort_nest_info;
+      if (!(sort_nest_info= new SORT_NEST_INFO()))
+        return TRUE;
+      sort_nest_info->n_tables= prev - (join_tab + const_tables)+1;
+      sort_nest_info->nest_tab= j;
+      this->sort_nest_info= sort_nest_info;
+      DBUG_ASSERT(sort_nest_info->n_tables != 0);
+    }
   }
   root_range->end= j;
 
   used_tables= OUTER_REF_TABLE_BIT;		// Outer row is already read
   for (j=join_tab, tablenr=0 ; tablenr < table_count ; tablenr++,j++)
   {
+    if (j->is_sort_nest)
+      j++;
     if (j->bush_children)
       j= j->bush_children->start;
 
@@ -10952,6 +11225,8 @@ make_outerjoin_info(JOIN *join)
        tab; 
        tab= next_linear_tab(join, tab, WITH_BUSH_ROOTS))
   {
+    if (tab->is_sort_nest)
+      continue;
     TABLE *table= tab->table;
     TABLE_LIST *tbl= table->pos_in_table_list;
     TABLE_LIST *embedding= tbl->embedding;
@@ -11127,10 +11402,20 @@ make_join_select(JOIN *join,SQL_SELECT *select,COND *cond)
     JOIN_TAB *tab;
     table_map current_map;
     i= join->const_tables;
+    Item *saved_cond= cond;
+    SORT_NEST_INFO *sort_nest_info= join->sort_nest_info;
+    if (join->sort_nest_needed())
+      cond= sort_nest_info->nest_cond;
+
     for (tab= first_depth_first_tab(join); tab;
          tab= next_depth_first_tab(join, tab))
     {
       bool is_hj;
+      if (tab->is_sort_nest)
+      {
+        cond= saved_cond;
+        continue;
+      }
 
       /*
         first_inner is the X in queries like:
@@ -12136,6 +12421,35 @@ end_sj_materialize(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
 }
 
 
+enum_nested_loop_state
+end_nest_materialization(JOIN *join, JOIN_TAB *join_tab, bool end_of_records)
+{
+  int error;
+  THD *thd= join->thd;
+  SORT_NEST_INFO *nest_info= join->sort_nest_info;
+  DBUG_ENTER("end_sj_materialize");
+  if (!end_of_records)
+  {
+    TABLE *table= nest_info->table;
+    fill_record(thd, table, table->field,
+                nest_info->nest_base_table_cols, TRUE, FALSE);
+
+    if (unlikely(thd->is_error()))
+      DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
+    if (unlikely((error= table->file->ha_write_tmp_row(table->record[0]))))
+    {
+      /* create_myisam_from_heap will generate error if needed */
+      if (table->file->is_fatal_error(error, HA_CHECK_DUP) &&
+          create_internal_tmp_table_from_heap(thd, table,
+                                              nest_info->tmp_table_param.start_recinfo,
+                                              &nest_info->tmp_table_param.recinfo,
+                                              error, 1, NULL))
+        DBUG_RETURN(NESTED_LOOP_ERROR); /* purecov: inspected */
+    }
+  }
+  DBUG_RETURN(NESTED_LOOP_OK);
+}
+
 /* 
   Check whether a join buffer can be used to join the specified table   
 
@@ -12367,7 +12681,7 @@ uint check_join_cache_usage(JOIN_TAB *tab,
     Don't use join buffering if we're dictated not to by no_jbuf_after
     (This is not meaningfully used currently)
   */
-  if (table_index > no_jbuf_after)
+  if (table_index+1 > no_jbuf_after)
     goto no_join_cache;
   
   /*
@@ -12559,7 +12873,7 @@ void check_join_cache_usage_for_tables(JOIN *join, ulonglong options,
     */
     prev_tab= tab - 1;
     if (tab == join->join_tab + join->const_tables ||
-        (tab->bush_root_tab && tab->bush_root_tab->bush_children->start == tab))
+        (tab->bush_root_tab && tab->bush_root_tab->bush_children->start == tab) || tab->is_sort_nest)
       prev_tab= NULL;
 
     switch (tab->type) {
@@ -12588,7 +12902,7 @@ void check_join_cache_usage_for_tables(JOIN *join, ulonglong options,
     default:
       tab->used_join_cache_level= 0;
     }
-    if (!tab->bush_children)
+    if (!tab->bush_children && !tab->is_sort_nest)
       idx++;
   }
 }
@@ -12731,17 +13045,20 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
       Later it should be improved.
     */
 
-    if (tab->bush_root_tab && tab->bush_root_tab->bush_children->start == tab)
+    if ((tab->bush_root_tab && tab->bush_root_tab->bush_children->start == tab) ||
+        tab->is_sort_nest)
       prev_tab= NULL;
-    DBUG_ASSERT(tab->bush_children || tab->table == join->best_positions[i].table->table);
+    DBUG_ASSERT(tab->bush_children || tab->table == join->best_positions[i].table->table
+                || tab->is_sort_nest);
 
     tab->partial_join_cardinality= join->best_positions[i].records_read *
                                    (prev_tab? prev_tab->partial_join_cardinality : 1);
-    if (!tab->bush_children)
+    if (!tab->bush_children && !tab->is_sort_nest)
       i++;
   }
  
   check_join_cache_usage_for_tables(join, options, no_jbuf_after);
+  SORT_NEST_INFO *sort_nest_info= join->sort_nest_info;
   
   JOIN_TAB *first_tab;
   for (tab= first_tab= first_linear_tab(join, WITH_BUSH_ROOTS, WITHOUT_CONST_TABLES);
@@ -12768,7 +13085,8 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
       end_sj_materialize.
     */
     if (!(tab->bush_root_tab && 
-          tab->bush_root_tab->bush_children->end == tab + 1))
+          tab->bush_root_tab->bush_children->end == tab + 1) &&
+        !(sort_nest_info && tab+1 == sort_nest_info->nest_tab))
     {
       tab->next_select=sub_select;		/* normal select */
     }
@@ -12958,7 +13276,7 @@ make_join_readinfo(JOIN *join, ulonglong options, uint no_jbuf_after)
         It could be that sort_by_tab==NULL, and the plan is to use filesort()
         on the first table.
       */
-      if (join->order)
+      if (join->order && !join->sort_nest_info)
       {
         join->simple_order= 0;
         join->need_tmp= 1;
@@ -13711,6 +14029,8 @@ static void update_depend_map(JOIN *join)
        join_tab;
        join_tab= next_linear_tab(join, join_tab, WITH_BUSH_ROOTS))
   {
+    if (join_tab->is_sort_nest)
+      continue;
     TABLE_REF *ref= &join_tab->ref;
     table_map depend_map=0;
     Item **item=ref->items;
@@ -14001,6 +14321,22 @@ remove_const(JOIN *join,ORDER *first_order, COND *cond,
 }
 
 
+bool JOIN::remove_const_from_order_by()
+{
+  ORDER *org_order= order;
+  order=remove_const(this, order,conds,1, &simple_order);
+  if (unlikely(thd->is_error()))
+    return TRUE;
+  /*
+    If we are using ORDER BY NULL or ORDER BY const_expression,
+    return result in any order (even if we are using a GROUP BY)
+  */
+  if (!order && org_order)
+    skip_sort_order= 1;
+  return FALSE;
+}
+
+
 /**
   Filter out ORDER items those are equal to constants in WHERE
 
@@ -14041,6 +14377,198 @@ ORDER *simple_remove_const(ORDER *order, COND *where)
 }
 
 
+/*
+  This function basically tries to propgate all the multiple equalites
+  for the order by items, so that one can use them to generate QEP that would
+  also take into consideration equality propagation.
+  Example
+    select * from t1,t2 where t1.a=t2.a order by t1.a
+
+  So the possible join orders would be:
+
+  t1 join t2 then sort
+  t2 join t1 then sort
+  t1 sort(t1) join t2
+  t2 sort(t2) join t1 => this is only possible when equality propagation is
+                         performed
+*/
+void propagate_equal_field_for_orderby(JOIN *join, ORDER *first_order)
+{
+  ORDER *order;
+  for (order= first_order; order; order= order->next)
+  {
+    if (optimizer_flag(join->thd, OPTIMIZER_SWITCH_ORDERBY_EQ_PROP) &&
+        join->cond_equal)
+    {
+      Item *item= order->item[0];
+      /*
+        TODO: equality substitution in the context of ORDER BY is
+        sometimes allowed when it is not allowed in the general case.
+        We make the below call for its side effect: it will locate the
+        multiple equality the item belongs to and set item->item_equal
+        accordingly.
+      */
+      (void)item->propagate_equal_fields(join->thd,
+                                         Value_source::
+                                         Context_identity(),
+                                         join->cond_equal);
+    }
+  }
+}
+
+/*
+  This function checks if by considering the current join_tab
+  would we be able to achieve the ordering
+*/
+
+bool check_join_prefix_contains_ordering(JOIN *join, JOIN_TAB *tab,
+                                         table_map previous_tables)
+{
+  ORDER *order;
+  for (order= join->order; order; order= order->next)
+  {
+    Item *order_item= order->item[0];
+    table_map order_tables=order_item->used_tables();
+    if (!(order_tables & ~previous_tables) ||
+         (order_item->excl_dep_on_table(previous_tables | tab->table->map)))
+      continue;
+    else
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
+bool setup_sort_nest(JOIN *join)
+{
+  if (!join->sort_nest_needed())
+    return FALSE;
+
+  /*
+    The sort nest is only needed when there are more than one table
+    in the sort nest, else we can just sort with the first table if the
+    sort nest has only one table
+  */
+  SORT_NEST_INFO* sort_nest_info= join->sort_nest_info;
+  THD *thd= join->thd;
+  Field_iterator_table field_iterator;
+
+  JOIN_TAB *start_tab= join->join_tab+join->const_tables, *j, *tab;
+  tab= sort_nest_info->nest_tab;
+  sort_nest_info->nest_tables_map= 0;
+
+  if (unlikely(thd->trace_started()))
+    add_sort_nest_tables_to_trace(join);
+
+  /* This needs to be added to JOIN  structure, looks the best option or we
+     can have a seperate struture NEST_INFO to hold it.
+     Final Implementation here should just walk over the where clause and collect
+     the field for which we should have a temp table field
+  */
+
+  for (j= start_tab; j < tab; j++)
+  {
+    TABLE *table= j->table;
+    field_iterator.set_table(table);
+    sort_nest_info->nest_tables_map|= table->map;
+    for (; !field_iterator.end_of_fields(); field_iterator.next())
+    {
+      Field *field= field_iterator.field();
+      if (!bitmap_is_set(table->read_set, field->field_index))
+        continue;
+      Item *item;
+      if (!(item= field_iterator.create_item(thd)))
+        return TRUE;
+      sort_nest_info->nest_base_table_cols.push_back(item, thd->mem_root);
+    }
+  }
+
+  uint non_order_fields= sort_nest_info->nest_base_table_cols.elements;
+  ORDER *order= join->order;
+
+  /*
+    Order by items need to be in the temp table ,we can avoid the Field items in
+    the order by list but we need to fields inside the temp table for expressions
+  */
+  for (order= join->order; order; order=order->next)
+  {
+    Item *item= order->item[0];
+    Item *res= substitute_for_best_equal_field(thd, NO_PARTICULAR_TAB, item,
+                                               join->cond_equal,
+                                               join->map2table, true);
+    res->update_used_tables();
+    sort_nest_info->nest_base_table_cols.push_back(res, thd->mem_root);
+  }
+
+  DBUG_ASSERT(!tab->table);
+
+  sort_nest_info->tmp_table_param.init();
+  sort_nest_info->tmp_table_param.bit_fields_as_long= TRUE;
+  sort_nest_info->tmp_table_param.field_count= sort_nest_info->nest_base_table_cols.elements;
+  sort_nest_info->tmp_table_param.force_not_null_cols= FALSE;
+
+  const LEX_CSTRING order_nest_name= { STRING_WITH_LEN("order-nest") };
+  if (!(tab->table= create_tmp_table(thd, &sort_nest_info->tmp_table_param,
+                                     sort_nest_info->nest_base_table_cols, (ORDER*) 0,
+                                     FALSE /* distinct */,
+                                     0, /*save_sum_fields*/
+                                     thd->variables.option_bits | TMP_TABLE_ALL_COLUMNS,
+                                     HA_POS_ERROR /*rows_limit */,
+                                     &order_nest_name)))
+    return TRUE; /* purecov: inspected */
+
+  tab->table->map= sort_nest_info->nest_tables_map;
+  sort_nest_info->table= tab->table;
+  tab->type= JT_ALL;
+
+  /*
+    The list of temp table items created here, these are needed for the substitution
+    for items that would be evaluated in POST SORT NEST context
+  */
+  field_iterator.set_table(tab->table);
+  for (; !field_iterator.end_of_fields(); field_iterator.next())
+  {
+    Field *field= field_iterator.field();
+    Item *item;
+    if (!(item= new (thd->mem_root)Item_temptable_field(thd, field)))
+      return TRUE;
+    sort_nest_info->nest_temp_table_cols.push_back(item, thd->mem_root);
+  }
+
+  /*
+    Here we substitute order by items with the items of the temp table
+  */
+  List_iterator_fast<Item> it(sort_nest_info->nest_temp_table_cols);
+  Item *item;
+  order= join->order;
+  uint i=0;
+  while ((item= it++))
+  {
+    if (i++ < non_order_fields)
+      continue;
+    order->item[0]= item;
+    order= order->next;
+  }
+  tab->table->reginfo.join_tab= tab;
+
+  /*
+    Create mapping between base table to temp table
+    Need a key-value structure
+    would like to have base_table_field ----> temp_table_item mapping
+    We can use a hash-set that we already have in the file sql-hset.h
+  */
+
+  /*
+    Setting up the scan on the temp table
+  */
+  tab->read_first_record= join_init_read_record;
+  tab->read_record.read_record_func= rr_sequential;
+  tab[-1].next_select= end_nest_materialization;
+  sort_nest_info->materialized= FALSE;
+
+  return FALSE;
+}
+
 static int
 return_zero_rows(JOIN *join, select_result *result, List<TABLE_LIST> &tables,
 		 List<Item> &fields, bool send_row, ulonglong select_options,
@@ -19721,6 +20249,10 @@ do_select(JOIN *join, Procedure *procedure)
 
     JOIN_TAB *join_tab= join->join_tab +
                         (join->tables_list ? join->const_tables : 0);
+    SORT_NEST_INFO *sort_nest_info= join->sort_nest_info;
+    join_tab= sort_nest_info ? sort_nest_info->nest_tab
+                              : join_tab;
+
     if (join->outer_ref_cond && !join->outer_ref_cond->val_int())
       error= NESTED_LOOP_NO_MORE_ROWS;
     else
@@ -25951,6 +26483,13 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta,
                          ctab->emb_sj_nest->sj_subq_pred->get_identifier());
     eta->table_name.copy(table_name_buffer, len, cs);
   }
+  else if (is_sort_nest)
+  {
+    size_t len= my_snprintf(table_name_buffer,
+                         sizeof(table_name_buffer)-1,
+                         "<order-nest>");
+    eta->table_name.copy(table_name_buffer, len, cs);
+  }
   else
   {
     TABLE_LIST *real_table= table->pos_in_table_list;
@@ -28550,6 +29089,39 @@ select_handler *SELECT_LEX::find_select_handler(THD *thd)
   return 0;
 }
 
+double postjoin_oper_cost(THD *thd, double join_record_count, uint rec_len, uint idx)
+{
+  double cost= 0;
+  /*
+    For only one table in the order_nest, we don't need a fill the temp table, we can
+    just read the data into the filesort buffer and read the sorted data from the buffers.
+  */
+  if (idx)
+    cost=  get_tmp_table_write_cost(thd, join_record_count,rec_len) *
+           join_record_count;   // cost to fill tmp table
+
+  cost+= get_tmp_table_lookup_cost(thd, join_record_count,rec_len) *
+         join_record_count;   // cost to perform post join operation used here
+  cost+= get_tmp_table_lookup_cost(thd, join_record_count, rec_len) +
+         (join_record_count == 0 ? 0 :
+          join_record_count * log2 (join_record_count)) *
+         SORT_INDEX_CMP_COST;             // cost to perform  sorting
+  return cost;
+}
+
+
+double calculate_record_count_for_sort_nest(JOIN *join, uint n_tables)
+{
+  double sort_nest_records=1, record_count;
+  JOIN_TAB *tab= join->join_tab + join->const_tables;
+  for (uint j= 0; j < n_tables ;j++, tab++)
+  {
+    record_count= tab->records_read * tab->cond_selectivity;
+    sort_nest_records= COST_MULT(sort_nest_records, record_count);
+  }
+  return sort_nest_records;
+}
+
 
 /**
   @} (end of group Query_Optimizer)
diff --git a/sql/sql_select.h b/sql/sql_select.h
index 40a9ed303f7..e0a9886b984 100644
--- a/sql/sql_select.h
+++ b/sql/sql_select.h
@@ -289,13 +289,14 @@ typedef struct st_join_table {
 
   /* TRUE <=> This join_tab is inside an SJM bush and is the last leaf tab here */
   bool          last_leaf_in_bush;
-  
+
   /*
     ptr  - this is a bush, and ptr points to description of child join_tab
            range
     NULL - this join tab has no bush children
   */
   JOIN_TAB_RANGE *bush_children;
+  JOIN_TAB_RANGE *order_nest_children;
   
   /* Special content for EXPLAIN 'Extra' column or NULL if none */
   enum explain_extra_tag info;
@@ -524,6 +525,12 @@ typedef struct st_join_table {
   /* Becomes true just after the used range filter has been built / filled */
   bool is_rowid_filter_built;
 
+  /*
+    Set to true if we consider creating a nest for a prefix of the JOIN order
+    that satisfies the ordering
+  */
+  bool is_sort_nest;
+
   void build_range_rowid_filter_if_needed();
 
   void cleanup();
@@ -991,6 +998,9 @@ typedef struct st_position
   /* Cost info for the range filter used at this position */
   Range_rowid_filter_cost_info *range_rowid_filter_info;
 
+  /* Flag to be set to TRUE if the join prefix satisfies the ORDER BY CLAUSE */
+  bool sort_nest_operation_here;
+
 } POSITION;
 
 typedef Bounds_checked_array<Item_null_result*> Item_null_array;
@@ -1506,6 +1516,7 @@ class JOIN :public Sql_alloc
     the optimize_cond() call in JOIN::optimize_inner() method.
   */
   bool is_orig_degenerated;
+  SORT_NEST_INFO *sort_nest_info;
 
   JOIN(THD *thd_arg, List<Item> &fields_arg, ulonglong select_options_arg,
        select_result *result_arg)
@@ -1602,6 +1613,7 @@ class JOIN :public Sql_alloc
     sjm_lookup_tables= 0;
     sjm_scan_tables= 0;
     is_orig_degenerated= false;
+    sort_nest_info= NULL;
   }
 
   /* True if the plan guarantees that it will be returned zero or one row */
@@ -1609,6 +1621,11 @@ class JOIN :public Sql_alloc
   /* Number of tables actually joined at the top level */
   uint exec_join_tab_cnt() { return tables_list ? top_join_tab_count : 0; }
 
+  /* TRUE if the sort-nest contains more than one table else FALSE */
+  bool sort_nest_needed() { return sort_nest_info ?
+                                   (sort_nest_info->n_tables == 1 ? FALSE : TRUE):
+                                   FALSE; }
+
   /*
     Number of tables in the join which also includes the temporary tables
     created for GROUP BY, DISTINCT , WINDOW FUNCTION etc.
@@ -1721,7 +1738,8 @@ class JOIN :public Sql_alloc
   JOIN_TAB *get_sort_by_join_tab()
   {
     return (need_tmp || !sort_by_table || skip_sort_order ||
-            ((group || tmp_table_param.sum_func_count) && !group_list)) ?
+            ((group || tmp_table_param.sum_func_count) && !group_list) ||
+            sort_nest_info) ?
               NULL : join_tab+const_tables;
   }
   bool setup_subquery_caches();
@@ -1748,11 +1766,20 @@ class JOIN :public Sql_alloc
   bool test_if_need_tmp_table()
   {
     return ((const_tables != table_count &&
-	    ((select_distinct || !simple_order || !simple_group) ||
+	    ((select_distinct || (!simple_order && !sort_nest_info) || !simple_group) ||
 	     (group_list && order) ||
              MY_TEST(select_options & OPTION_BUFFER_RESULT))) ||
             (rollup.state != ROLLUP::STATE_NONE && select_distinct));
   }
+
+  bool need_order_nest()
+  {
+    return ((const_tables != table_count &&
+            ((select_distinct || group_list) ||
+             MY_TEST(select_options & OPTION_BUFFER_RESULT))) ||
+            (rollup.state != ROLLUP::STATE_NONE && select_distinct) ||
+            select_lex->window_specs.elements > 0 || select_lex->agg_func_used());
+  }
   bool choose_subquery_plan(table_map join_tables);
   void get_partial_cost_and_fanout(int end_tab_idx,
                                    table_map filter_map,
@@ -1783,6 +1810,7 @@ class JOIN :public Sql_alloc
   bool fix_all_splittings_in_plan();
 
   bool transform_in_predicates_into_in_subq(THD *thd);
+  bool remove_const_from_order_by();
 private:
   /**
     Create a temporary table to be used for processing DISTINCT/ORDER
@@ -1853,6 +1881,7 @@ bool is_indexed_agg_distinct(JOIN *join, List<Item_field> *out_args);
 bool simple_pred(Item_func *func_item, Item **args, bool *inv_order);
 int opt_sum_query(THD* thd,
                   List<TABLE_LIST> &tables, List<Item> &all_fields, COND *conds);
+double calculate_record_count_for_sort_nest(JOIN *join, uint n_tables);
 
 /* from sql_delete.cc, used by opt_range.cc */
 extern "C" int refpos_order_cmp(void* arg, const void *a,const void *b);
@@ -2098,6 +2127,10 @@ bool mysql_select(THD *thd,
 void free_underlaid_joins(THD *thd, SELECT_LEX *select);
 bool mysql_explain_union(THD *thd, SELECT_LEX_UNIT *unit,
                          select_result *result);
+void propagate_equal_field_for_orderby(JOIN *join, ORDER *first_order);
+bool check_join_prefix_contains_ordering(JOIN *join, JOIN_TAB *tab,
+                                         table_map previous_tables);
+bool setup_sort_nest(JOIN *join);
 
 /*
   General routine to change field->ptr of a NULL-terminated array of Field
@@ -2449,6 +2482,7 @@ double get_tmp_table_write_cost(THD *thd, double row_count, uint row_size);
 void optimize_keyuse(JOIN *join, DYNAMIC_ARRAY *keyuse_array);
 bool sort_and_filter_keyuse(THD *thd, DYNAMIC_ARRAY *keyuse,
                             bool skip_unprefixed_keyparts);
+double postjoin_oper_cost(THD *thd, double join_record_count, uint rec_len, uint idx);
 
 struct st_cond_statistic
 {


More information about the commits mailing list