From c01f6ef46c8f0ab3faa54e8f040da6e9ddc7fe5b Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Fri, 9 Feb 2024 12:56:26 +0200 Subject: [PATCH] Fix usage of aggregate pathkeys in group_keys_reorder_by_pathkeys() group_keys_reorder_by_pathkeys() function searched for matching pathkeys within root->group_pathkeys. That could lead to picking an aggregate pathkey and using its pathkey->pk_eclass->ec_sortref as an argument of get_sortgroupref_clause_noerr(). Given that ec_sortref of an aggregate pathkey references aggregate targetlist not query targetlist, this leads to incorrect query optimization. Fix this by looking for matching pathkeys only within the first num_groupby_pathkeys pathkeys. Reported-by: David G. Johnston Discussion: https://postgr.es/m/CAKFQuwY3Ek%3DcLThgd8FdaSc5JRDVt0FaV00gMcWra%2BTAR4gGUw%40mail.gmail.com Author: Andrei Lepikhov, Alexander Korotkov --- src/backend/optimizer/path/pathkeys.c | 25 ++++++++++++++++++++---- src/test/regress/expected/aggregates.out | 24 +++++++++++++++++++++++ src/test/regress/sql/aggregates.sql | 19 ++++++++++++++++++ 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index 82ff31273b..bcc1b7a9eb 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -355,10 +355,14 @@ pathkeys_contained_in(List *keys1, List *keys2) /* * group_keys_reorder_by_pathkeys - * Reorder GROUP BY keys to match the input pathkeys. + * Reorder GROUP BY pathkeys and clauses to match the input pathkeys. * - * Function returns new lists (pathkeys and clauses), original GROUP BY lists - * stay untouched. + * 'pathkeys' is an input list of pathkeys + * '*group_pathkeys' and '*group_clauses' are pathkeys and clauses lists to + * reorder. The pointers are redirected to new lists, original lists + * stay untouched. + * 'num_groupby_pathkeys' is the number of first '*group_pathkeys' items to + * search matching pathkeys. * * Returns the number of GROUP BY keys with a matching pathkey. */ @@ -369,12 +373,24 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, { List *new_group_pathkeys = NIL, *new_group_clauses = NIL; + List *grouping_pathkeys; ListCell *lc; int n; if (pathkeys == NIL || *group_pathkeys == NIL) return 0; + /* + * We're going to search within just the first num_groupby_pathkeys of + * *group_pathkeys. The thing is that root->group_pathkeys is passed as + * *group_pathkeys containing grouping pathkeys altogether with aggregate + * pathkeys. If we process aggregate pathkeys we could get an invalid + * result of get_sortgroupref_clause_noerr(), because their + * pathkey->pk_eclass->ec_sortref doesn't referece query targetlist. So, + * we allocate a separate list of pathkeys for lookups. + */ + grouping_pathkeys = list_copy_head(*group_pathkeys, num_groupby_pathkeys); + /* * Walk the pathkeys (determining ordering of the input path) and see if * there's a matching GROUP BY key. If we find one, we append it to the @@ -395,7 +411,7 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, * there is no sortclause reference for some reason. */ if (foreach_current_index(lc) >= num_groupby_pathkeys || - !list_member_ptr(*group_pathkeys, pathkey) || + !list_member_ptr(grouping_pathkeys, pathkey) || pathkey->pk_eclass->ec_sortref == 0) break; @@ -429,6 +445,7 @@ group_keys_reorder_by_pathkeys(List *pathkeys, List **group_pathkeys, *group_clauses = list_concat_unique_ptr(new_group_clauses, *group_clauses); + list_free(grouping_pathkeys); return n; } diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 7a73c19314..f736bab67e 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -2874,6 +2874,30 @@ SELECT y,x,array_agg(distinct w) FROM btg WHERE y < 0 GROUP BY x,y; RESET enable_incremental_sort; DROP TABLE btg; +-- Check we don't pick aggregate path key instead of grouping path key +CREATE TABLE group_agg_pk AS SELECT + i % 10 AS x, + i % 2 AS y, + i % 2 AS z, + 2 AS w, + i % 10 AS f +FROM generate_series(1,100) AS i; +ANALYZE group_agg_pk; +SET enable_nestloop = off; +SET enable_hashjoin = off; +SELECT + c1.z, c1.w, string_agg(''::text, repeat(''::text, c1.f) ORDER BY c1.x,c1.y) +FROM group_agg_pk c1 JOIN group_agg_pk c2 ON (c1.x = c2.f) +GROUP BY c1.w, c1.z; + z | w | string_agg +---+---+------------ + 0 | 2 | + 1 | 2 | +(2 rows) + +RESET enable_nestloop; +RESET enable_hashjoin; +DROP TABLE group_agg_pk; -- The case, when scanning sort order correspond to aggregate sort order but -- can not be found in the group-by list CREATE TABLE agg_sort_order (c1 int PRIMARY KEY, c2 int); diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 916dbf908f..a19ef713d3 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -1231,6 +1231,25 @@ RESET enable_incremental_sort; DROP TABLE btg; +-- Check we don't pick aggregate path key instead of grouping path key +CREATE TABLE group_agg_pk AS SELECT + i % 10 AS x, + i % 2 AS y, + i % 2 AS z, + 2 AS w, + i % 10 AS f +FROM generate_series(1,100) AS i; +ANALYZE group_agg_pk; +SET enable_nestloop = off; +SET enable_hashjoin = off; +SELECT + c1.z, c1.w, string_agg(''::text, repeat(''::text, c1.f) ORDER BY c1.x,c1.y) +FROM group_agg_pk c1 JOIN group_agg_pk c2 ON (c1.x = c2.f) +GROUP BY c1.w, c1.z; +RESET enable_nestloop; +RESET enable_hashjoin; +DROP TABLE group_agg_pk; + -- The case, when scanning sort order correspond to aggregate sort order but -- can not be found in the group-by list CREATE TABLE agg_sort_order (c1 int PRIMARY KEY, c2 int);