3333#include "utils/syscache.h"
3434
3535
36+ #define MAX_FUZZY_DISTANCE 3
37+
3638static RangeTblEntry * scanNameSpaceForRefname (ParseState * pstate ,
3739 const char * refname , int location );
3840static RangeTblEntry * scanNameSpaceForRelid (ParseState * pstate , Oid relid ,
@@ -519,6 +521,101 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup)
519521 return NULL ; /* keep compiler quiet */
520522}
521523
524+ /*
525+ * updateFuzzyAttrMatchState
526+ * Using Levenshtein distance, consider whether column 'actual' (attribute attnum of rte) is the best fuzzy match for 'match' found so far, recording it in fuzzystate if so.
527+ */
528+ static void
529+ updateFuzzyAttrMatchState (int fuzzy_rte_penalty ,
530+ FuzzyAttrMatchState * fuzzystate , RangeTblEntry * rte ,
531+ const char * actual , const char * match , int attnum )
532+ {
533+ int columndistance ;
534+ int matchlen ;
535+
536+ /* Bail before computing the Levenshtein distance if there's no hope: the RTE penalty alone already exceeds the best distance. */
537+ if (fuzzy_rte_penalty > fuzzystate -> distance )
538+ return ;
539+
540+ /*
541+ * Outright reject dropped columns, which can appear here with apparent
542+ * empty actual names, per remarks within scanRTEForColumn().
543+ */
544+ if (actual [0 ] == '\0' )
545+ return ;
546+
547+ /* Use Levenshtein to compute match distance, passing a bound derived from the best distance so far and the RTE penalty. */
548+ matchlen = strlen (match );
549+ columndistance =
550+ varstr_levenshtein_less_equal (actual , strlen (actual ), match , matchlen ,
551+ 1 , 1 , 1 ,
552+ fuzzystate -> distance + 1
553+ - fuzzy_rte_penalty );
554+
555+ /*
556+ * If more than half the characters are different, don't treat it as a
557+ * match, to avoid making ridiculous suggestions.
558+ */
559+ if (columndistance > matchlen / 2 )
560+ return ;
561+
562+ /*
563+ * From this point on, we can ignore the distinction between the
564+ * RTE-name distance and the column-name distance.
565+ */
566+ columndistance += fuzzy_rte_penalty ;
567+
568+ /*
569+ * If the new distance is less than or equal to that of the best match
570+ * found so far, update fuzzystate.
571+ */
572+ if (columndistance < fuzzystate -> distance )
573+ {
574+ /* Store new lowest observed distance for RTE, discarding any second match */
575+ fuzzystate -> distance = columndistance ;
576+ fuzzystate -> rfirst = rte ;
577+ fuzzystate -> first = attnum ;
578+ fuzzystate -> rsecond = NULL ;
579+ fuzzystate -> second = InvalidAttrNumber ;
580+ }
581+ else if (columndistance == fuzzystate -> distance )
582+ {
583+ /*
584+ * This match distance may equal a prior match within this same
585+ * range table. When that happens, the prior match may also be
586+ * given, but only if there are no more than two equally distant
587+ * matches from the RTE (in turn, our caller will only accept
588+ * two equally distant matches overall).
589+ */
590+ if (AttributeNumberIsValid (fuzzystate -> second ))
591+ {
592+ /* Too many RTE-level matches */
593+ fuzzystate -> rfirst = NULL ;
594+ fuzzystate -> first = InvalidAttrNumber ;
595+ fuzzystate -> rsecond = NULL ;
596+ fuzzystate -> second = InvalidAttrNumber ;
597+ /* Clearly, distance is too low a bar (for *any* RTE) */
598+ fuzzystate -> distance = columndistance - 1 ;
599+ }
600+ else if (AttributeNumberIsValid (fuzzystate -> first ))
601+ {
602+ /* Record as provisional second match for RTE */
603+ fuzzystate -> rsecond = rte ;
604+ fuzzystate -> second = attnum ;
605+ }
606+ else if (fuzzystate -> distance <= MAX_FUZZY_DISTANCE )
607+ {
608+ /*
609+ * Record as provisional first match (this can occasionally
610+ * occur because previous lowest distance was "too low a
611+ * bar", rather than being associated with a real match)
612+ */
613+ fuzzystate -> rfirst = rte ;
614+ fuzzystate -> first = attnum ;
615+ }
616+ }
617+ }
618+
522619/*
523620 * scanRTEForColumn
524621 * Search the column names of a single RTE for the given name.
@@ -527,10 +624,14 @@ GetCTEForRTE(ParseState *pstate, RangeTblEntry *rte, int rtelevelsup)
527624 *
528625 * Side effect: if we find a match, mark the RTE as requiring read access
529626 * for the column.
627+ *
628+ * Additional side effect: if fuzzystate is non-NULL, check non-system columns
629+ * for an approximate match and update fuzzystate accordingly.
530630 */
531631Node *
532632scanRTEForColumn (ParseState * pstate , RangeTblEntry * rte , char * colname ,
533- int location )
633+ int location , int fuzzy_rte_penalty ,
634+ FuzzyAttrMatchState * fuzzystate )
534635{
535636 Node * result = NULL ;
536637 int attnum = 0 ;
@@ -548,12 +649,16 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname,
548649 * Should this somehow go wrong and we try to access a dropped column,
549650 * we'll still catch it by virtue of the checks in
550651 * get_rte_attribute_type(), which is called by make_var(). That routine
551- * has to do a cache lookup anyway, so the check there is cheap.
652+ * has to do a cache lookup anyway, so the check there is cheap. Callers
653+ * interested in finding the match with the shortest distance need to defend
654+ * against this directly, though.
552655 */
553656 foreach (c , rte -> eref -> colnames )
554657 {
658+ const char * attcolname = strVal (lfirst (c ));
659+
555660 attnum ++ ;
556- if (strcmp (strVal ( lfirst ( c )) , colname ) == 0 )
661+ if (strcmp (attcolname , colname ) == 0 )
557662 {
558663 if (result )
559664 ereport (ERROR ,
@@ -566,6 +671,11 @@ scanRTEForColumn(ParseState *pstate, RangeTblEntry *rte, char *colname,
566671 markVarForSelectPriv (pstate , var , rte );
567672 result = (Node * ) var ;
568673 }
674+
675+ /* Update fuzzy match state, if provided. */
676+ if (fuzzystate != NULL )
677+ updateFuzzyAttrMatchState (fuzzy_rte_penalty , fuzzystate ,
678+ rte , attcolname , colname , attnum );
569679 }
570680
571681 /*
@@ -642,7 +752,8 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly,
642752 continue ;
643753
644754 /* use orig_pstate here to get the right sublevels_up */
645- newresult = scanRTEForColumn (orig_pstate , rte , colname , location );
755+ newresult = scanRTEForColumn (orig_pstate , rte , colname , location ,
756+ 0 , NULL );
646757
647758 if (newresult )
648759 {
@@ -668,36 +779,92 @@ colNameToVar(ParseState *pstate, char *colname, bool localonly,
668779
669780/*
670781 * searchRangeTableForCol
671- * See if any RangeTblEntry could possibly provide the given column name.
672- * If so, return a pointer to the RangeTblEntry; else return NULL .
782+ * See if any RangeTblEntry could possibly provide the given column name (or
783+ * find the best match available). Returns state with relevant details .
673784 *
674785 * This is different from colNameToVar in that it considers every entry in
675786 * the ParseState's rangetable(s), not only those that are currently visible
676787 * in the p_namespace list(s). This behavior is invalid per the SQL spec,
677788 * and it may give ambiguous results (there might be multiple equally valid
678789 * matches, but only one will be returned). This must be used ONLY as a
679790 * heuristic in giving suitable error messages. See errorMissingColumn.
791+ *
792+ * This function is also different in that it will consider approximate
793+ * matches -- if the user entered an alias/column pair that is only slightly
794+ * different from a valid pair, we may be able to infer what they meant to
795+ * type and provide a reasonable hint.
796+ *
797+ * The FuzzyAttrMatchState will have 'rfirst' pointing to the best RTE
798+ * containing the most promising match for the alias and column name. If
799+ * the alias and column names match exactly, 'first' will be InvalidAttrNumber;
800+ * otherwise, it will be the attribute number for the match. In the latter
801+ * case, 'rsecond' may point to a second, equally close approximate match,
802+ * and 'second' will contain the attribute number for the second match.
680803 */
681- static RangeTblEntry *
682- searchRangeTableForCol (ParseState * pstate , char * colname , int location )
804+ static FuzzyAttrMatchState *
805+ searchRangeTableForCol (ParseState * pstate , const char * alias , char * colname ,
806+ int location )
683807{
684808 ParseState * orig_pstate = pstate ;
809+ FuzzyAttrMatchState * fuzzystate = palloc (sizeof (FuzzyAttrMatchState ));
810+
811+ fuzzystate -> distance = MAX_FUZZY_DISTANCE + 1 ;
812+ fuzzystate -> rfirst = NULL ;
813+ fuzzystate -> rsecond = NULL ;
814+ fuzzystate -> first = InvalidAttrNumber ;
815+ fuzzystate -> second = InvalidAttrNumber ;
685816
686817 while (pstate != NULL )
687818 {
688819 ListCell * l ;
689820
690821 foreach (l , pstate -> p_rtable )
691822 {
692- RangeTblEntry * rte = (RangeTblEntry * ) lfirst (l );
823+ RangeTblEntry * rte = (RangeTblEntry * ) lfirst (l );
824+ int fuzzy_rte_penalty = 0 ;
693825
694- if (scanRTEForColumn (orig_pstate , rte , colname , location ))
695- return rte ;
826+ /*
827+ * Typically, it is not useful to look for matches within join
828+ * RTEs; they effectively duplicate other RTEs for our purposes,
829+ * and if a match is chosen from a join RTE, an unhelpful alias is
830+ * displayed in the final diagnostic message.
831+ */
832+ if (rte -> rtekind == RTE_JOIN )
833+ continue ;
834+
835+ /*
836+ * If the user didn't specify an alias, then matches against one
837+ * RTE are as good as another. But if the user did specify an
838+ * alias, then we want at least a fuzzy - and preferably an exact
839+ * - match for the range table entry.
840+ */
841+ if (alias != NULL )
842+ fuzzy_rte_penalty =
843+ varstr_levenshtein (alias , strlen (alias ),
844+ rte -> eref -> aliasname ,
845+ strlen (rte -> eref -> aliasname ),
846+ 1 , 1 , 1 );
847+
848+ /*
849+ * Scan for a matching column; if we find an exact match, we're
850+ * done. Otherwise, update fuzzystate.
851+ */
852+ if (scanRTEForColumn (orig_pstate , rte , colname , location ,
853+ fuzzy_rte_penalty , fuzzystate )
854+ && fuzzy_rte_penalty == 0 )
855+ {
856+ fuzzystate -> rfirst = rte ;
857+ fuzzystate -> first = InvalidAttrNumber ;
858+ fuzzystate -> rsecond = NULL ;
859+ fuzzystate -> second = InvalidAttrNumber ;
860+ return fuzzystate ;
861+ }
696862 }
697863
698864 pstate = pstate -> parentParseState ;
699865 }
700- return NULL ;
866+
867+ return fuzzystate ;
701868}
702869
703870/*
@@ -2860,34 +3027,67 @@ void
28603027errorMissingColumn (ParseState * pstate ,
28613028 char * relname , char * colname , int location )
28623029{
2863- RangeTblEntry * rte ;
3030+ FuzzyAttrMatchState * state ;
3031+ char * closestfirst = NULL ;
28643032
28653033 /*
2866- * If relname was given, just play dumb and report it. (In practice, a
2867- * bad qualification name should end up at errorMissingRTE, not here, so
2868- * no need to work hard on this case.)
3034+ * Search the entire rtable looking for possible matches. If we find one,
3035+ * emit a hint about it.
3036+ *
3037+ * TODO: improve this code (and also errorMissingRTE) to mention using
3038+ * LATERAL if appropriate.
28693039 */
2870- if (relname )
2871- ereport (ERROR ,
2872- (errcode (ERRCODE_UNDEFINED_COLUMN ),
2873- errmsg ("column %s.%s does not exist" , relname , colname ),
2874- parser_errposition (pstate , location )));
3040+ state = searchRangeTableForCol (pstate , relname , colname , location );
28753041
28763042 /*
2877- * Otherwise, search the entire rtable looking for possible matches. If
2878- * we find one, emit a hint about it.
3043+ * Extract closest col string for best match, if any.
28793044 *
2880- * TODO: improve this code (and also errorMissingRTE) to mention using
2881- * LATERAL if appropriate.
3045+ * If the state passed back names an RTE but no attribute number, infer
3046+ * that an exact match exists but is not visible from here -- this is
3047+ * what is reported when !closestfirst. There might also be an exact match
3048+ * that was qualified with an incorrect alias, in which case closestfirst
3049+ * will be set (so hint is the same as generic fuzzy case).
28823050 */
2883- rte = searchRangeTableForCol (pstate , colname , location );
2884-
2885- ereport (ERROR ,
2886- (errcode (ERRCODE_UNDEFINED_COLUMN ),
2887- errmsg ("column \"%s\" does not exist" , colname ),
2888- rte ? errhint ("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query." ,
2889- colname , rte -> eref -> aliasname ) : 0 ,
2890- parser_errposition (pstate , location )));
3051+ if (state -> rfirst && AttributeNumberIsValid (state -> first ))
3052+ closestfirst = strVal (list_nth (state -> rfirst -> eref -> colnames ,
3053+ state -> first - 1 ));
3054+
3055+ if (!state -> rsecond )
3056+ {
3057+ /*
3058+ * Handle case where there is at most one column suggestion to hint,
3059+ * including exact matches referenced but not visible.
3060+ */
3061+ ereport (ERROR ,
3062+ (errcode (ERRCODE_UNDEFINED_COLUMN ),
3063+ relname ?
3064+ errmsg ("column %s.%s does not exist" , relname , colname ):
3065+ errmsg ("column \"%s\" does not exist" , colname ),
3066+ state -> rfirst ? closestfirst ?
3067+ errhint ("Perhaps you meant to reference the column \"%s\".\"%s\"." ,
3068+ state -> rfirst -> eref -> aliasname , closestfirst ):
3069+ errhint ("There is a column named \"%s\" in table \"%s\", but it cannot be referenced from this part of the query." ,
3070+ colname , state -> rfirst -> eref -> aliasname ): 0 ,
3071+ parser_errposition (pstate , location )));
3072+ }
3073+ else
3074+ {
3075+ /* Handle case where there are two equally useful column hints */
3076+ char * closestsecond ;
3077+
3078+ closestsecond = strVal (list_nth (state -> rsecond -> eref -> colnames ,
3079+ state -> second - 1 ));
3080+
3081+ ereport (ERROR ,
3082+ (errcode (ERRCODE_UNDEFINED_COLUMN ),
3083+ relname ?
3084+ errmsg ("column %s.%s does not exist" , relname , colname ):
3085+ errmsg ("column \"%s\" does not exist" , colname ),
3086+ errhint ("Perhaps you meant to reference the column \"%s\".\"%s\" or the column \"%s\".\"%s\"." ,
3087+ state -> rfirst -> eref -> aliasname , closestfirst ,
3088+ state -> rsecond -> eref -> aliasname , closestsecond ),
3089+ parser_errposition (pstate , location )));
3090+ }
28913091}
28923092
28933093
0 commit comments