Index used incorrectly with regular expressions on 7.4.6

Started by Antti Salmelaover 21 years ago2 messageshackers
Jump to latest
#1Antti Salmela
asalmela@iki.fi

Index is used incorrectly if constant part of the string ends with \d,
probably also with other escapes.

foo=# explain select count(*) from loc where url ~
'^http://www\\.7-eleven\\.com/newsroom/articles\\.asp\\?p=\\d+'; QUERY PLAN

Aggregate (cost=3.46..3.46 rows=1 width=0)
-> Index Scan using url_index on loc (cost=0.00..3.46 rows=1 width=0)
Index Cond: ((url >=
'http://www.7-eleven.com/newsroom/articles.asp?p=d'::text) AND
(url < 'http://www.7-eleven.com/newsroom/articles.asp?p=e&#39;::text))
Filter: (url ~
'^http://www\\.7-eleven\\.com/newsroom/articles\\.asp\\?p=\\d+&#39;::text)
(4 rows)
foo=# select count(*) from loc where url ~
'^http://www\\.7-eleven\\.com/newsroom/articles\\.asp\\?p=\\d+&#39;;
count
-------
0
(1 row)

foo=# set enable_indexscan = off;
SET
foo=# explain select count(*) from loc where url ~
'^http://www\\.7-eleven\\.com/newsroom/articles\\.asp\\?p=\\d+&#39;;
QUERY PLAN
Aggregate (cost=3056.41..3056.41 rows=1 width=0)
-> Seq Scan on loc (cost=0.00..3056.40 rows=1 width=0)
Filter: (url ~
'^http://www\\.7-eleven\\.com/newsroom/articles\\.asp\\?p=\\d+&#39;::text)
(3 rows)

foo=# select count(*) from loc where url ~
'^http://www\\.7-eleven\\.com/newsroom/articles\\.asp\\?p=\\d+&#39;;
count
-------
281
(1 row)

--
Antti Salmela

#2Tom Lane
tgl@sss.pgh.pa.us
In reply to: Antti Salmela (#1)
Re: Index used incorrectly with regular expressions on 7.4.6

Antti Salmela <asalmela@iki.fi> writes:

Index is used incorrectly if constant part of the string ends with \d,

Yeah, you're right --- that code predates our use of the new regexp
engine, and it didn't know that escapes aren't simply quoted characters.

Now that I look at it, it's got a multibyte problem too :-(

If you need a patch right away, here's what I applied to 7.4 branch.

regards, tom lane

Index: selfuncs.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v
retrieving revision 1.147.2.3
diff -c -r1.147.2.3 selfuncs.c
*** selfuncs.c	27 Feb 2004 21:44:44 -0000	1.147.2.3
--- selfuncs.c	2 Dec 2004 02:35:48 -0000
***************
*** 3218,3223 ****
--- 3218,3225 ----
  	char	   *match;
  	int			pos,
  				match_pos,
+ 				prev_pos,
+ 				prev_match_pos,
  				paren_depth;
  	char	   *patt;
  	char	   *rest;
***************
*** 3278,3288 ****

/* OK, allocate space for pattern */
match = palloc(strlen(patt) + 1);
! match_pos = 0;

  	/* note start at pos 1 to skip leading ^ */
! 	for (pos = 1; patt[pos]; pos++)
  	{
  		/*
  		 * Check for characters that indicate multiple possible matches
  		 * here. XXX I suspect isalpha() is not an adequately
--- 3280,3292 ----

/* OK, allocate space for pattern */
match = palloc(strlen(patt) + 1);
! prev_match_pos = match_pos = 0;

  	/* note start at pos 1 to skip leading ^ */
! 	for (prev_pos = pos = 1; patt[pos]; )
  	{
+ 		int		len;
+ 
  		/*
  		 * Check for characters that indicate multiple possible matches
  		 * here. XXX I suspect isalpha() is not an adequately
***************
*** 3297,3302 ****
--- 3301,3314 ----
  			break;
  		/*
+ 		 * In AREs, backslash followed by alphanumeric is an escape, not
+ 		 * a quoted character.  Must treat it as having multiple possible
+ 		 * matches.
+ 		 */
+ 		if (patt[pos] == '\\' && isalnum((unsigned char) patt[pos + 1]))
+ 			break;
+ 
+ 		/*
  		 * Check for quantifiers.  Except for +, this means the preceding
  		 * character is optional, so we must remove it from the prefix
  		 * too!
***************
*** 3305,3318 ****
  			patt[pos] == '?' ||
  			patt[pos] == '{')
  		{
! 			if (match_pos > 0)
! 				match_pos--;
! 			pos--;
  			break;
  		}
  		if (patt[pos] == '+')
  		{
! 			pos--;
  			break;
  		}
  		if (patt[pos] == '\\')
--- 3317,3329 ----
  			patt[pos] == '?' ||
  			patt[pos] == '{')
  		{
! 			match_pos = prev_match_pos;
! 			pos = prev_pos;
  			break;
  		}
  		if (patt[pos] == '+')
  		{
! 			pos = prev_pos;
  			break;
  		}
  		if (patt[pos] == '\\')
***************
*** 3322,3328 ****
  			if (patt[pos] == '\0')
  				break;
  		}
! 		match[match_pos++] = patt[pos];
  	}
  	match[match_pos] = '\0';
--- 3333,3346 ----
  			if (patt[pos] == '\0')
  				break;
  		}
! 		/* save position in case we need to back up on next loop cycle */
! 		prev_match_pos = match_pos;
! 		prev_pos = pos;
! 		/* must use encoding-aware processing here */
! 		len = pg_mblen(&patt[pos]);
! 		memcpy(&match[match_pos], &patt[pos], len);
! 		match_pos += len;
! 		pos += len;
  	}

match[match_pos] = '\0';