--- ext/standard/string.c (revision 301280) +++ ext/standard/string.c (working copy) @@ -1674,6 +1674,105 @@ } /* }}} */ +#define FASTSEARCH_MIN 50 + +/* + * Implementation of "fastsearch" algorithm devised by + * Fredrik Lundh: http://effbot.org/zone/stringlib.htm + * Adapted from Python-2.7/Objects/stringlib/fastsearch.h + * + * Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python + * Software Foundation; All Rights Reserved. + * Released under the Python Software Foundation License v2 + */ + +/* Poor man's Bloom filter */ + +#if SIZEOF_LONG == 4 && defined(HAVE_ZEND_LONG64) +# define PHP_BLOOM_WIDTH 64 +#else +# define PHP_BLOOM_WIDTH 32 +#endif + +#define PHP_BLOOM_ADD(mask, ch) \ + ((mask |= (1UL << ((ch) & (PHP_BLOOM_WIDTH -1))))) +#define PHP_BLOOM(mask, ch) \ + ((mask & (1UL << ((ch) & (PHP_BLOOM_WIDTH -1))))) + +/* {{{ This is a fast search code based on the Python search method */ +static char * +php_fastsearch(char *haystack, char *needle, int needle_len, char *end) +{ + char *p = haystack; + int haystack_len = end - haystack; + unsigned long mask; + int skip; + int j, mlast; + size_t i; + + /* Special case one-byte needles & needles longer than haystack */ + if (needle_len == 1) { + return (char *)memchr(p, *needle, (end-p)); + } + if (needle_len > haystack_len) { + return NULL; + } + + mlast = needle_len - 1; + skip = mlast - 1; + end -= needle_len; + + /* Process pattern up to penultimate char. Also calculate the + * longest safe skip distance in lieu of a proper skip table. + */ + for (i = 0; i < mlast; i++) { + PHP_BLOOM_ADD(mask, needle[i]); + if (needle[i] == needle[mlast]) { + skip = mlast - i - 1; + } + } + /* Add pattern[-1] to mask */ + PHP_BLOOM_ADD(mask, needle[mlast]); + + for (i = 0; i <= haystack_len; i++) { + /* Note: using mlast in the skip path slows things down on x86 */ + if (haystack[i+needle_len-1] == needle[needle_len-1]) { + + /* Candidate match, scan the rest + * todo: memcmp() ? */ + for (j = 0; j < mlast; j++) { + if (haystack[i+j] != needle[j]) { + break; + } + } + + if (j == mlast) { + return (char *)(p + i); + } + + /* Mismatch: check if next character is part of pattern */ + if (!PHP_BLOOM(mask, haystack[i+needle_len])) { + i = i + needle_len; + } else { + i = i + skip; + } + } else { + /* Opportunistic skip: the current char is not equal to the + * last pattern char. Check if *next* character is part of + * the pattern. If not, skip away. + */ + if (!PHP_BLOOM(mask, haystack[i+needle_len])) { + i = i + needle_len; + } + } + } + + /* No match. How sad. */ + return NULL; +} +/* }}} */ +/* End Python Fast Search Code */ + /* {{{ proto string strstr(string haystack, string needle[, bool part]) Finds first occurrence of a string within another */ PHP_FUNCTION(strstr) @@ -1696,14 +1795,25 @@ RETURN_FALSE; } - found = php_memnstr(haystack, Z_STRVAL_P(needle), Z_STRLEN_P(needle), haystack + haystack_len); + if (haystack_len >= FASTSEARCH_MIN) { + found = php_fastsearch(haystack, + Z_STRVAL_P(needle), + Z_STRLEN_P(needle), + haystack + haystack_len); + } else { + found = php_memnstr(haystack, + Z_STRVAL_P(needle), + Z_STRLEN_P(needle), + haystack + haystack_len); + } } else { if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) { RETURN_FALSE; } needle_char[1] = 0; - - found = php_memnstr(haystack, needle_char, 1, haystack + haystack_len); + found = php_memnstr(haystack, + needle_char, 1, + haystack + haystack_len); } if (found) { @@ -1724,6 +1834,7 @@ /* {{{ proto int strpos(string haystack, string needle [, int offset]) Finds position of first occurrence of a string within another */ + PHP_FUNCTION(strpos) { zval *needle; @@ -1732,7 +1843,7 @@ char needle_char[2]; long offset = 0; int haystack_len; - + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sz|l", &haystack, &haystack_len, &needle, &offset) == FAILURE) { return; } @@ -1748,20 +1859,26 @@ RETURN_FALSE; } - found = php_memnstr(haystack + offset, + if (haystack_len >= FASTSEARCH_MIN) { + found = php_fastsearch(haystack + offset, Z_STRVAL_P(needle), Z_STRLEN_P(needle), haystack + haystack_len); + } else { + found = php_memnstr(haystack + offset, + Z_STRVAL_P(needle), + Z_STRLEN_P(needle), + haystack + haystack_len); + } } else { if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) { RETURN_FALSE; } needle_char[1] = 0; - found = php_memnstr(haystack + offset, needle_char, 1, - haystack + haystack_len); + haystack + haystack_len); } if (found) { @@ -1772,6 +1889,7 @@ } /* }}} */ + /* {{{ proto int stripos(string haystack, string needle [, int offset]) Finds position of first occurrence of a string within another, case insensitive */ PHP_FUNCTION(stripos) @@ -1808,7 +1926,12 @@ needle_dup = estrndup(Z_STRVAL_P(needle), Z_STRLEN_P(needle)); php_strtolower(needle_dup, Z_STRLEN_P(needle)); - found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len); + + if (haystack_len >= FASTSEARCH_MIN) { + found = php_fastsearch(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len); + } else { + found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len); + } } else { if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) { efree(haystack_dup); @@ -1816,9 +1939,10 @@ } needle_char[0] = tolower(needle_char[0]); needle_char[1] = '\0'; - found = php_memnstr(haystack_dup + offset, - needle_char, - sizeof(needle_char) - 1, + + found = php_memnstr(haystack_dup + offset, + needle_char, + sizeof(needle_char) - 1, haystack_dup + haystack_len); }