Skip to content

Instantly share code, notes, and snippets.

@aristus
Last active August 29, 2015 13:57
Show Gist options
  • Select an option

  • Save aristus/9688925 to your computer and use it in GitHub Desktop.

Select an option

Save aristus/9688925 to your computer and use it in GitHub Desktop.

Revisions

  1. aristus renamed this gist Mar 21, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  2. aristus renamed this gist Mar 21, 2014. 1 changed file with 0 additions and 0 deletions.
    File renamed without changes.
  3. aristus created this gist Mar 21, 2014.
    219 changes: 219 additions & 0 deletions php_fastsearch.cc
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,219 @@
    --- ext/standard/string.c (revision 301280)
    +++ ext/standard/string.c (working copy)
    @@ -1674,6 +1674,105 @@
    }
    /* }}} */

    +#define FASTSEARCH_MIN 50
    +
    +/*
    + * Implementation of "fastsearch" algorithm devised by
    + * Fredrik Lundh: http://effbot.org/zone/stringlib.htm
    + * Adapted from Python-2.7/Objects/stringlib/fastsearch.h
    + *
    + * Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python
    + * Software Foundation; All Rights Reserved.
    + * Released under the Python Software Foundation License v2
    + */
    +
    +/* Poor man's Bloom filter */
    +
    +#if SIZEOF_LONG == 4 && defined(HAVE_ZEND_LONG64)
    +# define PHP_BLOOM_WIDTH 64
    +#else
    +# define PHP_BLOOM_WIDTH 32
    +#endif
    +
    +#define PHP_BLOOM_ADD(mask, ch) \
    + ((mask |= (1UL << ((ch) & (PHP_BLOOM_WIDTH -1)))))
    +#define PHP_BLOOM(mask, ch) \
    + ((mask & (1UL << ((ch) & (PHP_BLOOM_WIDTH -1)))))
    +
    +/* {{{ This is a fast search code based on the Python search method */
    +static char *
    +php_fastsearch(char *haystack, char *needle, int needle_len, char *end)
    +{
    + char *p = haystack;
    + int haystack_len = end - haystack;
    + unsigned long mask;
    + int skip;
    + int j, mlast;
    + size_t i;
    +
    + /* Special case one-byte needles & needles longer than haystack */
    + if (needle_len == 1) {
    + return (char *)memchr(p, *needle, (end-p));
    + }
    + if (needle_len > haystack_len) {
    + return NULL;
    + }
    +
    + mlast = needle_len - 1;
    + skip = mlast - 1;
    + end -= needle_len;
    +
    + /* Process pattern up to penultimate char. Also calculate the
    + * longest safe skip distance in lieu of a proper skip table.
    + */
    + for (i = 0; i < mlast; i++) {
    + PHP_BLOOM_ADD(mask, needle[i]);
    + if (needle[i] == needle[mlast]) {
    + skip = mlast - i - 1;
    + }
    + }
    + /* Add pattern[-1] to mask */
    + PHP_BLOOM_ADD(mask, needle[mlast]);
    +
    + for (i = 0; i <= haystack_len; i++) {
    + /* Note: using mlast in the skip path slows things down on x86 */
    + if (haystack[i+needle_len-1] == needle[needle_len-1]) {
    +
    + /* Candidate match, scan the rest
    + * todo: memcmp() ? */
    + for (j = 0; j < mlast; j++) {
    + if (haystack[i+j] != needle[j]) {
    + break;
    + }
    + }
    +
    + if (j == mlast) {
    + return (char *)(p + i);
    + }
    +
    + /* Mismatch: check if next character is part of pattern */
    + if (!PHP_BLOOM(mask, haystack[i+needle_len])) {
    + i = i + needle_len;
    + } else {
    + i = i + skip;
    + }
    + } else {
    + /* Opportunistic skip: the current char is not equal to the
    + * last pattern char. Check if *next* character is part of
    + * the pattern. If not, skip away.
    + */
    + if (!PHP_BLOOM(mask, haystack[i+needle_len])) {
    + i = i + needle_len;
    + }
    + }
    + }
    +
    + /* No match. How sad. */
    + return NULL;
    +}
    +/* }}} */
    +/* End Python Fast Search Code */
    +
    /* {{{ proto string strstr(string haystack, string needle[, bool part])
    Finds first occurrence of a string within another */
    PHP_FUNCTION(strstr)
    @@ -1696,14 +1795,25 @@
    RETURN_FALSE;
    }

    - found = php_memnstr(haystack, Z_STRVAL_P(needle), Z_STRLEN_P(needle), haystack + haystack_len);
    + if (haystack_len >= FASTSEARCH_MIN) {
    + found = php_fastsearch(haystack,
    + Z_STRVAL_P(needle),
    + Z_STRLEN_P(needle),
    + haystack + haystack_len);
    + } else {
    + found = php_memnstr(haystack,
    + Z_STRVAL_P(needle),
    + Z_STRLEN_P(needle),
    + haystack + haystack_len);
    + }
    } else {
    if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) {
    RETURN_FALSE;
    }
    needle_char[1] = 0;
    -
    - found = php_memnstr(haystack, needle_char, 1, haystack + haystack_len);
    + found = php_memnstr(haystack,
    + needle_char, 1,
    + haystack + haystack_len);
    }

    if (found) {
    @@ -1724,6 +1834,7 @@

    /* {{{ proto int strpos(string haystack, string needle [, int offset])
    Finds position of first occurrence of a string within another */
    +
    PHP_FUNCTION(strpos)
    {
    zval *needle;
    @@ -1732,7 +1843,7 @@
    char needle_char[2];
    long offset = 0;
    int haystack_len;
    -
    +
    if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sz|l", &haystack, &haystack_len, &needle, &offset) == FAILURE) {
    return;
    }
    @@ -1748,20 +1859,26 @@
    RETURN_FALSE;
    }

    - found = php_memnstr(haystack + offset,
    + if (haystack_len >= FASTSEARCH_MIN) {
    + found = php_fastsearch(haystack + offset,
    Z_STRVAL_P(needle),
    Z_STRLEN_P(needle),
    haystack + haystack_len);
    + } else {
    + found = php_memnstr(haystack + offset,
    + Z_STRVAL_P(needle),
    + Z_STRLEN_P(needle),
    + haystack + haystack_len);
    + }
    } else {
    if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) {
    RETURN_FALSE;
    }
    needle_char[1] = 0;
    -
    found = php_memnstr(haystack + offset,
    needle_char,
    1,
    - haystack + haystack_len);
    + haystack + haystack_len);
    }

    if (found) {
    @@ -1772,6 +1889,7 @@
    }
    /* }}} */

    +
    /* {{{ proto int stripos(string haystack, string needle [, int offset])
    Finds position of first occurrence of a string within another, case insensitive */
    PHP_FUNCTION(stripos)
    @@ -1808,7 +1926,12 @@

    needle_dup = estrndup(Z_STRVAL_P(needle), Z_STRLEN_P(needle));
    php_strtolower(needle_dup, Z_STRLEN_P(needle));
    - found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len);
    +
    + if (haystack_len >= FASTSEARCH_MIN) {
    + found = php_fastsearch(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len);
    + } else {
    + found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len);
    + }
    } else {
    if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) {
    efree(haystack_dup);
    @@ -1816,9 +1939,10 @@
    }
    needle_char[0] = tolower(needle_char[0]);
    needle_char[1] = '\0';
    - found = php_memnstr(haystack_dup + offset,
    - needle_char,
    - sizeof(needle_char) - 1,
    +
    + found = php_memnstr(haystack_dup + offset,
    + needle_char,
    + sizeof(needle_char) - 1,
    haystack_dup + haystack_len);
    }