Skip to content

Instantly share code, notes, and snippets.

@peterdm
Last active August 2, 2022 19:07
Show Gist options
  • Select an option

  • Save peterdm/e1ebf6e17505e73ab10834d2458d60c9 to your computer and use it in GitHub Desktop.

Select an option

Save peterdm/e1ebf6e17505e73ab10834d2458d60c9 to your computer and use it in GitHub Desktop.

Revisions

  1. peterdm revised this gist Aug 2, 2022. No changes.
  2. peterdm created this gist Aug 2, 2022.
    183 changes: 183 additions & 0 deletions gistfile1.txt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,183 @@

    # Baseline index definition. Only the nested `pages` field is mapped here;
    # the remaining fields of the original index were omitted for brevity.
    # The analysis settings register two mapping char filters, one normalizer,
    # three English token filters and four analyzers. pages.text is indexed
    # with offsets and exposes `stemmed` and `exact` sub-fields.
    PUT test_index
    {
      "settings": {
        "analysis": {
          "char_filter": {
            "punct_annotation": {
              "type": "mapping",
              "mappings": [
                ". => \\n_PERIOD_\\n",
                "\\n => \\n_NEWLINE_\\n",
                ", => \\n_COMMA_\\n",
                "; => \\n_SEMI_\\n"
              ]
            },
            "punctuation": {
              "type": "mapping",
              "mappings": [".=>"]
            }
          },
          "normalizer": {
            "keyword_normalizer": {
              "type": "custom",
              "char_filter": [],
              "filter": ["lowercase", "asciifolding"]
            }
          },
          "filter": {
            "english_stop": {
              "type": "stop",
              "stopwords": "_english_"
            },
            "english_stemmer": {
              "type": "stemmer",
              "language": "english"
            },
            "english_possessive_stemmer": {
              "type": "stemmer",
              "language": "possessive_english"
            }
          },
          "analyzer": {
            "cleanedStem": {
              "type": "custom",
              "char_filter": ["punct_annotation"],
              "tokenizer": "standard",
              "filter": [
                "english_possessive_stemmer",
                "lowercase",
                "english_stop",
                "english_stemmer"
              ]
            },
            "exactMatch": {
              "type": "custom",
              "char_filter": ["punct_annotation"],
              "tokenizer": "standard",
              "filter": ["lowercase"]
            },
            "exactInput": {
              "type": "custom",
              "tokenizer": "whitespace",
              "filter": ["lowercase"]
            },
            "suggester_lowercase": {
              "char_filter": ["punctuation"],
              "tokenizer": "standard",
              "filter": ["lowercase", "asciifolding"]
            }
          }
        }
      },
      "mappings": {
        "dynamic": "false",
        "_source": {"enabled": true},
        "properties": {
          "pages": {
            "type": "nested",
            "properties": {
              "timestamp": {
                "type": "date",
                "format": "strict_date_hour_minute_second_fraction",
                "store": true
              },
              "url": {"type": "keyword", "store": true},
              "type": {"type": "keyword", "store": true},
              "text": {
                "type": "text",
                "index_options": "offsets",
                "store": true,
                "fielddata": true,
                "fields": {
                  "stemmed": {
                    "type": "text",
                    "analyzer": "cleanedStem",
                    "index_options": "offsets"
                  },
                  "exact": {
                    "type": "text",
                    "analyzer": "exactMatch",
                    "index_options": "offsets"
                  }
                }
              }
            }
          }
        }
      }
    }


    # Inspect the current mapping of the pages.text field only.
    GET test_index/_mapping/field/pages.text

    # Add a root-level `all_pages` text field (with `exact` and `stemmed`
    # sub-fields) and re-declare pages.text with a copy_to pointing at it.
    # NOTE(review): every all_pages entry uses the `spread_similarity`
    # similarity, which this file only registers in the later
    # `PUT test_index/_settings` request. Presumably that request has to be
    # applied first; confirm before running this file top to bottom.
    PUT test_index/_mapping
    {
      "properties": {
        "all_pages": {
          "type": "text",
          "similarity": "spread_similarity",
          "fields": {
            "exact": {
              "type": "text",
              "analyzer": "exactMatch",
              "index_options": "offsets",
              "similarity": "spread_similarity"
            },
            "stemmed": {
              "type": "text",
              "analyzer": "cleanedStem",
              "index_options": "offsets",
              "similarity": "spread_similarity"
            }
          }
        },
        "pages": {
          "type": "nested",
          "properties": {
            "text": {
              "type": "text",
              "index_options": "offsets",
              "store": true,
              "fielddata": true,
              "copy_to": "all_pages",
              "fields": {
                "stemmed": {
                  "type": "text",
                  "analyzer": "cleanedStem",
                  "index_options": "offsets"
                },
                "exact": {
                  "type": "text",
                  "analyzer": "exactMatch",
                  "index_options": "offsets"
                }
              }
            }
          }
        }
      }
    }


    # Inspect the current index settings.
    GET test_index/_settings

    # Register a custom BM25 similarity named `spread_similarity`
    # (b = 1, k1 = 1.75) in the index settings.
    # NOTE(review): this file closes the index "before changes" and reopens it
    # "after changes" further down; presumably this is the change that needs
    # the index closed. Confirm against the Elasticsearch similarity docs.
    PUT test_index/_settings
    {
      "settings": {
        "index": {
          "similarity": {
            "spread_similarity": {"type": "BM25", "k1": 1.75, "b": 1}
          }
        }
      }
    }

    # Close the index before applying changes. The requests that follow probe
    # whether writes and reads can still happen at this point.
    POST test_index/_close

    # Write probe: can a document still be indexed? Sends document 1 with a
    # single nested page carrying two text values.
    POST test_index/_doc/1
    {
      "pages": [
        {"text": ["Hello World!", "Is this index open?"]}
      ]
    }

    # Read probe: can searches still be served? Phrase-matches "hello world"
    # on the stemmed sub-field inside the nested `pages` documents.
    GET test_index/_search
    {
      "query": {
        "nested": {
          "path": "pages",
          "query": {
            "match_phrase": {"pages.text.stemmed": "hello world"}
          }
        }
      }
    }

    # Second read probe: the same phrase, this time against the root-level
    # all_pages.stemmed field instead of the nested path.
    GET test_index/_search
    {
      "query": {
        "match_phrase": {"all_pages.stemmed": "hello world"}
      }
    }

    # Reopen the index once the changes have been applied.
    POST test_index/_open

    # Clean up: remove document 1, the id used by the write probe above.
    DELETE test_index/_doc/1