Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save cheekybastard/e1a484c81155f48ae6d8 to your computer and use it in GitHub Desktop.

Select an option

Save cheekybastard/e1a484c81155f48ae6d8 to your computer and use it in GitHub Desktop.

Revisions

  1. @karmi karmi revised this gist May 24, 2011. 1 changed file with 2 additions and 3 deletions.
    5 changes: 2 additions & 3 deletions elastic_search_ngram_analyzer_for_urls.sh
    Original file line number Diff line number Diff line change
    @@ -18,7 +18,7 @@ curl -X PUT localhost:9200/ngram_test -d '
    "filter" : {
    "url_stop" : {
    "type" : "stop",
    "stopwords" : ["http", "https", ":", "/", ".", "html"]
    "stopwords" : ["http", "https"]
    },
    "url_ngram" : {
    "type" : "nGram",
    @@ -69,7 +69,6 @@ do
    echo; echo; echo ">>> ${url}"
    if which open &> /dev/null; then
    open "${url}&pretty=true"
    else
    curl "${url}&pretty=true"
    fi
    curl "${url}&pretty=true"
    done
  2. @karmi karmi revised this gist May 24, 2011. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions elastic_search_ngram_analyzer_for_urls.sh
    Original file line number Diff line number Diff line change
    @@ -12,11 +12,11 @@ curl -X PUT localhost:9200/ngram_test -d '
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "lowercase",
    "filter" : ["lowercase", "stop", "url_filter", "url_ngram"]
    "filter" : ["stop", "url_stop", "url_ngram"]
    }
    },
    "filter" : {
    "url_filter" : {
    "url_stop" : {
    "type" : "stop",
    "stopwords" : ["http", "https", ":", "/", ".", "html"]
    },
  3. @karmi karmi revised this gist May 24, 2011. 1 changed file with 1 addition and 0 deletions.
    1 change: 1 addition & 0 deletions elastic_search_ngram_analyzer_for_urls.sh
    Original file line number Diff line number Diff line change
    @@ -60,6 +60,7 @@ http://localhost:9200/ngram_test/_search?q=url:heise
    http://localhost:9200/ngram_test/_search?q=url:eis
    http://localhost:9200/ngram_test/_search?q=url:berlin
    http://localhost:9200/ngram_test/_search?q=url:wetter
    http://localhost:9200/ngram_test/_search?q=url:kroatien
    http://localhost:9200/ngram_test/_search?q=url:(urlaub%20kroatien)
    '

  4. @karmi karmi revised this gist May 24, 2011. 2 changed files with 74 additions and 66 deletions.
    66 changes: 0 additions & 66 deletions custom_analyzer.sh
    Original file line number Diff line number Diff line change
    @@ -1,66 +0,0 @@
    # Exercises a custom URL analyzer against the ElasticSearch node at
    # localhost:9200: rebuilds the `custom_analyzer_test` index, stores one
    # sample document, and then queries it with `url:peak`.
    es_index='localhost:9200/custom_analyzer_test'

    # Index settings (analysis chain) and field mappings, sent on creation.
    index_definition='
    {
    "settings" : {
    "index" : {
    "number_of_shards" : 1,
    "number_of_replicas" : 0,
    "analysis" : {
    "analyzer" : {
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "lowercase",
    "filter" : ["lowercase", "stop", "url_filter", "url_ngram"]
    }
    },
    "filter" : {
    "url_filter" : {
    "type" : "stop",
    "stopwords" : ["http", "https", ":", "/", ".", "html"]
    },
    "url_ngram" : {
    "type" : "nGram"
    }
    }
    }
    }
    },
    "mappings": {
    "url": {
    "properties": {
    "url": {
    "type": "string",
    "analyzer": "url_analyzer",
    "boost": 10
    },
    "title": {
    "type": "string",
    "analyzer": "snowball",
    "boost": 5
    },
    "description": {
    "type": "string",
    "analyzer": "snowball"
    },
    "tags": {
    "type": "string",
    "analyzer": "keyword"
    }
    }
    }
    }
    }
    '

    # The one document that gets indexed.
    sample_document='
    {
    "url" : "http://www.euruko2011.org",
    "title" : "EURUKO 2011",
    "description" : "The greatest Ruby conference in Europe!",
    "tags" : ["ruby", "conference"]
    }'

    # Throw away whatever an earlier run left behind, then create the index.
    curl --request DELETE "${es_index}"
    curl --request PUT "${es_index}" --data "${index_definition}"

    # Store the document; the request URL carries refresh=true.
    curl --request POST "http://${es_index}/url?refresh=true" --data "${sample_document}"

    # curl "http://localhost:9200/custom_analyzer_test/_analyze?text=http://euruko2011.org/speakers.html&analyzer=url_analyzer"

    # Run the search and print the raw response.
    curl "http://${es_index}/_search?q=url:peak"
    74 changes: 74 additions & 0 deletions elastic_search_ngram_analyzer_for_urls.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,74 @@
    # ========================================
    # Testing n-gram analysis in ElasticSearch
    # ========================================
    #
    # Recreates the `ngram_test` index with a custom `url_analyzer`, indexes a
    # handful of sample URLs, and runs several searches against the `url` field.
    # Talks to the ElasticSearch node at localhost:9200.

    # Drop any index left over from a previous run, then create it afresh with
    # the analyzer settings and the mapping for the `url` type.
    curl -X DELETE localhost:9200/ngram_test
    curl -X PUT localhost:9200/ngram_test -d '
    {
    "settings" : {
    "index" : {
    "analysis" : {
    "analyzer" : {
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "lowercase",
    "filter" : ["lowercase", "stop", "url_filter", "url_ngram"]
    }
    },
    "filter" : {
    "url_filter" : {
    "type" : "stop",
    "stopwords" : ["http", "https", ":", "/", ".", "html"]
    },
    "url_ngram" : {
    "type" : "nGram",
    "min_gram" : 3,
    "max_gram" : 5
    }
    }
    }
    }
    },
    "mappings": {
    "url": {
    "properties": {
    "url": {
    "type": "string",
    "analyzer": "url_analyzer",
    "boost": 10
    }
    }
    }
    }
    }
    '

    # Index one document per sample host; each document holds a single `url`
    # field of the form http://<host>.
    for host in \
      heise.de \
      heisewetter.de \
      eisenwerken.de \
      eisenwerkenberlin.de \
      urlaubinkroatien.de \
      besteurlaubinkroatien.de \
      kroatien.de
    do
      curl -X POST "http://localhost:9200/ngram_test/url" -d "{ \"url\" : \"http://${host}\" }"
    done

    # Refresh the index before searching it.
    curl -X POST "http://localhost:9200/ngram_test/_refresh"

    # curl "http://localhost:9200/ngram_test/_analyze?text=http://heise.de&analyzer=url_analyzer"

    # Search requests to run. They live in an array so every URL stays one
    # intact word: the `?` in each of them is a glob character, and expanding an
    # unquoted whitespace-separated string would expose it to pathname expansion.
    URLS=(
      'http://localhost:9200/ngram_test/_search?q=url:heise'
      'http://localhost:9200/ngram_test/_search?q=url:eis'
      'http://localhost:9200/ngram_test/_search?q=url:berlin'
      'http://localhost:9200/ngram_test/_search?q=url:wetter'
      'http://localhost:9200/ngram_test/_search?q=url:(urlaub%20kroatien)'
    )

    for url in "${URLS[@]}"
    do
      printf '\n\n>>> %s\n' "${url}"
      # Hand the URL to the `open` command when one is installed; otherwise
      # fetch it with curl and print the response here. `command -v` is a shell
      # builtin, so the check does not depend on `which` being available.
      if command -v open > /dev/null 2>&1; then
        open "${url}&pretty=true"
      else
        curl "${url}&pretty=true"
      fi
    done
  5. @karmi karmi revised this gist May 24, 2011. 1 changed file with 5 additions and 9 deletions.
    14 changes: 5 additions & 9 deletions custom_analyzer.sh
    Original file line number Diff line number Diff line change
    @@ -9,21 +9,17 @@ curl -X PUT localhost:9200/custom_analyzer_test -d '
    "analyzer" : {
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "url_ngram",
    "filter" : ["stop", "url_filter", "lowercase"]
    }
    },
    "tokenizer" : {
    "url_ngram" : {
    "type" : "nGram",
    "min_gram" : 1,
    "max_gram" : 5
    "tokenizer" : "lowercase",
    "filter" : ["lowercase", "stop", "url_filter", "url_ngram"]
    }
    },
    "filter" : {
    "url_filter" : {
    "type" : "stop",
    "stopwords" : ["http", "https", ":", "/", ".", "html"]
    },
    "url_ngram" : {
    "type" : "nGram"
    }
    }
    }
  6. @karmi karmi revised this gist May 24, 2011. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions custom_analyzer.sh
    Original file line number Diff line number Diff line change
    @@ -9,12 +9,12 @@ curl -X PUT localhost:9200/custom_analyzer_test -d '
    "analyzer" : {
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "trigram",
    "tokenizer" : "url_ngram",
    "filter" : ["stop", "url_filter", "lowercase"]
    }
    },
    "tokenizer" : {
    "trigram" : {
    "url_ngram" : {
    "type" : "nGram",
    "min_gram" : 1,
    "max_gram" : 5
  7. @karmi karmi revised this gist May 24, 2011. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions custom_analyzer.sh
    Original file line number Diff line number Diff line change
    @@ -9,15 +9,15 @@ curl -X PUT localhost:9200/custom_analyzer_test -d '
    "analyzer" : {
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "nGram",
    "tokenizer" : "trigram",
    "filter" : ["stop", "url_filter", "lowercase"]
    }
    },
    "tokenizer" : {
    "trigram" : {
    "type" : "nGram",
    "min_gram" : 3,
    "max_gram" : 3
    "min_gram" : 1,
    "max_gram" : 5
    }
    },
    "filter" : {
  8. @karmi karmi revised this gist May 24, 2011. 1 changed file with 15 additions and 2 deletions.
    17 changes: 15 additions & 2 deletions custom_analyzer.sh
    Original file line number Diff line number Diff line change
    @@ -10,7 +10,20 @@ curl -X PUT localhost:9200/custom_analyzer_test -d '
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "nGram",
    "filter" : ["stop", "lowercase"]
    "filter" : ["stop", "url_filter", "lowercase"]
    }
    },
    "tokenizer" : {
    "trigram" : {
    "type" : "nGram",
    "min_gram" : 3,
    "max_gram" : 3
    }
    },
    "filter" : {
    "url_filter" : {
    "type" : "stop",
    "stopwords" : ["http", "https", ":", "/", ".", "html"]
    }
    }
    }
    @@ -54,4 +67,4 @@ curl -X POST "http://localhost:9200/custom_analyzer_test/url?refresh=true" -d

    # curl "http://localhost:9200/custom_analyzer_test/_analyze?text=http://euruko2011.org/speakers.html&analyzer=url_analyzer"

    curl http://localhost:9200/custom_analyzer_test/_search?q=url:peak
    curl "http://localhost:9200/custom_analyzer_test/_search?q=url:peak"
  9. @karmi karmi revised this gist May 24, 2011. 1 changed file with 7 additions and 3 deletions.
    10 changes: 7 additions & 3 deletions custom_analyzer.sh
    Original file line number Diff line number Diff line change
    @@ -4,12 +4,12 @@ curl -X PUT localhost:9200/custom_analyzer_test -d '
    "settings" : {
    "index" : {
    "number_of_shards" : 1,
    "number_of_replicas" : 2,
    "number_of_replicas" : 0,
    "analysis" : {
    "analyzer" : {
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "uax_url_email",
    "tokenizer" : "nGram",
    "filter" : ["stop", "lowercase"]
    }
    }
    @@ -44,10 +44,14 @@ curl -X PUT localhost:9200/custom_analyzer_test -d '
    }
    '

    curl -X POST http://localhost:9200/custom_analyzer_test/url -d '
    curl -X POST "http://localhost:9200/custom_analyzer_test/url?refresh=true" -d '
    {
    "url" : "http://www.euruko2011.org",
    "title" : "EURUKO 2011",
    "description" : "The greatest Ruby conference in Europe!",
    "tags" : ["ruby", "conference"]
    }'

    # curl "http://localhost:9200/custom_analyzer_test/_analyze?text=http://euruko2011.org/speakers.html&analyzer=url_analyzer"

    curl http://localhost:9200/custom_analyzer_test/_search?q=url:peak
  10. @karmi karmi created this gist May 24, 2011.
    53 changes: 53 additions & 0 deletions custom_analyzer.sh
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,53 @@
    # First cut of the custom analyzer experiment: rebuilds the
    # `custom_analyzer_test` index on localhost:9200 and stores one document.
    target_index='localhost:9200/custom_analyzer_test'

    # Settings and mappings sent when the index is created.
    create_body='
    {
    "settings" : {
    "index" : {
    "number_of_shards" : 1,
    "number_of_replicas" : 2,
    "analysis" : {
    "analyzer" : {
    "url_analyzer" : {
    "type" : "custom",
    "tokenizer" : "uax_url_email",
    "filter" : ["stop", "lowercase"]
    }
    }
    }
    }
    },
    "mappings": {
    "url": {
    "properties": {
    "url": {
    "type": "string",
    "analyzer": "url_analyzer",
    "boost": 10
    },
    "title": {
    "type": "string",
    "analyzer": "snowball",
    "boost": 5
    },
    "description": {
    "type": "string",
    "analyzer": "snowball"
    },
    "tags": {
    "type": "string",
    "analyzer": "keyword"
    }
    }
    }
    }
    }
    '

    # Document posted to the `url` type.
    document_body='
    {
    "url" : "http://www.euruko2011.org",
    "title" : "EURUKO 2011",
    "description" : "The greatest Ruby conference in Europe!",
    "tags" : ["ruby", "conference"]
    }'

    # Delete the index if it is already there, then create it again.
    curl --request DELETE "${target_index}"
    curl --request PUT "${target_index}" --data "${create_body}"

    # Index the document.
    curl --request POST "http://${target_index}/url" --data "${document_body}"