d68fbe50 · May 26, 2022 06:08 · Jun 21, 2015 · Jun 1, 2015 · Jun 1, 2015 · Jun 1, 2015
diff --git a/.elasticsearch_cheat_sheet.md b/.elasticsearch_cheat_sheet.md
@@ -0,0 +1 @@
+# ElasticSearch cheat sheet
diff --git a/search.md b/search.md
@@ -22,4 +22,495 @@ GET http://localhost:9200/_search?size=5&from=0
 
 # Page 2
 GET http://localhost:9200/_search?size=5&from=5
-```
+```
+
+## Single field search (match)
+
+The go-to query when you need to run a query on any __one field__. Main use is for full-text searches.
+
+##### Catches results with "cancer" OR "research"
+
+	{
+	    "query": {
+	        "match": {
+	            "post_title": {
+	            	"query": "cancer research"
+	             }
+	        }
+	    }
+	}
+
+##### Catches results with "cancer" AND "research"
+
+	{
+	    "query": {
+	        "match": {
+	            "post_title": {
+	            	"query": "cancer research",
+	            	"operator": "AND"
+	             }
+	        }
+	    }
+	}
+
+##### Catches results with a minimum of 75% of query matched
+
+[Minimum should match docs](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-minimum-should-match.html)
+
+	{
+	    "query": {
+	        "match": {
+	            "post_title": {
+	            	"query": "cancer research",
+	            	"minimum_should_match": "75%"
+	             }
+	        }
+	    }
+	}
+
+
+## Multi-field search
+
+### Mappable query strings (advanced search)
+
+The simplest multi-field query to deal with is the one where we can map search terms to specific fields.
+
+	{
+	  "query": {
+	    "bool": {
+	      "should": [
+	        { "match": { "title":  "War and Peace" }},
+	        { "match": { "author": "Leo Tolstoy"   }}
+	      ]
+	    }
+	  }
+	}
+
+The bool query takes a more-matches-is-better approach, so the score from each match clause will be added together to provide the final score for each document. __Queries at the same level have the same weight.__
+
+	{
+	  "query": {
+	    "bool": {
+	      "should": [
+	        { "match": { "title":  "War and Peace" }},
+	        { "match": { "author": "Leo Tolstoy"   }},
+	        { "bool":  {
+	          "should": [
+	            { "match": { "translator": "Constance Garnett" }},
+	            { "match": { "translator": "Louise Maude"      }}
+	          ]
+	        }}
+	      ]
+	    }
+	  }
+	}
+
+The above query also queries for specific translators, but because it's on a lower level than the title and author queries, it doesn't contribute as much to the overall score of documents.
+
+To further boost the importance of the title and author queries, we can __boost__ their scores. Boost levels between 1 and 10 are reasonable. Higher than that, there isn't much additional effect.
+
+	{
+	  "query": {
+	    "bool": {
+	      "should": [
+	        { "match": { 
+	            "title":  {
+	              "query": "War and Peace",
+	              "boost": 2
+	        }}},
+	        { "match": { 
+	            "author":  {
+	              "query": "Leo Tolstoy",
+	              "boost": 2
+	        }}},
+	        { "bool":  { 
+	            "should": [
+	              { "match": { "translator": "Constance Garnett" }},
+	              { "match": { "translator": "Louise Maude"      }}
+	            ]
+	        }}
+	      ]
+	    }
+	  }
+	}
+
+
+### Single, unmappable query string (single search box)
+
+
+#### Best fields strategy
+
+This strategy is best when the query is likely to be found in a single field.
+
+When searching for words that represent a concept, such as "cancer research," the words mean more together than they do individually. Documents should have as many words in the query in the SAME field and the score should come from the best matching field.
+
+	{
+	    "query": {
+	        "bool": {
+	            "should": [
+	                { "match": { "title": "Brown fox" }},
+	                { "match": { "body":  "Brown fox" }}
+	            ]
+	        }
+	    }
+	}
+
+The bool query calculates the score like this:
+
+* Run both queries in the should clause
+* Add scores together, then multiply by the number of matching clauses
+* Divide by the total number of clauses (2)
+
+This has the potential to give irrelevant results because if "brown" or "fox" is NOT found in one of the fields, it seriously affects the relevance results. The "title" and "body" fields are competing with each other.
+
+What if we used the score from the best-matching field as the overall score for the query? This would give preference to a single field that contains both of the words we are looking for, rather than preference to the same word repeated in different fields.
+
+##### dis_max ("OR") query
+
+Returns documents that match any of these queries and returns the score of the best matching query.
+
+	{
+	    "query": {
+	        "dis_max": {
+	            "queries": [
+	                { "match": { "title": "Brown fox" }},
+	                { "match": { "body":  "Brown fox" }}
+	            ]
+	        }
+	    }
+	}
+
+Sometimes you may need to employ a tie breaking strategy if one word is found in each field -- this would result in every document having fields with equal scores.
+
+###### tie_breaker
+
+Adding a tie breaker allows you to take the scores from the other matching clauses into account: the score of each other matching clause is multiplied by the tie breaker (0.3 in the example below) and added to the score of the best matching clause. With a tie breaker, all matching clauses count, but the best matching clause counts the most. Keep the tie breaker between 0.1 and 0.4.
+
+	{
+	    "query": {
+	        "dis_max": {
+	            "queries": [
+	                { "match": { "title": "Quick pets" }},
+	                { "match": { "body":  "Quick pets" }}
+	            ],
+	            "tie_breaker": 0.3
+	        }
+	    }
+	}
+
+##### Shorthand
+
+You can use the multi_match to run the same query in a quicker way.
+
+	{
+	    "multi_match": {
+	        "query":                "Quick brown fox",
+	        "type":                 "best_fields", 
+	        "fields":               [ "title", "body" ],
+	        "tie_breaker":          0.3,
+	        "minimum_should_match": "30%" 
+	    }
+	}
+
+##### Wildcards in field names
+
+	{
+	    "multi_match": {
+	        "query":                "Quick brown fox",
+	        "type":                 "best_fields", 
+	        "fields":               [ "*_title", "body" ],
+	        "tie_breaker":          0.3,
+	        "minimum_should_match": "30%" 
+	    }
+	}
+
+
+##### Boosting individual fields
+
+	{
+	    "multi_match": {
+	        "query":                "Quick brown fox",
+	        "type":                 "best_fields", 
+	        "fields":               [ "*_title", "body^2" ],
+	        "tie_breaker":          0.3,
+	        "minimum_should_match": "30%" 
+	    }
+	}
+
+
+#### Most fields strategy
+
+Designed to find the most fields matching any words, rather than to find the most matching words across all fields. Cannot use the minimum_should_match parameter to reduce long tail of less relevant results. Term frequencies are different in each field and could interfere with each other to produce badly ordered results. Field-centric instead of term-centric.
+
+A common technique for fine-tuning relevance is to index the same data into multiple fields, each with their own analysis chain.
+
+The main field may contain words in the stemmed form and synonyms. It is used to match as many documents as possible.
+
+The same text could then be indexed into other fields to provide more precise matching. One field may contain the unstemmed version, another removes accent marks, and another may use shingles to provide information about word proximity.
+
+These other fields act as _signals_ to increase the relevance score of each matching document. __The more fields that match the better.__
+
+
+##### Multifield mapping
+
+	"mappings": {
+        "my_type": {
+            "properties": {
+                "title": { 
+                    "type":     "string",
+                    "analyzer": "english",
+                    "fields": {
+                        "std":   { 
+                            "type":     "string",
+                            "analyzer": "standard"
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+The title field is stemmed by the english analyzer, while the title.std uses the standard analyzer, so it is not stemmed.
+
+	{
+	   "query": {
+	        "multi_match": {
+	            "query":  "jumping rabbits",
+	            "type":   "most_fields", 
+	            "fields": [ "title", "title.std" ]
+	        }
+	    }
+	}
+
+The query checks against both the stemmed and unstemmed fields and combines the scores from all matching fields. So if a document contains the __exact__ words from the query, it will rank higher than a document that matches only the stemmed versions.
+
+
+#### Cross fields strategy
+
+This strategy is best if the query is likely to be found across multiple fields (address). Takes a term-centric approach.
+
+For some entities, the identifying information is spread across multiple fields, each of which contains just part of the whole (first name field, last name field). In this case, we want to find as many words as possible in any of the listed fields.
+
+	{
+	    "query": {
+	        "multi_match": {
+	            "query":       "peter smith",
+	            "type":        "cross_fields", 
+	            "operator":    "and",
+	            "fields":      [ "first_name", "last_name" ]
+	        }
+	    }
+	}
+
+
+
+
+## Custom _all fields
+
+The _all field indexes the values from all other fields as one big string. You can create custom _all fields to get the same effect with other fields. For example, combining a first_name and last_name field into one field:
+
+	{
+	    "mappings": {
+	        "person": {
+	            "properties": {
+	                "first_name": {
+	                    "type":     "string",
+	                    "copy_to":  "full_name" 
+	                },
+	                "last_name": {
+	                    "type":     "string",
+	                    "copy_to":  "full_name" 
+	                },
+	                "full_name": {
+	                    "type":     "string"
+	                }
+	            }
+	        }
+	    }
+	}
+
+
+## Proximity matching (match_phrase)
+
+Phrase queries give higher relevance to documents in which the query words appear closer together, but they require all terms to be present.
+
+	{
+	    "query": {
+	        "match_phrase": {
+	            "title": "quick brown fox"
+	        }
+	    }
+	}
+
+OR
+
+	"match": {
+	    "title": {
+	        "query": "quick brown fox",
+	        "type":  "phrase"
+	    }
+	}
+
+What match_phrase does:
+
+* Analyzes the query string to produce a list of terms
+* Searches for all the terms, but only keeps documents which contain all of the search terms in the same positions, relative to each other.
+
+To be less strict about positioning (if we want "quick fox" to return), we can introduce the "slop" parameter that tells the query how far apart terms are allowed to be while still considering it a match.
+
+	{
+	    "query": {
+	        "match_phrase": {
+	            "title": { "query": "quick brown fox",
+	                       "slop":  1 }
+	        }
+	    }
+	}
+
+If you give a higher slop value, say 50, the query will still give back documents where words aren't super close together, but it will give a higher score to documents where the words are closer together.
+
+#### Make sure terms in arrays aren't positioned next to each other
+
+	{
+	    "properties": {
+	        "names": {
+	            "type":                "string",
+	            "position_offset_gap": 100
+	        }
+	    }
+	}
+
+#### Use proximity query as a signal
+
+Since proximity queries exclude results that do not contain all terms, we can implement the proximity query as a signal -- as one of potentially many queries, each of which contributes to the overall score for each document (most fields).
+
+	{
+	  "query": {
+	    "bool": {
+	      "must": {
+	        "match": { 
+	          "title": {
+	            "query":                "quick brown fox",
+	            "minimum_should_match": "30%"
+	          }
+	        }
+	      },
+	      "should": {
+	        "match_phrase": { 
+	          "title": {
+	            "query": "quick brown fox",
+	            "slop":  50
+	          }
+	        }
+	      }
+	    }
+	  }
+	}
+
+This query uses the match_phrase to help with relevance, while the match query is used to determine which documents are returned.
+
+## Improving performance
+
+Phrase and proximity queries are expensive. Some ways to help with query time:
+
+### Rescore results
+
+A simple match query will already have ranked documents which contain all search terms near the top of the list. Really, we just want to rerank the top results to give an extra relevance bump to documents that also match the phrase query. Taking the above query, let's just rescore the top results:
+
+	{
+	    "query": {
+	        "match": {  
+	            "title": {
+	                "query":                "quick brown fox",
+	                "minimum_should_match": "30%"
+	            }
+	        }
+	    },
+	    "rescore": {
+	        "window_size": 50, 
+	        "query": {         
+	            "rescore_query": {
+	                "match_phrase": {
+	                    "title": {
+	                        "query": "quick brown fox",
+	                        "slop":  50
+	                    }
+	                }
+	            }
+	        }
+	    }
+	}
+
+window_size is the number of top results to rescore.
+
+
+## Shingles
+
+Shingles group adjacent words together (2, 3, 4, etc. words) to maintain the meaning between words. They can be a good alternative to match_phrase queries because they are a lot quicker.
+
+### Creating shingles
+
+#### Analyzer
+
+	{
+	    "settings": {
+	        "number_of_shards": 1,  
+	        "analysis": {
+	            "filter": {
+	                "my_shingle_filter": {
+	                    "type":             "shingle",
+	                    "min_shingle_size": 2, 
+	                    "max_shingle_size": 2, 
+	                    "output_unigrams":  false   
+	                }
+	            },
+	            "analyzer": {
+	                "my_shingle_analyzer": {
+	                    "type":             "custom",
+	                    "tokenizer":        "standard",
+	                    "filter": [
+	                        "lowercase",
+	                        "my_shingle_filter" 
+	                    ]
+	                }
+	            }
+	        }
+	    }
+	}
+
+#### Field mapping
+
+	{
+	    "my_type": {
+	        "properties": {
+	            "title": {
+	                "type": "string",
+	                "fields": {
+	                    "shingles": {
+	                        "type":     "string",
+	                        "analyzer": "my_shingle_analyzer"
+	                    }
+	                }
+	            }
+	        }
+	    }
+	}
+
+#### Add shingles as a signal
+
+	{
+	   "query": {
+	      "bool": {
+	         "must": {
+	            "match": {
+	               "title": "the hungry alligator ate sue"
+	            }
+	         },
+	         "should": {
+	            "match": {
+	               "title.shingles": "the hungry alligator ate sue"
+	            }
+	         }
+	      }
+	   }
+	}
diff --git a/basic.md → data-manipulation.md b/basic.md → data-manipulation.md
@@ -1,5 +1,4 @@
-# Basic stuff
-## GET, PUT, DELETE data, indicies, etc...
+# Data manipulation
 
 #### GET data
 ```

diff --git a/basic.md b/basic.md
@@ -1,4 +1,5 @@
-# Basic stuff GET, PUT, DELETE data, indicies, etc...
+# Basic stuff
+## GET, PUT, DELETE data, indicies, etc...
 
 #### GET data
 ```

diff --git a/analyzers.md b/analyzers.md
@@ -2,5 +2,5 @@
 
 ### Test an analyzer
 ```
-http://localhost:9200/:index/_analyze?analyzer=default&text=test+text
+GET http://localhost:9200/:index/_analyze?analyzer=default&text=test+text
 ```
diff --git a/basic.md b/basic.md
@@ -0,0 +1,26 @@
+# Basic stuff GET, PUT, DELETE data, indicies, etc...
+
+#### GET data
+```
+GET http://localhost:9200/:index/:type/:id
+```
+
+#### PUT (or update) data
+```
+PUT http://localhost:9200/:index/:type/(:id)
+```
+
+#### DELETE data
+```	
+# All resources
+DELETE http://localhost:9200/_all
+
+# An index
+DELETE http://localhost:9200/:index
+
+# A type
+DELETE http://localhost:9200/:index/:type
+
+# An item
+DELETE http://localhost:9200/:index/:type/:id
+```
diff --git a/elasticsearch.md → mapping.md b/elasticsearch.md → mapping.md
@@ -1,31 +1,4 @@
-# Elastic search cheat sheet
-
-#### GET data
-```
-GET http://localhost:9200/:index/:type/:id
-```
-
-#### PUT (or update) data
-```
-PUT http://localhost:9200/:index/:type/(:id)
-```
-
-#### DELETE data
-```	
-# All resources
-DELETE http://localhost:9200/_all
-
-# An index
-DELETE http://localhost:9200/:index
-
-# A type
-DELETE http://localhost:9200/:index/:type
-
-# An item
-DELETE http://localhost:9200/:index/:type/:id
-```
-
-#### Datatype mapping
+# Datatype mapping
 
 When you insert data into elastic search, it uses dynamic detection to determine what kind of data each field is.
 
@@ -48,7 +21,7 @@ GET http://localhost:9200/:index/:type/_mapping
 
 
 ##### PUT (or update) mapping
-
+```
 	PUT http://localhost:9200/:index/:type -d '{
 		"mappings": {
 			"tweet": {
@@ -58,4 +31,5 @@ GET http://localhost:9200/:index/:type/_mapping
 				
 			}
 		}
-	}'
+	}'
+```
diff --git a/search.md b/search.md
@@ -1,3 +1,5 @@
+# Search
+
 The `hits` object gives you the top 10 hits that matched the query. The `score` represents how well the results matched the query.
 ```
 # Entire database

diff --git a/elasticsearch.md b/elasticsearch.md
@@ -25,32 +25,6 @@ DELETE http://localhost:9200/:index/:type
 DELETE http://localhost:9200/:index/:type/:id
 ```
 
-#### Searching
-
-The `hits` object gives you the top 10 hits that matched the query. The `score` represents how well the results matched the query.
-```
-# Entire database
-GET http://localhost:9200/_search
-
-# One index
-GET http://localhost:9200/:index/_search
-
-# Multuple indecies
-GET http://localhost:9200/:index,:index/_search
-
-# Wildcards
-GET http://localhost:9200/hub*/_search
-```
-
-##### Pagination
-```
-# Page 1
-GET http://localhost:9200/_search?size=5&from=0
-
-# Page 2
-GET http://localhost:9200/_search?size=5&from=5
-```
-
 #### Datatype mapping
 
 When you insert data into elastic search, it uses dynamic detection to determine what kind of data each field is.

diff --git a/search.md b/search.md
@@ -0,0 +1,23 @@
+The `hits` object gives you the top 10 hits that matched the query. The `score` represents how well the results matched the query.
+```
+# Entire database
+GET http://localhost:9200/_search
+
+# One index
+GET http://localhost:9200/:index/_search
+
+# Multuple indecies
+GET http://localhost:9200/:index,:index/_search
+
+# Wildcards
+GET http://localhost:9200/hub*/_search
+```
+
+##### Pagination
+```
+# Page 1
+GET http://localhost:9200/_search?size=5&from=0
+
+# Page 2
+GET http://localhost:9200/_search?size=5&from=5
+```
diff --git a/analyzers.md b/analyzers.md
@@ -0,0 +1,6 @@
+# Analyzers
+
+### Test an analyzer
+```
+http://localhost:9200/:index/_analyze?analyzer=default&text=test+text
+```
diff --git a/elasticsearch.md b/elasticsearch.md
@@ -1,71 +1,64 @@
 # Elastic search cheat sheet
 
 #### GET data
-
-	GET http://localhost:9200/:index/:type/:id
+```
+GET http://localhost:9200/:index/:type/:id
+```
 
 #### PUT (or update) data
+```
+PUT http://localhost:9200/:index/:type/(:id)
+```
 
-	PUT http://localhost:9200/:index/:type/(:id)
+#### DELETE data
+```	
+# All resources
+DELETE http://localhost:9200/_all
 
+# An index
+DELETE http://localhost:9200/:index
 
-#### Check data existence
+# A type
+DELETE http://localhost:9200/:index/:type
 
-	HEAD localhost:9200/:index/:type/:id
-
-#### DELETE data
-
-	# All resources
-	DELETE http://localhost:9200/_all
-
-	# An index
-	DELETE http://localhost:9200/:index
-
-	# A type
-	DELETE http://localhost:9200/:index/:type
-
-	# An item
-	DELETE http://localhost:9200/:index/:type/:id
+# An item
+DELETE http://localhost:9200/:index/:type/:id
+```
 
 #### Searching
 
 The `hits` object gives you the top 10 hits that matched the query. The `score` represents how well the results matched the query.
+```
+# Entire database
+GET http://localhost:9200/_search
 
-	# Empty searches
+# One index
+GET http://localhost:9200/:index/_search
 
-	# Entire database
-	GET http://localhost:9200/_search
-
-	# One index
-	GET http://localhost:9200/:index/_search
-
-	# Multuple indecies
-	GET http://localhost:9200/:index,:index/_search
-
-	# Wildcards
-	GET http://localhost:9200/hub*/_search
+# Multuple indecies
+GET http://localhost:9200/:index,:index/_search
 
-##### Pagination
-
-	# Page 1
-	GET http://localhost:9200/_search?size=5&from=0
-
-	# Page 2
-	GET http://localhost:9200/_search?size=5&from=5
+# Wildcards
+GET http://localhost:9200/hub*/_search
+```
 
-##### On fields
-
-Good for development, not for production. If you do not specify a field, elasticsearch automatically uses the `_all` field, which searches through all fields.
+##### Pagination
+```
+# Page 1
+GET http://localhost:9200/_search?size=5&from=0
 
-	GET /_search?q:fieldName:query
+# Page 2
+GET http://localhost:9200/_search?size=5&from=5
+```
 
 #### Datatype mapping
 
 When you insert data into elastic search, it uses dynamic detection to determine what kind of data each field is.
 
 ##### Get mapping
-
-	GET http://localhost:9200/:index/:type/_mapping
+```
+GET http://localhost:9200/:index/:type/_mapping
+```
 
 ##### Changing mapping
 
@@ -74,8 +67,6 @@ When you insert data into elastic search, it uses dynamic detection to determine
 
 ##### Reindex data
 
-**Lookup more detailed information on how to do this**
-
 * Create a new index with the new mapping (see PUT mapping below)
 * Pull in documents from the old index using a [scrolled search](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-request-scroll.html) and index them to the new index using the [bulk API](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/docs-bulk.html). Note: make sure that you include search_type=scan in your search request. This disables sorting and makes "deep paging" efficient.
 * Update [index alias](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-aliases.html).

diff --git a/elasticsearch.md b/elasticsearch.md
@@ -0,0 +1,96 @@
+# Elastic search cheat sheet
+
+#### GET data
+
+	GET http://localhost:9200/:index/:type/:id
+
+#### PUT (or update) data
+
+	PUT http://localhost:9200/:index/:type/(:id)
+
+
+#### Check data existence
+
+	HEAD localhost:9200/:index/:type/:id
+
+#### DELETE data
+
+	# All resources
+	DELETE http://localhost:9200/_all
+
+	# An index
+	DELETE http://localhost:9200/:index
+
+	# A type
+	DELETE http://localhost:9200/:index/:type
+
+	# An item
+	DELETE http://localhost:9200/:index/:type/:id
+
+#### Searching
+
+The `hits` object gives you the top 10 hits that matched the query. The `score` represents how well the results matched the query.
+
+	# Empty searches
+
+	# Entire database
+	GET http://localhost:9200/_search
+
+	# One index
+	GET http://localhost:9200/:index/_search
+
+	# Multuple indecies
+	GET http://localhost:9200/:index,:index/_search
+
+	# Wildcards
+	GET http://localhost:9200/hub*/_search
+
+##### Pagination
+
+	# Page 1
+	GET http://localhost:9200/_search?size=5&from=0
+
+	# Page 2
+	GET http://localhost:9200/_search?size=5&from=5
+
+##### On fields
+
+Good for development, not for production. If you do not specify a field, elasticsearch automatically uses the `_all` field, which searches through all fields.
+
+	GET /_search?q:fieldName:query
+
+#### Datatype mapping
+
+When you insert data into elastic search, it uses dynamic detection to determine what kind of data each field is.
+
+##### Get mapping
+
+	GET http://localhost:9200/:index/:type/_mapping
+
+##### Changing mapping
+
+* If you are adding fields, there is no need to reindex.
+* If you need to change a field, you need to reindex your data. [An article on the subject](http://www.elasticsearch.org/blog/changing-mapping-with-zero-downtime/)
+
+##### Reindex data
+
+**Lookup more detailed information on how to do this**
+
+* Create a new index with the new mapping (see PUT mapping below)
+* Pull in documents from the old index using a [scrolled search](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-request-scroll.html) and index them to the new index using the [bulk API](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/docs-bulk.html). Note: make sure that you include search_type=scan in your search request. This disables sorting and makes "deep paging" efficient.
+* Update [index alias](http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-aliases.html).
+* Delete the old index
+
+
+##### PUT (or update) mapping
+
+	PUT http://localhost:9200/:index/:type -d '{
+		"mappings": {
+			"tweet": {
+				"properties": {
+					…
+				}
+				
+			}
+		}
+	}'