
@hmldd
Last active August 8, 2024 23:41

Revisions

  1. hmldd revised this gist Jul 6, 2022. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions scroll.py
@@ -61,3 +61,5 @@ def process_hits(hits):
 
     # Get the number of results that returned in the last scroll
     scroll_size = len(data['hits']['hits'])
+
+es.clear_scroll(scroll_id=sid)
  2. hmldd revised this gist Aug 29, 2019. 1 changed file with 4 additions and 6 deletions.
    10 changes: 4 additions & 6 deletions scroll.py
@@ -48,15 +48,13 @@ def process_hits(hits):
 sid = data['_scroll_id']
 scroll_size = len(data['hits']['hits'])
 
-# Before scroll, process current batch of hits
-process_hits(data['hits']['hits'])
-
 while scroll_size > 0:
     "Scrolling..."
-    data = es.scroll(scroll_id=sid, scroll='2m')
 
-    # Process current batch of hits
-    process_hits(data['hits']['hits'])
+    # Before scroll, process current batch of hits
+    process_hits(data['hits']['hits'])
+
+    data = es.scroll(scroll_id=sid, scroll='2m')
 
     # Update the scroll ID
     sid = data['_scroll_id']
  3. hmldd created this gist Aug 5, 2017.
    65 changes: 65 additions & 0 deletions scroll.py
@@ -0,0 +1,65 @@
# coding:utf-8

from elasticsearch import Elasticsearch
import json

# Define config
host = "127.0.0.1"
port = 9200
timeout = 1000
index = "index"
doc_type = "type"
size = 1000
body = {}

# Init Elasticsearch instance
es = Elasticsearch(
    [
        {
            'host': host,
            'port': port
        }
    ],
    timeout=timeout
)


# Process hits here
def process_hits(hits):
    for item in hits:
        print(json.dumps(item, indent=2))


# Check index exists
if not es.indices.exists(index=index):
    print("Index " + index + " not exists")
    exit()

# Init scroll by search
data = es.search(
    index=index,
    doc_type=doc_type,
    scroll='2m',
    size=size,
    body=body
)

# Get the scroll ID
sid = data['_scroll_id']
scroll_size = len(data['hits']['hits'])

# Before scroll, process current batch of hits
process_hits(data['hits']['hits'])

while scroll_size > 0:
    "Scrolling..."
    data = es.scroll(scroll_id=sid, scroll='2m')

    # Process current batch of hits
    process_hits(data['hits']['hits'])

    # Update the scroll ID
    sid = data['_scroll_id']

    # Get the number of results that returned in the last scroll
    scroll_size = len(data['hits']['hits'])
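For reference, applying the two later revisions to the original file gives the paging loop its final shape: each batch is processed before the next scroll request, and the scroll context is released once the loop ends. Assembled from the diffs above:

while scroll_size > 0:
    "Scrolling..."

    # Before scroll, process current batch of hits
    process_hits(data['hits']['hits'])

    data = es.scroll(scroll_id=sid, scroll='2m')

    # Update the scroll ID
    sid = data['_scroll_id']

    # Get the number of results that returned in the last scroll
    scroll_size = len(data['hits']['hits'])

es.clear_scroll(scroll_id=sid)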
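The bookkeeping above (tracking the scroll ID, looping until an empty page, clearing the scroll) is also wrapped by the Python client's helpers.scan utility. A minimal sketch, assuming the same host and index as the gist; accepted keyword arguments vary between elasticsearch-py versions, so treat this as an outline rather than the gist author's method:

# Sketch only: helpers.scan manages the scroll ID and clears the scroll
# context automatically when iteration stops.
import json

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])

for hit in helpers.scan(es, index="index",
                        query={"query": {"match_all": {}}},
                        scroll='2m', size=1000):
    print(json.dumps(hit, indent=2))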