
@hmldd
Last active August 8, 2024 23:41

Revisions

  1. hmldd revised this gist Jul 6, 2022. 1 changed file with 2 additions and 0 deletions.
    2 changes: 2 additions & 0 deletions scroll.py
@@ -61,3 +61,5 @@ def process_hits(hits):
 
     # Get the number of results that returned in the last scroll
     scroll_size = len(data['hits']['hits'])
+
+es.clear_scroll(scroll_id=sid)
  2. hmldd revised this gist Aug 29, 2019. 1 changed file with 4 additions and 6 deletions.
    10 changes: 4 additions & 6 deletions scroll.py
@@ -48,15 +48,13 @@ def process_hits(hits):
 sid = data['_scroll_id']
 scroll_size = len(data['hits']['hits'])
 
-# Before scroll, process current batch of hits
-process_hits(data['hits']['hits'])
-
 while scroll_size > 0:
     "Scrolling..."
-    data = es.scroll(scroll_id=sid, scroll='2m')
 
-    # Process current batch of hits
-    process_hits(data['hits']['hits'])
+    # Before scroll, process current batch of hits
+    process_hits(data['hits']['hits'])
+
+    data = es.scroll(scroll_id=sid, scroll='2m')
 
     # Update the scroll ID
     sid = data['_scroll_id']
  3. hmldd created this gist Aug 5, 2017.
    65 changes: 65 additions & 0 deletions scroll.py
@@ -0,0 +1,65 @@
# coding:utf-8

from elasticsearch import Elasticsearch
import json

# Define config
host = "127.0.0.1"
port = 9200
timeout = 1000
index = "index"
doc_type = "type"
size = 1000
body = {}

# Init Elasticsearch instance
es = Elasticsearch(
    [
        {
            'host': host,
            'port': port
        }
    ],
    timeout=timeout
)


# Process hits here
def process_hits(hits):
    for item in hits:
        print(json.dumps(item, indent=2))


# Check index exists
if not es.indices.exists(index=index):
    print("Index " + index + " not exists")
    exit()

# Init scroll by search
data = es.search(
    index=index,
    doc_type=doc_type,
    scroll='2m',
    size=size,
    body=body
)

# Get the scroll ID
sid = data['_scroll_id']
scroll_size = len(data['hits']['hits'])

# Before scroll, process current batch of hits
process_hits(data['hits']['hits'])

while scroll_size > 0:
    "Scrolling..."
    data = es.scroll(scroll_id=sid, scroll='2m')

    # Process current batch of hits
    process_hits(data['hits']['hits'])

    # Update the scroll ID
    sid = data['_scroll_id']

    # Get the number of results that returned in the last scroll
    scroll_size = len(data['hits']['hits'])
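For reference, applying the two later revisions to the original file gives the paging loop its final shape: each batch is processed before the next scroll request, and the scroll context is released once the loop ends. Assembled from the diffs above:

while scroll_size > 0:
    "Scrolling..."

    # Before scroll, process current batch of hits
    process_hits(data['hits']['hits'])

    data = es.scroll(scroll_id=sid, scroll='2m')

    # Update the scroll ID
    sid = data['_scroll_id']

    # Get the number of results that returned in the last scroll
    scroll_size = len(data['hits']['hits'])

es.clear_scroll(scroll_id=sid)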
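The bookkeeping above (tracking the scroll ID, looping until an empty page, clearing the scroll) is also wrapped by the Python client's helpers.scan utility. A minimal sketch, assuming the same host and index as the gist; accepted keyword arguments vary between elasticsearch-py versions, so treat this as an outline rather than the gist author's method:

# Sketch only: helpers.scan manages the scroll ID and clears the scroll
# context automatically when iteration stops.
import json

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])

for hit in helpers.scan(es, index="index",
                        query={"query": {"match_all": {}}},
                        scroll='2m', size=1000):
    print(json.dumps(hit, indent=2))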