Skip to content

Instantly share code, notes, and snippets.

@GuyPaddock
Created August 8, 2024 03:31
Show Gist options
  • Save GuyPaddock/3c5bdadfb2ad4d8ba794834806a2308c to your computer and use it in GitHub Desktop.
Save GuyPaddock/3c5bdadfb2ad4d8ba794834806a2308c to your computer and use it in GitHub Desktop.

Revisions

  1. GuyPaddock created this gist Aug 8, 2024.
    45 changes: 45 additions & 0 deletions index_data.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,45 @@
    import json
    from elasticsearch import Elasticsearch

    # Load the raw course documents. JSON text is UTF-8 by specification, so
    # the encoding is pinned here instead of being left to the platform's
    # locale default, which is not UTF-8 on every system.
    with open("documents.json", "r", encoding="utf-8") as file:
        documents_raw = json.load(file)

    # Collapse the per-course grouping into one flat list, stamping every
    # document (in place) with the name of the course it came from.
    documents = []
    for entry in documents_raw:
        course_label = entry['course']
        course_docs = entry['documents']
        for item in course_docs:
            item['course'] = course_label
        documents.extend(course_docs)

    # Client for the Elasticsearch node at localhost:9200; shared by the
    # index-creation and indexing code below.
    es_client = Elasticsearch(hosts='http://localhost:9200')

    # Define index settings and mappings
    def create_index(index_name="course-questions"):
        """Create the index *index_name* unless it already exists.

        The index is created with one shard and no replicas. The ``text``,
        ``section`` and ``question`` fields are mapped as ``text`` and the
        ``course`` field as ``keyword``.

        Uses the module-level ``es_client`` and returns ``None``.
        """
        # Nothing to do when the index is already present.
        if es_client.indices.exists(index=index_name):
            return

        shard_config = {
            "number_of_shards": 1,
            "number_of_replicas": 0,
        }
        field_mappings = {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
        es_client.indices.create(
            index=index_name,
            body={
                "settings": shard_config,
                "mappings": {"properties": field_mappings},
            },
        )

    # Name the target index once so that index creation and document indexing
    # cannot drift onto different indices if the name is ever changed.
    index_name = "course-questions"
    create_index(index_name)

    # Index the flattened documents, one request per document.
    for doc in documents:
        es_client.index(index=index_name, body=doc)

    print("Data indexed successfully")