{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n",
      "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    }
   ],
   "source": [
    "%pip install --quiet pandas neo4j-rust-ext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from neo4j import GraphDatabase\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "NEO4J_URI=\"bolt://localhost\"\n",
    "NEO4J_USERNAME=\"neo4j\"\n",
    "NEO4J_PASSWORD=\"password\"\n",
    "NEO4J_DATABASE=\"graphrag\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def batched_import(statement, df, batch_size=1000):\n",
    "    total = len(df)\n",
    "    start_s = time.time()\n",
    "    for start in range(0,total, batch_size):\n",
    "        batch = df.iloc[start: min(start+batch_size,total)]\n",
    "        result = driver.execute_query(\"UNWIND $rows AS value \" + statement, \n",
    "                                      rows=batch.to_dict('records'),\n",
    "                                      database_=NEO4J_DATABASE)\n",
    "        print(result.summary.counters)\n",
    "    print(f'{total} rows in { time.time() - start_s} s.')    \n",
    "    return total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique\n",
      "\n",
      "create constraint document_id if not exists for (d:__Document__) require d.id is unique\n",
      "\n",
      "create constraint entity_id if not exists for (c:__Community__) require c.community is unique\n",
      "\n",
      "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique\n",
      "\n",
      "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique\n",
      "\n",
      "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique\n"
     ]
    }
   ],
   "source": [
    "# create constraints\n",
    "\n",
    "statements = \"\"\"\n",
    "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;\n",
    "create constraint document_id if not exists for (d:__Document__) require d.id is unique;\n",
    "create constraint entity_id if not exists for (c:__Community__) require c.community is unique;\n",
    "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;\n",
    "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;\n",
    "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;\n",
    "\"\"\".split(\";\")\n",
    "\n",
    "for s in statements:\n",
    "    if len((s or \"\").strip()) > 0:\n",
    "        print(s)\n",
    "        driver.execute_query(query_=s,database_=NEO4J_DATABASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "GRAPHRAG_FOLDER=\"/Users/mh/d/llm/graphrag/ragtest/output/20240703-144633/artifacts\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>level</th>\n",
       "      <th>clustered_graph</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>&lt;graphml xmlns=\"http://graphml.graphdrawing.or...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>&lt;graphml xmlns=\"http://graphml.graphdrawing.or...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>&lt;graphml xmlns=\"http://graphml.graphdrawing.or...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   level                                    clustered_graph\n",
       "0      0  <graphml xmlns=\"http://graphml.graphdrawing.or...\n",
       "1      1  <graphml xmlns=\"http://graphml.graphdrawing.or...\n",
       "2      2  <graphml xmlns=\"http://graphml.graphdrawing.or..."
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_entity_graph.parquet')\n",
    "df.head()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'properties_set': 1}\n",
      "1 rows in 0.0053479671478271484 s.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>c305886e4aa2f6efcf64b57762777055</td>\n",
       "      <td>book.txt</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                 id     title\n",
       "0  c305886e4aa2f6efcf64b57762777055  book.txt"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import documents\n",
    "statement = \"\"\"\n",
    "MERGE (d:__Document__ {id:value.id})\n",
    "SET d += value {.title}\n",
    "// , text_unit_ids:value.text_unit_ids, raw_content:substring(value.raw_content,0,1000)};\n",
    "\"\"\"\n",
    "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_documents.parquet', columns=[\"id\", \"title\"])\n",
    "\n",
    "batched_import(statement, df)\n",
    "\n",
    "df.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'properties_set': 462}\n",
      "231 rows in 0.02997303009033203 s.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>chunk_id</th>\n",
       "      <th>chunk</th>\n",
       "      <th>n_tokens</th>\n",
       "      <th>document_ids</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>680dd6d2a970a49082fa4f34bf63a34e</td>\n",
       "      <td>﻿The Project Gutenberg eBook of A Christmas Ca...</td>\n",
       "      <td>300</td>\n",
       "      <td>[c305886e4aa2f6efcf64b57762777055]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>95f1f8f5bdbf0bee3a2c6f2f4a4907f6</td>\n",
       "      <td>THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...</td>\n",
       "      <td>300</td>\n",
       "      <td>[c305886e4aa2f6efcf64b57762777055]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           chunk_id  \\\n",
       "0  680dd6d2a970a49082fa4f34bf63a34e   \n",
       "1  95f1f8f5bdbf0bee3a2c6f2f4a4907f6   \n",
       "\n",
       "                                               chunk  n_tokens  \\\n",
       "0  ﻿The Project Gutenberg eBook of A Christmas Ca...       300   \n",
       "1   THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...       300   \n",
       "\n",
       "                         document_ids  \n",
       "0  [c305886e4aa2f6efcf64b57762777055]  \n",
       "1  [c305886e4aa2f6efcf64b57762777055]  "
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import text units\n",
    "statement = \"\"\"\n",
    "MERGE (c:__Chunk__ {id:value.chunk_id})\n",
    "SET c += value {.chunk, .n_tokens}\n",
    "WITH *\n",
    "UNWIND value.document_ids as doc_id\n",
    "MATCH (d:__Document__ {id:doc_id})\n",
    "MERGE (d)<-[:PART_OF]-(c)\n",
    "RETURN count(distinct c) as chunksCreated\n",
    "\"\"\"\n",
    "\n",
    "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_text_units.parquet', \n",
    "                     columns=[\"chunk_id\",\"chunk\",\"n_tokens\",\"document_ids\"])\n",
    "\n",
    "batched_import(statement, df)\n",
    "\n",
    "df.head(2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'properties_set': 4155}\n",
      "831 rows in 0.13263273239135742 s.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>level</th>\n",
       "      <th>title</th>\n",
       "      <th>type</th>\n",
       "      <th>description</th>\n",
       "      <th>source_id</th>\n",
       "      <th>human_readable_id</th>\n",
       "      <th>id</th>\n",
       "      <th>top_level_node_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>\"PROJECT GUTENBERG\"</td>\n",
       "      <td>\"ORGANIZATION\"</td>\n",
       "      <td>Project Gutenberg is a pioneering organization...</td>\n",
       "      <td>01e84646075b255eab0a34d872336a89,10bab8e9773ee...</td>\n",
       "      <td>0</td>\n",
       "      <td>b45241d70f0e43fca764df95b2b81f77</td>\n",
       "      <td>b45241d70f0e43fca764df95b2b81f77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>\"UNITED STATES\"</td>\n",
       "      <td>\"GEO\"</td>\n",
       "      <td>The United States is prominently recognized fo...</td>\n",
       "      <td>01e84646075b255eab0a34d872336a89,28f242c451594...</td>\n",
       "      <td>1</td>\n",
       "      <td>4119fd06010c494caa07f439b333f4c5</td>\n",
       "      <td>4119fd06010c494caa07f439b333f4c5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   level                title            type  \\\n",
       "0      0  \"PROJECT GUTENBERG\"  \"ORGANIZATION\"   \n",
       "1      0      \"UNITED STATES\"           \"GEO\"   \n",
       "\n",
       "                                         description  \\\n",
       "0  Project Gutenberg is a pioneering organization...   \n",
       "1  The United States is prominently recognized fo...   \n",
       "\n",
       "                                           source_id  human_readable_id  \\\n",
       "0  01e84646075b255eab0a34d872336a89,10bab8e9773ee...                  0   \n",
       "1  01e84646075b255eab0a34d872336a89,28f242c451594...                  1   \n",
       "\n",
       "                                 id                 top_level_node_id  \n",
       "0  b45241d70f0e43fca764df95b2b81f77  b45241d70f0e43fca764df95b2b81f77  \n",
       "1  4119fd06010c494caa07f439b333f4c5  4119fd06010c494caa07f439b333f4c5  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import nodes\n",
    "\n",
    "statement = \"\"\"\n",
    "MERGE (n:__Entity__ {id:value.id})\n",
    "SET n += value {.level, .top_level_node_id, .human_readable_id, .description, \n",
    "    title:replace(value.title,'\"','')}\n",
    "WITH n, value\n",
    "CALL apoc.create.addLabels(n, case when value.type is null then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n",
    "UNWIND split(value.source_id,\",\") as source_id\n",
    "MATCH (c:__Chunk__ {id:source_id})\n",
    "RETURN count(distinct n) as createdNodes\n",
    "\"\"\"\n",
    "\n",
    "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_nodes.parquet',\n",
    "                     columns=[\"level\",\"title\",\"type\",\"description\",\"source_id\",\"human_readable_id\",\"id\",\"top_level_node_id\"])\n",
    "\n",
    "batched_import(statement, df)\n",
    "df.head(2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'properties_set': 2052}\n",
      "342 rows in 0.013482093811035156 s.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source</th>\n",
       "      <th>target</th>\n",
       "      <th>id</th>\n",
       "      <th>rank</th>\n",
       "      <th>weight</th>\n",
       "      <th>human_readable_id</th>\n",
       "      <th>description</th>\n",
       "      <th>text_unit_ids</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>\"PROJECT GUTENBERG\"</td>\n",
       "      <td>\"A CHRISTMAS CAROL\"</td>\n",
       "      <td>b84d71ed9c3b45819eb3205fd28e13a0</td>\n",
       "      <td>20</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>\"Project Gutenberg is responsible for releasin...</td>\n",
       "      <td>[680dd6d2a970a49082fa4f34bf63a34e]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\"PROJECT GUTENBERG\"</td>\n",
       "      <td>\"SUZANNE SHELL\"</td>\n",
       "      <td>b0b464bc92a541e48547fe9738378dab</td>\n",
       "      <td>15</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "      <td>\"Suzanne Shell produced the eBook version of '...</td>\n",
       "      <td>[680dd6d2a970a49082fa4f34bf63a34e]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                source               target                                id  \\\n",
       "0  \"PROJECT GUTENBERG\"  \"A CHRISTMAS CAROL\"  b84d71ed9c3b45819eb3205fd28e13a0   \n",
       "1  \"PROJECT GUTENBERG\"      \"SUZANNE SHELL\"  b0b464bc92a541e48547fe9738378dab   \n",
       "\n",
       "   rank  weight human_readable_id  \\\n",
       "0    20     1.0                 0   \n",
       "1    15     1.0                 1   \n",
       "\n",
       "                                         description  \\\n",
       "0  \"Project Gutenberg is responsible for releasin...   \n",
       "1  \"Suzanne Shell produced the eBook version of '...   \n",
       "\n",
       "                        text_unit_ids  \n",
       "0  [680dd6d2a970a49082fa4f34bf63a34e]  \n",
       "1  [680dd6d2a970a49082fa4f34bf63a34e]  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import relationships\n",
    "\n",
    "statement = \"\"\"\n",
    "    MATCH (source:__Entity__ {title:replace(value.source,'\"','')})\n",
    "    MATCH (target:__Entity__ {title:replace(value.target,'\"','')})\n",
    "    // todo rel-type from source-target labels?\n",
    "    MERGE (source)-[rel:RELATED]->(target)\n",
    "    SET rel += value {.id, .rank, .weight, .human_readable_id, .description, text_unit_ids:value.text_unit_ids}\n",
    "    RETURN count(*) as createdRels\n",
    "\"\"\"\n",
    "\n",
    "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',\n",
    "                     columns=[\"source\",\"target\",\"id\",\"rank\",\"weight\",\"human_readable_id\",\"description\",\"text_unit_ids\"])\n",
    "\n",
    "batched_import(statement, df)\n",
    "\n",
    "df.head(2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'properties_set': 94}\n",
      "47 rows in 0.021432161331176758 s.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>level</th>\n",
       "      <th>title</th>\n",
       "      <th>text_unit_ids</th>\n",
       "      <th>relationship_ids</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>Community 2</td>\n",
       "      <td>[0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0...</td>\n",
       "      <td>[ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4</td>\n",
       "      <td>0</td>\n",
       "      <td>Community 4</td>\n",
       "      <td>[054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb...</td>\n",
       "      <td>[929f30875e1744b49e7b416eaf5a790c, 4920fda0318...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  id  level        title                                      text_unit_ids  \\\n",
       "0  2      0  Community 2  [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0...   \n",
       "1  4      0  Community 4  [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb...   \n",
       "\n",
       "                                    relationship_ids  \n",
       "0  [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b...  \n",
       "1  [929f30875e1744b49e7b416eaf5a790c, 4920fda0318...  "
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import communities\n",
    "\n",
    "statement = \"\"\"\n",
    "MERGE (c:__Community__ {community:value.id})\n",
    "SET c += value {.level, .title}\n",
    "/*\n",
    "UNWIND value.text_unit_ids as text_unit_id\n",
    "MATCH (t:__Chunk__ {id:text_unit_id})\n",
    "MERGE (c)-[:HAS_CHUNK]->(t)\n",
    "WITH distinct c, value\n",
    "*/\n",
    "WITH *\n",
    "UNWIND value.relationship_ids as rel_id\n",
    "MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)\n",
    "MERGE (start)-[:IN_COMMUNITY]->(c)\n",
    "MERGE (end)-[:IN_COMMUNITY]->(c)\n",
    "RETURn count(distinct c) as createdCommunities\n",
    "\"\"\"\n",
    "\n",
    "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_communities.parquet', \n",
    "                     columns=[\"id\",\"level\",\"title\",\"text_unit_ids\",\"relationship_ids\"])\n",
    "batched_import(statement, df)\n",
    "\n",
    "df.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'_contains_updates': True, 'properties_set': 329}\n",
      "47 rows in 0.022797107696533203 s.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>community</th>\n",
       "      <th>level</th>\n",
       "      <th>title</th>\n",
       "      <th>summary</th>\n",
       "      <th>findings</th>\n",
       "      <th>rank</th>\n",
       "      <th>rank_explanation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>e7822326-4da8-4954-afa9-be7f4f5791a5</td>\n",
       "      <td>42</td>\n",
       "      <td>2</td>\n",
       "      <td>Scrooge's Supernatural Encounters: Marley's Gh...</td>\n",
       "      <td>This report delves into the pivotal supernatur...</td>\n",
       "      <td>[{'explanation': 'Marley's Ghost plays a cruci...</td>\n",
       "      <td>8.0</td>\n",
       "      <td>The impact severity rating is high due to the ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8a5afac1-99ef-4f01-a1b1-f044ce392ff9</td>\n",
       "      <td>43</td>\n",
       "      <td>2</td>\n",
       "      <td>The Ghost's Influence on Scrooge's Transformation</td>\n",
       "      <td>This report delves into the pivotal role of 'T...</td>\n",
       "      <td>[{'explanation': 'The Ghost, identified at tim...</td>\n",
       "      <td>8.5</td>\n",
       "      <td>The impact severity rating is high due to the ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                     id community  level  \\\n",
       "0  e7822326-4da8-4954-afa9-be7f4f5791a5        42      2   \n",
       "1  8a5afac1-99ef-4f01-a1b1-f044ce392ff9        43      2   \n",
       "\n",
       "                                               title  \\\n",
       "0  Scrooge's Supernatural Encounters: Marley's Gh...   \n",
       "1  The Ghost's Influence on Scrooge's Transformation   \n",
       "\n",
       "                                             summary  \\\n",
       "0  This report delves into the pivotal supernatur...   \n",
       "1  This report delves into the pivotal role of 'T...   \n",
       "\n",
       "                                            findings  rank  \\\n",
       "0  [{'explanation': 'Marley's Ghost plays a cruci...   8.0   \n",
       "1  [{'explanation': 'The Ghost, identified at tim...   8.5   \n",
       "\n",
       "                                    rank_explanation  \n",
       "0  The impact severity rating is high due to the ...  \n",
       "1  The impact severity rating is high due to the ...  "
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# import communities\n",
    "\n",
    "statement = \"\"\"\n",
    "MERGE (c:__Community__ {community:value.community})\n",
    "// we can also extract findings as separate nodes\n",
    "WITH c, value, [f in value.findings | apoc.text.join([k in keys(f) | k+\": \"+f[k]],',\\n')] as findings\n",
    "SET c += value {.level, .title, .summary, findings, .rank, .rank_explanation, .id}\n",
    "RETURn count(distinct c) as createdCommunities\n",
    "\"\"\"\n",
    "\n",
    "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_community_reports.parquet',\n",
    "                     columns=[\"id\",\"community\",\"level\",\"title\",\"summary\", \"findings\",\"rank\",\"rank_explanation\"])\n",
    "\n",
    "batched_import(statement, df)\n",
    "df.head(2)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}