{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n", "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "%pip install --quiet pandas neo4j-rust-ext" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from neo4j import GraphDatabase\n", "import time" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "NEO4J_URI=\"bolt://localhost\"\n", "NEO4J_USERNAME=\"neo4j\"\n", "NEO4J_PASSWORD=\"password\"\n", "NEO4J_DATABASE=\"graphrag\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def batched_import(statement, df, batch_size=1000):\n", " total = len(df)\n", " start_s = time.time()\n", " for start in range(0,total, batch_size):\n", " batch = df.iloc[start: min(start+batch_size,total)]\n", " result = driver.execute_query(\"UNWIND $rows AS value \" + statement, \n", " rows=batch.to_dict('records'),\n", " database_=NEO4J_DATABASE)\n", " print(result.summary.counters)\n", " print(f'{total} rows in { time.time() - start_s} s.') \n", " return total" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique\n", "\n", "create constraint document_id if not exists for (d:__Document__) require d.id is unique\n", "\n", "create constraint entity_id if not exists for (c:__Community__) require c.community is unique\n", "\n", "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique\n", "\n", "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique\n", "\n", "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique\n" ] } ], "source": [ "# create constraints\n", "\n", "statements = \"\"\"\n", "create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;\n", "create constraint document_id if not exists for (d:__Document__) require d.id is unique;\n", "create constraint entity_id if not exists for (c:__Community__) require c.community is unique;\n", "create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;\n", "create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;\n", "create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;\n", "\"\"\".split(\";\")\n", "\n", "for s in statements:\n", " if len((s or \"\").strip()) > 0:\n", " print(s)\n", " driver.execute_query(query_=s,database_=NEO4J_DATABASE)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "GRAPHRAG_FOLDER=\"/Users/mh/d/llm/graphrag/ragtest/output/20240703-144633/artifacts\"" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
levelclustered_graph
00<graphml xmlns=\"http://graphml.graphdrawing.or...
11<graphml xmlns=\"http://graphml.graphdrawing.or...
22<graphml xmlns=\"http://graphml.graphdrawing.or...
\n", "
" ], "text/plain": [ " level clustered_graph\n", "0 0 \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitle
0c305886e4aa2f6efcf64b57762777055book.txt
\n", "" ], "text/plain": [ " id title\n", "0 c305886e4aa2f6efcf64b57762777055 book.txt" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import documents\n", "statement = \"\"\"\n", "MERGE (d:__Document__ {id:value.id})\n", "SET d += value {.title}\n", "// , text_unit_ids:value.text_unit_ids, raw_content:substring(value.raw_content,0,1000)};\n", "\"\"\"\n", "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_documents.parquet', columns=[\"id\", \"title\"])\n", "\n", "batched_import(statement, df)\n", "\n", "df.head()\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'_contains_updates': True, 'properties_set': 462}\n", "231 rows in 0.02997303009033203 s.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
chunk_idchunkn_tokensdocument_ids
0680dd6d2a970a49082fa4f34bf63a34eThe Project Gutenberg eBook of A Christmas Ca...300[c305886e4aa2f6efcf64b57762777055]
195f1f8f5bdbf0bee3a2c6f2f4a4907f6THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...300[c305886e4aa2f6efcf64b57762777055]
\n", "
" ], "text/plain": [ " chunk_id \\\n", "0 680dd6d2a970a49082fa4f34bf63a34e \n", "1 95f1f8f5bdbf0bee3a2c6f2f4a4907f6 \n", "\n", " chunk n_tokens \\\n", "0 The Project Gutenberg eBook of A Christmas Ca... 300 \n", "1 THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL... 300 \n", "\n", " document_ids \n", "0 [c305886e4aa2f6efcf64b57762777055] \n", "1 [c305886e4aa2f6efcf64b57762777055] " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import text units\n", "statement = \"\"\"\n", "MERGE (c:__Chunk__ {id:value.chunk_id})\n", "SET c += value {.chunk, .n_tokens}\n", "WITH *\n", "UNWIND value.document_ids as doc_id\n", "MATCH (d:__Document__ {id:doc_id})\n", "MERGE (d)<-[:PART_OF]-(c)\n", "RETURN count(distinct c) as chunksCreated\n", "\"\"\"\n", "\n", "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_text_units.parquet', \n", " columns=[\"chunk_id\",\"chunk\",\"n_tokens\",\"document_ids\"])\n", "\n", "batched_import(statement, df)\n", "\n", "df.head(2)\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'_contains_updates': True, 'properties_set': 4155}\n", "831 rows in 0.13263273239135742 s.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
leveltitletypedescriptionsource_idhuman_readable_ididtop_level_node_id
00\"PROJECT GUTENBERG\"\"ORGANIZATION\"Project Gutenberg is a pioneering organization...01e84646075b255eab0a34d872336a89,10bab8e9773ee...0b45241d70f0e43fca764df95b2b81f77b45241d70f0e43fca764df95b2b81f77
10\"UNITED STATES\"\"GEO\"The United States is prominently recognized fo...01e84646075b255eab0a34d872336a89,28f242c451594...14119fd06010c494caa07f439b333f4c54119fd06010c494caa07f439b333f4c5
\n", "
" ], "text/plain": [ " level title type \\\n", "0 0 \"PROJECT GUTENBERG\" \"ORGANIZATION\" \n", "1 0 \"UNITED STATES\" \"GEO\" \n", "\n", " description \\\n", "0 Project Gutenberg is a pioneering organization... \n", "1 The United States is prominently recognized fo... \n", "\n", " source_id human_readable_id \\\n", "0 01e84646075b255eab0a34d872336a89,10bab8e9773ee... 0 \n", "1 01e84646075b255eab0a34d872336a89,28f242c451594... 1 \n", "\n", " id top_level_node_id \n", "0 b45241d70f0e43fca764df95b2b81f77 b45241d70f0e43fca764df95b2b81f77 \n", "1 4119fd06010c494caa07f439b333f4c5 4119fd06010c494caa07f439b333f4c5 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import nodes\n", "\n", "statement = \"\"\"\n", "MERGE (n:__Entity__ {id:value.id})\n", "SET n += value {.level, .top_level_node_id, .human_readable_id, .description, \n", " title:replace(value.title,'\"','')}\n", "WITH n, value\n", "CALL apoc.create.addLabels(n, case when value.type is null then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n", "UNWIND split(value.source_id,\",\") as source_id\n", "MATCH (c:__Chunk__ {id:source_id})\n", "RETURN count(distinct n) as createdNodes\n", "\"\"\"\n", "\n", "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_nodes.parquet',\n", " columns=[\"level\",\"title\",\"type\",\"description\",\"source_id\",\"human_readable_id\",\"id\",\"top_level_node_id\"])\n", "\n", "batched_import(statement, df)\n", "df.head(2)\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'_contains_updates': True, 'properties_set': 2052}\n", "342 rows in 0.013482093811035156 s.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
sourcetargetidrankweighthuman_readable_iddescriptiontext_unit_ids
0\"PROJECT GUTENBERG\"\"A CHRISTMAS CAROL\"b84d71ed9c3b45819eb3205fd28e13a0201.00\"Project Gutenberg is responsible for releasin...[680dd6d2a970a49082fa4f34bf63a34e]
1\"PROJECT GUTENBERG\"\"SUZANNE SHELL\"b0b464bc92a541e48547fe9738378dab151.01\"Suzanne Shell produced the eBook version of '...[680dd6d2a970a49082fa4f34bf63a34e]
\n", "
" ], "text/plain": [ " source target id \\\n", "0 \"PROJECT GUTENBERG\" \"A CHRISTMAS CAROL\" b84d71ed9c3b45819eb3205fd28e13a0 \n", "1 \"PROJECT GUTENBERG\" \"SUZANNE SHELL\" b0b464bc92a541e48547fe9738378dab \n", "\n", " rank weight human_readable_id \\\n", "0 20 1.0 0 \n", "1 15 1.0 1 \n", "\n", " description \\\n", "0 \"Project Gutenberg is responsible for releasin... \n", "1 \"Suzanne Shell produced the eBook version of '... \n", "\n", " text_unit_ids \n", "0 [680dd6d2a970a49082fa4f34bf63a34e] \n", "1 [680dd6d2a970a49082fa4f34bf63a34e] " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import relationships\n", "\n", "statement = \"\"\"\n", " MATCH (source:__Entity__ {title:replace(value.source,'\"','')})\n", " MATCH (target:__Entity__ {title:replace(value.target,'\"','')})\n", " // todo rel-type from source-target labels?\n", " MERGE (source)-[rel:RELATED]->(target)\n", " SET rel += value {.id, .rank, .weight, .human_readable_id, .description, text_unit_ids:value.text_unit_ids}\n", " RETURN count(*) as createdRels\n", "\"\"\"\n", "\n", "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',\n", " columns=[\"source\",\"target\",\"id\",\"rank\",\"weight\",\"human_readable_id\",\"description\",\"text_unit_ids\"])\n", "\n", "batched_import(statement, df)\n", "\n", "df.head(2)\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'_contains_updates': True, 'properties_set': 94}\n", "47 rows in 0.021432161331176758 s.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idleveltitletext_unit_idsrelationship_ids
020Community 2[0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0...[ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b...
140Community 4[054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb...[929f30875e1744b49e7b416eaf5a790c, 4920fda0318...
\n", "
" ], "text/plain": [ " id level title text_unit_ids \\\n", "0 2 0 Community 2 [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0... \n", "1 4 0 Community 4 [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb... \n", "\n", " relationship_ids \n", "0 [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b... \n", "1 [929f30875e1744b49e7b416eaf5a790c, 4920fda0318... " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import communities\n", "\n", "statement = \"\"\"\n", "MERGE (c:__Community__ {community:value.id})\n", "SET c += value {.level, .title}\n", "/*\n", "UNWIND value.text_unit_ids as text_unit_id\n", "MATCH (t:__Chunk__ {id:text_unit_id})\n", "MERGE (c)-[:HAS_CHUNK]->(t)\n", "WITH distinct c, value\n", "*/\n", "WITH *\n", "UNWIND value.relationship_ids as rel_id\n", "MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)\n", "MERGE (start)-[:IN_COMMUNITY]->(c)\n", "MERGE (end)-[:IN_COMMUNITY]->(c)\n", "RETURn count(distinct c) as createdCommunities\n", "\"\"\"\n", "\n", "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_communities.parquet', \n", " columns=[\"id\",\"level\",\"title\",\"text_unit_ids\",\"relationship_ids\"])\n", "batched_import(statement, df)\n", "\n", "df.head(2)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'_contains_updates': True, 'properties_set': 329}\n", "47 rows in 0.022797107696533203 s.\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcommunityleveltitlesummaryfindingsrankrank_explanation
0e7822326-4da8-4954-afa9-be7f4f5791a5422Scrooge's Supernatural Encounters: Marley's Gh...This report delves into the pivotal supernatur...[{'explanation': 'Marley's Ghost plays a cruci...8.0The impact severity rating is high due to the ...
18a5afac1-99ef-4f01-a1b1-f044ce392ff9432The Ghost's Influence on Scrooge's TransformationThis report delves into the pivotal role of 'T...[{'explanation': 'The Ghost, identified at tim...8.5The impact severity rating is high due to the ...
\n", "
" ], "text/plain": [ " id community level \\\n", "0 e7822326-4da8-4954-afa9-be7f4f5791a5 42 2 \n", "1 8a5afac1-99ef-4f01-a1b1-f044ce392ff9 43 2 \n", "\n", " title \\\n", "0 Scrooge's Supernatural Encounters: Marley's Gh... \n", "1 The Ghost's Influence on Scrooge's Transformation \n", "\n", " summary \\\n", "0 This report delves into the pivotal supernatur... \n", "1 This report delves into the pivotal role of 'T... \n", "\n", " findings rank \\\n", "0 [{'explanation': 'Marley's Ghost plays a cruci... 8.0 \n", "1 [{'explanation': 'The Ghost, identified at tim... 8.5 \n", "\n", " rank_explanation \n", "0 The impact severity rating is high due to the ... \n", "1 The impact severity rating is high due to the ... " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# import communities\n", "\n", "statement = \"\"\"\n", "MERGE (c:__Community__ {community:value.community})\n", "// we can also extract findings as separate nodes\n", "WITH c, value, [f in value.findings | apoc.text.join([k in keys(f) | k+\": \"+f[k]],',\\n')] as findings\n", "SET c += value {.level, .title, .summary, findings, .rank, .rank_explanation, .id}\n", "RETURn count(distinct c) as createdCommunities\n", "\"\"\"\n", "\n", "df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_community_reports.parquet',\n", " columns=[\"id\",\"community\",\"level\",\"title\",\"summary\", \"findings\",\"rank\",\"rank_explanation\"])\n", "\n", "batched_import(statement, df)\n", "df.head(2)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.8" } }, "nbformat": 4, "nbformat_minor": 2 }