{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.1.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install --quiet pandas neo4j-rust-ext"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from neo4j import GraphDatabase\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"NEO4J_URI=\"bolt://localhost\"\n",
"NEO4J_USERNAME=\"neo4j\"\n",
"NEO4J_PASSWORD=\"password\"\n",
"NEO4J_DATABASE=\"graphrag\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def batched_import(statement, df, batch_size=1000):\n",
" total = len(df)\n",
" start_s = time.time()\n",
" for start in range(0,total, batch_size):\n",
" batch = df.iloc[start: min(start+batch_size,total)]\n",
" result = driver.execute_query(\"UNWIND $rows AS value \" + statement, \n",
" rows=batch.to_dict('records'),\n",
" database_=NEO4J_DATABASE)\n",
" print(result.summary.counters)\n",
" print(f'{total} rows in { time.time() - start_s} s.') \n",
" return total"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique\n",
"\n",
"create constraint document_id if not exists for (d:__Document__) require d.id is unique\n",
"\n",
"create constraint entity_id if not exists for (c:__Community__) require c.community is unique\n",
"\n",
"create constraint entity_id if not exists for (e:__Entity__) require e.id is unique\n",
"\n",
"create constraint entity_title if not exists for (e:__Entity__) require e.title is unique\n",
"\n",
"create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique\n"
]
}
],
"source": [
"# create constraints\n",
"\n",
"statements = \"\"\"\n",
"create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;\n",
"create constraint document_id if not exists for (d:__Document__) require d.id is unique;\n",
"create constraint entity_id if not exists for (c:__Community__) require c.community is unique;\n",
"create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;\n",
"create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;\n",
"create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;\n",
"\"\"\".split(\";\")\n",
"\n",
"for s in statements:\n",
" if len((s or \"\").strip()) > 0:\n",
" print(s)\n",
" driver.execute_query(query_=s,database_=NEO4J_DATABASE)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"GRAPHRAG_FOLDER=\"/Users/mh/d/llm/graphrag/ragtest/output/20240703-144633/artifacts\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" level | \n",
" clustered_graph | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" <graphml xmlns=\"http://graphml.graphdrawing.or... | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" <graphml xmlns=\"http://graphml.graphdrawing.or... | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" <graphml xmlns=\"http://graphml.graphdrawing.or... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" level clustered_graph\n",
"0 0 \n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" id | \n",
" title | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" c305886e4aa2f6efcf64b57762777055 | \n",
" book.txt | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" id title\n",
"0 c305886e4aa2f6efcf64b57762777055 book.txt"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import documents\n",
"statement = \"\"\"\n",
"MERGE (d:__Document__ {id:value.id})\n",
"SET d += value {.title}\n",
"// , text_unit_ids:value.text_unit_ids, raw_content:substring(value.raw_content,0,1000)};\n",
"\"\"\"\n",
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_documents.parquet', columns=[\"id\", \"title\"])\n",
"\n",
"batched_import(statement, df)\n",
"\n",
"df.head()\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_contains_updates': True, 'properties_set': 462}\n",
"231 rows in 0.02997303009033203 s.\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" chunk_id | \n",
" chunk | \n",
" n_tokens | \n",
" document_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 680dd6d2a970a49082fa4f34bf63a34e | \n",
" The Project Gutenberg eBook of A Christmas Ca... | \n",
" 300 | \n",
" [c305886e4aa2f6efcf64b57762777055] | \n",
"
\n",
" \n",
" | 1 | \n",
" 95f1f8f5bdbf0bee3a2c6f2f4a4907f6 | \n",
" THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL... | \n",
" 300 | \n",
" [c305886e4aa2f6efcf64b57762777055] | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" chunk_id \\\n",
"0 680dd6d2a970a49082fa4f34bf63a34e \n",
"1 95f1f8f5bdbf0bee3a2c6f2f4a4907f6 \n",
"\n",
" chunk n_tokens \\\n",
"0 The Project Gutenberg eBook of A Christmas Ca... 300 \n",
"1 THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL... 300 \n",
"\n",
" document_ids \n",
"0 [c305886e4aa2f6efcf64b57762777055] \n",
"1 [c305886e4aa2f6efcf64b57762777055] "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import text units\n",
"statement = \"\"\"\n",
"MERGE (c:__Chunk__ {id:value.chunk_id})\n",
"SET c += value {.chunk, .n_tokens}\n",
"WITH *\n",
"UNWIND value.document_ids as doc_id\n",
"MATCH (d:__Document__ {id:doc_id})\n",
"MERGE (d)<-[:PART_OF]-(c)\n",
"RETURN count(distinct c) as chunksCreated\n",
"\"\"\"\n",
"\n",
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_text_units.parquet', \n",
" columns=[\"chunk_id\",\"chunk\",\"n_tokens\",\"document_ids\"])\n",
"\n",
"batched_import(statement, df)\n",
"\n",
"df.head(2)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_contains_updates': True, 'properties_set': 4155}\n",
"831 rows in 0.13263273239135742 s.\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" level | \n",
" title | \n",
" type | \n",
" description | \n",
" source_id | \n",
" human_readable_id | \n",
" id | \n",
" top_level_node_id | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" \"PROJECT GUTENBERG\" | \n",
" \"ORGANIZATION\" | \n",
" Project Gutenberg is a pioneering organization... | \n",
" 01e84646075b255eab0a34d872336a89,10bab8e9773ee... | \n",
" 0 | \n",
" b45241d70f0e43fca764df95b2b81f77 | \n",
" b45241d70f0e43fca764df95b2b81f77 | \n",
"
\n",
" \n",
" | 1 | \n",
" 0 | \n",
" \"UNITED STATES\" | \n",
" \"GEO\" | \n",
" The United States is prominently recognized fo... | \n",
" 01e84646075b255eab0a34d872336a89,28f242c451594... | \n",
" 1 | \n",
" 4119fd06010c494caa07f439b333f4c5 | \n",
" 4119fd06010c494caa07f439b333f4c5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" level title type \\\n",
"0 0 \"PROJECT GUTENBERG\" \"ORGANIZATION\" \n",
"1 0 \"UNITED STATES\" \"GEO\" \n",
"\n",
" description \\\n",
"0 Project Gutenberg is a pioneering organization... \n",
"1 The United States is prominently recognized fo... \n",
"\n",
" source_id human_readable_id \\\n",
"0 01e84646075b255eab0a34d872336a89,10bab8e9773ee... 0 \n",
"1 01e84646075b255eab0a34d872336a89,28f242c451594... 1 \n",
"\n",
" id top_level_node_id \n",
"0 b45241d70f0e43fca764df95b2b81f77 b45241d70f0e43fca764df95b2b81f77 \n",
"1 4119fd06010c494caa07f439b333f4c5 4119fd06010c494caa07f439b333f4c5 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import nodes\n",
"\n",
"statement = \"\"\"\n",
"MERGE (n:__Entity__ {id:value.id})\n",
"SET n += value {.level, .top_level_node_id, .human_readable_id, .description, \n",
" title:replace(value.title,'\"','')}\n",
"WITH n, value\n",
"CALL apoc.create.addLabels(n, case when value.type is null then [] else [apoc.text.upperCamelCase(replace(value.type,'\"',''))] end) yield node\n",
"UNWIND split(value.source_id,\",\") as source_id\n",
"MATCH (c:__Chunk__ {id:source_id})\n",
"RETURN count(distinct n) as createdNodes\n",
"\"\"\"\n",
"\n",
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_nodes.parquet',\n",
" columns=[\"level\",\"title\",\"type\",\"description\",\"source_id\",\"human_readable_id\",\"id\",\"top_level_node_id\"])\n",
"\n",
"batched_import(statement, df)\n",
"df.head(2)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_contains_updates': True, 'properties_set': 2052}\n",
"342 rows in 0.013482093811035156 s.\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" source | \n",
" target | \n",
" id | \n",
" rank | \n",
" weight | \n",
" human_readable_id | \n",
" description | \n",
" text_unit_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" \"PROJECT GUTENBERG\" | \n",
" \"A CHRISTMAS CAROL\" | \n",
" b84d71ed9c3b45819eb3205fd28e13a0 | \n",
" 20 | \n",
" 1.0 | \n",
" 0 | \n",
" \"Project Gutenberg is responsible for releasin... | \n",
" [680dd6d2a970a49082fa4f34bf63a34e] | \n",
"
\n",
" \n",
" | 1 | \n",
" \"PROJECT GUTENBERG\" | \n",
" \"SUZANNE SHELL\" | \n",
" b0b464bc92a541e48547fe9738378dab | \n",
" 15 | \n",
" 1.0 | \n",
" 1 | \n",
" \"Suzanne Shell produced the eBook version of '... | \n",
" [680dd6d2a970a49082fa4f34bf63a34e] | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" source target id \\\n",
"0 \"PROJECT GUTENBERG\" \"A CHRISTMAS CAROL\" b84d71ed9c3b45819eb3205fd28e13a0 \n",
"1 \"PROJECT GUTENBERG\" \"SUZANNE SHELL\" b0b464bc92a541e48547fe9738378dab \n",
"\n",
" rank weight human_readable_id \\\n",
"0 20 1.0 0 \n",
"1 15 1.0 1 \n",
"\n",
" description \\\n",
"0 \"Project Gutenberg is responsible for releasin... \n",
"1 \"Suzanne Shell produced the eBook version of '... \n",
"\n",
" text_unit_ids \n",
"0 [680dd6d2a970a49082fa4f34bf63a34e] \n",
"1 [680dd6d2a970a49082fa4f34bf63a34e] "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import relationships\n",
"\n",
"statement = \"\"\"\n",
" MATCH (source:__Entity__ {title:replace(value.source,'\"','')})\n",
" MATCH (target:__Entity__ {title:replace(value.target,'\"','')})\n",
" // todo rel-type from source-target labels?\n",
" MERGE (source)-[rel:RELATED]->(target)\n",
" SET rel += value {.id, .rank, .weight, .human_readable_id, .description, text_unit_ids:value.text_unit_ids}\n",
" RETURN count(*) as createdRels\n",
"\"\"\"\n",
"\n",
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',\n",
" columns=[\"source\",\"target\",\"id\",\"rank\",\"weight\",\"human_readable_id\",\"description\",\"text_unit_ids\"])\n",
"\n",
"batched_import(statement, df)\n",
"\n",
"df.head(2)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_contains_updates': True, 'properties_set': 94}\n",
"47 rows in 0.021432161331176758 s.\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" level | \n",
" title | \n",
" text_unit_ids | \n",
" relationship_ids | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 2 | \n",
" 0 | \n",
" Community 2 | \n",
" [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0... | \n",
" [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b... | \n",
"
\n",
" \n",
" | 1 | \n",
" 4 | \n",
" 0 | \n",
" Community 4 | \n",
" [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb... | \n",
" [929f30875e1744b49e7b416eaf5a790c, 4920fda0318... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id level title text_unit_ids \\\n",
"0 2 0 Community 2 [0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0... \n",
"1 4 0 Community 4 [054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb... \n",
"\n",
" relationship_ids \n",
"0 [ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b... \n",
"1 [929f30875e1744b49e7b416eaf5a790c, 4920fda0318... "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import communities\n",
"\n",
"statement = \"\"\"\n",
"MERGE (c:__Community__ {community:value.id})\n",
"SET c += value {.level, .title}\n",
"/*\n",
"UNWIND value.text_unit_ids as text_unit_id\n",
"MATCH (t:__Chunk__ {id:text_unit_id})\n",
"MERGE (c)-[:HAS_CHUNK]->(t)\n",
"WITH distinct c, value\n",
"*/\n",
"WITH *\n",
"UNWIND value.relationship_ids as rel_id\n",
"MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)\n",
"MERGE (start)-[:IN_COMMUNITY]->(c)\n",
"MERGE (end)-[:IN_COMMUNITY]->(c)\n",
"RETURn count(distinct c) as createdCommunities\n",
"\"\"\"\n",
"\n",
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_communities.parquet', \n",
" columns=[\"id\",\"level\",\"title\",\"text_unit_ids\",\"relationship_ids\"])\n",
"batched_import(statement, df)\n",
"\n",
"df.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'_contains_updates': True, 'properties_set': 329}\n",
"47 rows in 0.022797107696533203 s.\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" community | \n",
" level | \n",
" title | \n",
" summary | \n",
" findings | \n",
" rank | \n",
" rank_explanation | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" e7822326-4da8-4954-afa9-be7f4f5791a5 | \n",
" 42 | \n",
" 2 | \n",
" Scrooge's Supernatural Encounters: Marley's Gh... | \n",
" This report delves into the pivotal supernatur... | \n",
" [{'explanation': 'Marley's Ghost plays a cruci... | \n",
" 8.0 | \n",
" The impact severity rating is high due to the ... | \n",
"
\n",
" \n",
" | 1 | \n",
" 8a5afac1-99ef-4f01-a1b1-f044ce392ff9 | \n",
" 43 | \n",
" 2 | \n",
" The Ghost's Influence on Scrooge's Transformation | \n",
" This report delves into the pivotal role of 'T... | \n",
" [{'explanation': 'The Ghost, identified at tim... | \n",
" 8.5 | \n",
" The impact severity rating is high due to the ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id community level \\\n",
"0 e7822326-4da8-4954-afa9-be7f4f5791a5 42 2 \n",
"1 8a5afac1-99ef-4f01-a1b1-f044ce392ff9 43 2 \n",
"\n",
" title \\\n",
"0 Scrooge's Supernatural Encounters: Marley's Gh... \n",
"1 The Ghost's Influence on Scrooge's Transformation \n",
"\n",
" summary \\\n",
"0 This report delves into the pivotal supernatur... \n",
"1 This report delves into the pivotal role of 'T... \n",
"\n",
" findings rank \\\n",
"0 [{'explanation': 'Marley's Ghost plays a cruci... 8.0 \n",
"1 [{'explanation': 'The Ghost, identified at tim... 8.5 \n",
"\n",
" rank_explanation \n",
"0 The impact severity rating is high due to the ... \n",
"1 The impact severity rating is high due to the ... "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# import communities\n",
"\n",
"statement = \"\"\"\n",
"MERGE (c:__Community__ {community:value.community})\n",
"// we can also extract findings as separate nodes\n",
"WITH c, value, [f in value.findings | apoc.text.join([k in keys(f) | k+\": \"+f[k]],',\\n')] as findings\n",
"SET c += value {.level, .title, .summary, findings, .rank, .rank_explanation, .id}\n",
"RETURn count(distinct c) as createdCommunities\n",
"\"\"\"\n",
"\n",
"df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_community_reports.parquet',\n",
" columns=[\"id\",\"community\",\"level\",\"title\",\"summary\", \"findings\",\"rank\",\"rank_explanation\"])\n",
"\n",
"batched_import(statement, df)\n",
"df.head(2)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}