In [2]:
%pip install --quiet pandas neo4j-rust-ext


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from neo4j import GraphDatabase
import time

In [4]:
NEO4J_URI="bolt://localhost"
NEO4J_USERNAME="neo4j"
NEO4J_PASSWORD="password"
NEO4J_DATABASE="graphrag"

In [5]:
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))

In [6]:
def batched_import(statement, df, batch_size=1000):
    total = len(df)
    start_s = time.time()
    for start in range(0,total, batch_size):
        batch = df.iloc[start: min(start+batch_size,total)]
        result = driver.execute_query("UNWIND $rows AS value " + statement, 
                                      rows=batch.to_dict('records'),
                                      database_=NEO4J_DATABASE)
        print(result.summary.counters)
    print(f'{total} rows in { time.time() - start_s} s.')    
    return total

In [7]:
# create constraints

statements = """
create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique;
create constraint document_id if not exists for (d:__Document__) require d.id is unique;
create constraint entity_id if not exists for (c:__Community__) require c.community is unique;
create constraint entity_id if not exists for (e:__Entity__) require e.id is unique;
create constraint entity_title if not exists for (e:__Entity__) require e.title is unique;
create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique;
""".split(";")

for s in statements:
    if len((s or "").strip()) > 0:
        print(s)
        driver.execute_query(query_=s,database_=NEO4J_DATABASE)


create constraint chunk_id if not exists for (c:__Chunk__) require c.id is unique

create constraint document_id if not exists for (d:__Document__) require d.id is unique

create constraint entity_id if not exists for (c:__Community__) require c.community is unique

create constraint entity_id if not exists for (e:__Entity__) require e.id is unique

create constraint entity_title if not exists for (e:__Entity__) require e.title is unique

create constraint related_id if not exists for ()-[rel:RELATED]->() require rel.id is unique


In [8]:
GRAPHRAG_FOLDER="/Users/mh/d/llm/graphrag/ragtest/output/20240703-144633/artifacts"

In [9]:
df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_entity_graph.parquet')
df.head()



Unnamed: 0,level,clustered_graph
0,0,"<graphml xmlns=""http://graphml.graphdrawing.or..."
1,1,"<graphml xmlns=""http://graphml.graphdrawing.or..."
2,2,"<graphml xmlns=""http://graphml.graphdrawing.or..."


In [10]:
# import documents
statement = """
MERGE (d:__Document__ {id:value.id})
SET d += value {.title}
// , text_unit_ids:value.text_unit_ids, raw_content:substring(value.raw_content,0,1000)};
"""
df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_documents.parquet', columns=["id", "title"])

batched_import(statement, df)

df.head()


{'_contains_updates': True, 'properties_set': 1}
1 rows in 0.0053479671478271484 s.


Unnamed: 0,id,title
0,c305886e4aa2f6efcf64b57762777055,book.txt


In [11]:
# import text units
statement = """
MERGE (c:__Chunk__ {id:value.chunk_id})
SET c += value {.chunk, .n_tokens}
WITH *
UNWIND value.document_ids as doc_id
MATCH (d:__Document__ {id:doc_id})
MERGE (d)<-[:PART_OF]-(c)
RETURN count(distinct c) as chunksCreated
"""

df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_base_text_units.parquet', 
                     columns=["chunk_id","chunk","n_tokens","document_ids"])

batched_import(statement, df)

df.head(2)


{'_contains_updates': True, 'properties_set': 462}
231 rows in 0.02997303009033203 s.


Unnamed: 0,chunk_id,chunk,n_tokens,document_ids
0,680dd6d2a970a49082fa4f34bf63a34e,﻿The Project Gutenberg eBook of A Christmas Ca...,300,[c305886e4aa2f6efcf64b57762777055]
1,95f1f8f5bdbf0bee3a2c6f2f4a4907f6,THE PROJECT GUTENBERG EBOOK A CHRISTMAS CAROL...,300,[c305886e4aa2f6efcf64b57762777055]


In [12]:
# import nodes

statement = """
MERGE (n:__Entity__ {id:value.id})
SET n += value {.level, .top_level_node_id, .human_readable_id, .description, 
    title:replace(value.title,'"','')}
WITH n, value
CALL apoc.create.addLabels(n, case when value.type is null then [] else [apoc.text.upperCamelCase(replace(value.type,'"',''))] end) yield node
UNWIND split(value.source_id,",") as source_id
MATCH (c:__Chunk__ {id:source_id})
RETURN count(distinct n) as createdNodes
"""

df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_nodes.parquet',
                     columns=["level","title","type","description","source_id","human_readable_id","id","top_level_node_id"])

batched_import(statement, df)
df.head(2)


{'_contains_updates': True, 'properties_set': 4155}
831 rows in 0.13263273239135742 s.


Unnamed: 0,level,title,type,description,source_id,human_readable_id,id,top_level_node_id
0,0,"""PROJECT GUTENBERG""","""ORGANIZATION""",Project Gutenberg is a pioneering organization...,"01e84646075b255eab0a34d872336a89,10bab8e9773ee...",0,b45241d70f0e43fca764df95b2b81f77,b45241d70f0e43fca764df95b2b81f77
1,0,"""UNITED STATES""","""GEO""",The United States is prominently recognized fo...,"01e84646075b255eab0a34d872336a89,28f242c451594...",1,4119fd06010c494caa07f439b333f4c5,4119fd06010c494caa07f439b333f4c5


In [13]:
# import relationships

statement = """
    MATCH (source:__Entity__ {title:replace(value.source,'"','')})
    MATCH (target:__Entity__ {title:replace(value.target,'"','')})
    // todo rel-type from source-target labels?
    MERGE (source)-[rel:RELATED]->(target)
    SET rel += value {.id, .rank, .weight, .human_readable_id, .description, text_unit_ids:value.text_unit_ids}
    RETURN count(*) as createdRels
"""

df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_relationships.parquet',
                     columns=["source","target","id","rank","weight","human_readable_id","description","text_unit_ids"])

batched_import(statement, df)

df.head(2)


{'_contains_updates': True, 'properties_set': 2052}
342 rows in 0.013482093811035156 s.


Unnamed: 0,source,target,id,rank,weight,human_readable_id,description,text_unit_ids
0,"""PROJECT GUTENBERG""","""A CHRISTMAS CAROL""",b84d71ed9c3b45819eb3205fd28e13a0,20,1.0,0,"""Project Gutenberg is responsible for releasin...",[680dd6d2a970a49082fa4f34bf63a34e]
1,"""PROJECT GUTENBERG""","""SUZANNE SHELL""",b0b464bc92a541e48547fe9738378dab,15,1.0,1,"""Suzanne Shell produced the eBook version of '...",[680dd6d2a970a49082fa4f34bf63a34e]


In [14]:
# import communities

statement = """
MERGE (c:__Community__ {community:value.id})
SET c += value {.level, .title}
/*
UNWIND value.text_unit_ids as text_unit_id
MATCH (t:__Chunk__ {id:text_unit_id})
MERGE (c)-[:HAS_CHUNK]->(t)
WITH distinct c, value
*/
WITH *
UNWIND value.relationship_ids as rel_id
MATCH (start:__Entity__)-[:RELATED {id:rel_id}]->(end:__Entity__)
MERGE (start)-[:IN_COMMUNITY]->(c)
MERGE (end)-[:IN_COMMUNITY]->(c)
RETURn count(distinct c) as createdCommunities
"""

df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_communities.parquet', 
                     columns=["id","level","title","text_unit_ids","relationship_ids"])
batched_import(statement, df)

df.head(2)

{'_contains_updates': True, 'properties_set': 94}
47 rows in 0.021432161331176758 s.


Unnamed: 0,id,level,title,text_unit_ids,relationship_ids
0,2,0,Community 2,"[0546d296a4d3bb0486bd0c94c01dc9be,0d6bc6e701a0...","[ba481175ee1d4329bf07757a30abd3a1, 8d8da35190b..."
1,4,0,Community 4,"[054bdcba0a3690b43609d9226a47f84d,3a450ed2b7fb...","[929f30875e1744b49e7b416eaf5a790c, 4920fda0318..."


In [15]:
# import communities

statement = """
MERGE (c:__Community__ {community:value.community})
// we can also extract findings as separate nodes
WITH c, value, [f in value.findings | apoc.text.join([k in keys(f) | k+": "+f[k]],',\n')] as findings
SET c += value {.level, .title, .summary, findings, .rank, .rank_explanation, .id}
RETURn count(distinct c) as createdCommunities
"""

df = pd.read_parquet(f'{GRAPHRAG_FOLDER}/create_final_community_reports.parquet',
                     columns=["id","community","level","title","summary", "findings","rank","rank_explanation"])

batched_import(statement, df)
df.head(2)


{'_contains_updates': True, 'properties_set': 329}
47 rows in 0.022797107696533203 s.


Unnamed: 0,id,community,level,title,summary,findings,rank,rank_explanation
0,e7822326-4da8-4954-afa9-be7f4f5791a5,42,2,Scrooge's Supernatural Encounters: Marley's Gh...,This report delves into the pivotal supernatur...,[{'explanation': 'Marley's Ghost plays a cruci...,8.0,The impact severity rating is high due to the ...
1,8a5afac1-99ef-4f01-a1b1-f044ce392ff9,43,2,The Ghost's Influence on Scrooge's Transformation,This report delves into the pivotal role of 'T...,"[{'explanation': 'The Ghost, identified at tim...",8.5,The impact severity rating is high due to the ...
