deepakkashyap3013 · May 29, 2025 15:16
diff --git a/main.py b/main.py
 import os
 import csv
 import logging
 import uuid
 import json
 from typing import Dict, List, Tuple, Any
 from retry import retry
 from dotenv import load_dotenv
 from neo4j import GraphDatabase
 import pandas as pd
 from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain_core.documents import Document
 from langchain_core.prompts import PromptTemplate
 from pinecone import Pinecone, ServerlessSpec
 import openai
 from dataclasses import dataclass

 # Load environment variables
 load_dotenv(dotenv_path='.env')

 openai.api_key = os.getenv('OPENAI_API_KEY')

 # Set up logging
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
 )

 logger = logging.getLogger(__name__)

 NODES = [
    "Person", "Company", "Topic", "Founder"
 ]

 @dataclass
 class Entity:
    """Represents an extracted entity from transcript text"""
    name: str
    type: str  # PERSON, COMPANY, TOPIC, etc.
    context: str
    confidence: float = 0.8

 @dataclass
 class Relationship:
    """Represents a relationship between two entities"""
    source: str
    target: str
    relation_type: str
    context: str
    confidence: float = 0.8

 class ConfigManager:
    """Manages configuration and API keys"""
    
    def __init__(self):
        self.required_keys = [
            'OPENAI_API_KEY',
            'PINECONE_API_KEY',
            'PINECONE_HOST',
            'NEO4J_URI',
            'NEO4J_USERNAME',
            'NEO4J_PASSWORD'
        ]
        self.validate_environment()
    
    def validate_environment(self):
        """Ensure all required environment variables are set"""
        missing_keys = [key for key in self.required_keys if not os.getenv(key)]
        if missing_keys:
            raise ValueError(f"Missing required environment variables: {missing_keys}")
    
    @property
    def openai_api_key(self) -> str:
        return os.getenv('OPENAI_API_KEY')
    
    @property
    def pinecone_config(self) -> Dict[str, str]:
        return {
            'api_key': os.getenv('PINECONE_API_KEY'),
            'host': os.getenv('PINECONE_HOST')
        }
    
    @property
    def neo4j_config(self) -> Dict[str, str]:
        return {
            'uri': os.getenv('NEO4J_URI'),
            'user': os.getenv('NEO4J_USERNAME'),
            'password': os.getenv('NEO4J_PASSWORD'),
            'database': os.getenv('NEO4J_DATABASE')
        }


 @retry(tries=10, delay=10)
 def set_uniquness_constraints(tx, node):
    query = f"""CREATE CONSTRAINT IF NOT EXISTS FOR 
    (n:{node}) REQUIRE n.id IS UNIQUE;"""

    _ = tx.run(query)
    
 class EntityExtractor:
    """Extracts entities and relationships from transcript text using LLM"""
    
    def __init__(self, config: ConfigManager):
        self.llm = ChatOpenAI(
            temperature=0.1,
            openai_api_key=config.openai_api_key,
            model_name="gpt-4"
        )
        
        # Prompt template for entity extraction
        self.entity_prompt = PromptTemplate(
            input_variables=["text", "founder_name"],
            template="""
            You are an expert at extracting structured information from business meeting transcripts.
            
            Founder: {founder_name}
            
            From the following transcript text, extract:
            1. PEOPLE mentioned (names, roles, companies they work for)
            2. COMPANIES mentioned 
            3. TOPICS discussed (products, strategies, challenges, opportunities)
            4. RELATIONSHIPS between entities (who works with whom, what topics relate to what companies, etc.)
            
            Text: {text}
            
            Return your response as valid JSON with this structure:
            {{
                "entities": [
                    {{"name": "entity_name", "type": "PERSON|COMPANY|TOPIC", "context": "relevant_context", "confidence": 0.9}}
                ],
                "relationships": [
                    {{"source": "entity1", "target": "entity2", "relation_type": "WORKS_WITH|DISCUSSES|LEADS|etc", "context": "context", "confidence": 0.8}}
                ]
            }}
            
            Focus on business-relevant information. Be precise with entity names and relationship types.
            """
        )
    
    def extract_from_text(self, text: str, founder_name: str) -> Tuple[List[Entity], List[Relationship]]:
        """Extract entities and relationships from a text chunk"""
        try:
            # Get LLM response
            prompt = self.entity_prompt.format(text=text, founder_name=founder_name)
            response = self.llm.predict(prompt)
            
            # Parse JSON response
            data = json.loads(response)
            
            # Convert to our data structures
            entities = [Entity(**entity) for entity in data.get('entities', [])]
            relationships = [Relationship(**rel) for rel in data.get('relationships', [])]
            
            return entities, relationships
            
        except Exception as e:
            logger.error(f"Error extracting entities: {e}")
            return [], []
    
 class PineconeVectorStore:
    """Manages Pinecone vector storage operations"""
    
    def __init__(self, config: ConfigManager, index_name: str = "founder-transcripts"):
        # Initialize Pinecone
        self.pc = Pinecone(
            api_key=config.pinecone_config['api_key']
        )
        
        self.embeddings = OpenAIEmbeddings(
            openai_api_key=config.openai_api_key, 
            model="text-embedding-3-large", 
            dimensions=3072
        )
        
        if not self.pc.has_index(index_name):
            self.index = self.pc.create_index(
                name=index_name,
                dimension=3072,
                metric="cosine",
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )
        
        self.index = self.pc.Index(index_name)
        
    def add_documents(self, documents: List[Document], namespace: str = 'ns1', batch_size: int = 32):
        """Add documents to vector store with namespace"""
        
        doc_embeds = self.embed([doc.page_content for doc in documents])

        vectors = []
        for doc, embed in zip(documents, doc_embeds):
            metadata = doc.metadata.copy() if doc.metadata else {}
            metadata['text'] = doc.page_content
            
            vectors.append({
                "id": str(hash(doc.page_content)),
                "values": embed,
                "metadata": metadata
            })

        self.index.upsert(
            vectors=vectors,
            namespace=namespace,
            batch_size=batch_size
        )
        
    def similarity_search(self, query: str, namespace: str = 'ns1', k: int = 5):
        """Search for similar documents within a namespace"""
        query_embedding = self.embed([query])[0]

        return self.index.query(
            namespace=namespace,
            vector=query_embedding,
            top_k=k,
            include_values=False,
            include_metadata=True
        )
        
    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed texts using OpenAI's embedding model"""
        res = openai.embeddings.create(
            input=texts,
            model="text-embedding-3-large"
        )
        return [r.embedding for r in res.data]


 class TranscriptProcessor:
    """Processes transcript files and chunks them appropriately"""
    
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=[
                "\n\n",
                "\n",
                " ",
                ".",
                ",",
                "\u200b",  # Zero-width space
                "\uff0c",  # Fullwidth comma
                "\u3001",  # Ideographic comma
                "\uff0e",  # Fullwidth full stop
                "\u3002",  # Ideographic full stop
                "",
            ],
            chunk_size=1000,
            chunk_overlap=0,
            length_function=len,
        )
    
    def load_csv_transcript(self, file_path: str) -> List[Dict]:
        """Load transcript from CSV file"""
        transcripts = []
        
        with open(file_path, 'r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                transcripts.append(row)
        
        return transcripts
    
    def process_transcript_row(self, row: Dict) -> Document:
        """Convert a transcript row to a Document object"""
        # Combine speaker and text for full context
        content = f"Speaker: {row.get('speaker', 'Unknown')}\nText: {row.get('text', '')}"
        
        metadata = {
            'speaker': row.get('speaker', 'Unknown'),
            'timestamp': row.get('timestamp', ''),
            'meeting_id': row.get('meeting_id', str(uuid.uuid4())),
            'source': 'transcript'
        }
        
        return Document(page_content=content, metadata=metadata)
    
    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into smaller chunks for better processing"""
        return self.text_splitter.split_documents(documents)


 class Neo4jGraphStore:
    """Manages Neo4j graph database operations"""
    
    def __init__(self, config: ConfigManager):
        try:
            self.driver = GraphDatabase.driver(
                config.neo4j_config['uri'],
                auth=(config.neo4j_config['user'], config.neo4j_config['password']),
                database=config.neo4j_config['database']
            )
            
            # Test the connection
            self.driver.verify_connectivity()
            logger.info("Successfully connected to Neo4j database")
        except Exception as e:
            logger.error(f"Failed to connect to Neo4j: {str(e)}")
            raise ValueError("Could not connect to Neo4j database. Please check your connection URI and credentials.")
        self.setup_constraints()
        
    def setup_constraints(self):
        """Set up database constraints and indexes"""
        with self.driver.session() as session:
            for node in NODES:
                session.execute_write(set_uniquness_constraints, node)
                logger.info(f"Set uniqueness constraint for {node}")
                
    
    def add_founder(self, founder_name: str, namespace: str):
        """Add a founder node with namespace isolation"""
        with self.driver.session() as session:
            session.run(
                """
                MERGE (f:Founder {name: $founder_name, namespace: $namespace})
                SET f.created_at = datetime()
                """,
                founder_name=founder_name,
                namespace=namespace
            )
    
    def add_entities(self, entities: List[Entity], founder_name: str, namespace: str):
        """Add entities to the graph with founder association"""
        with self.driver.session() as session:
            for entity in entities:
                # Add the entity node
                session.run(
                    f"""
                    MERGE (e:{entity.type} {{name: $name, namespace: $namespace}})
                    SET e.context = $context, 
                        e.confidence = $confidence,
                        e.updated_at = datetime()
                    """,
                    name=entity.name,
                    namespace=namespace,
                    context=entity.context,
                    confidence=entity.confidence
                )
                
                # Link to founder
                session.run(
                    f"""
                    MATCH (f:Founder {{name: $founder_name, namespace: $namespace}})
                    MATCH (e:{entity.type} {{name: $entity_name, namespace: $namespace}})
                    MERGE (f)-[:MENTIONS]->(e)
                    """,
                    founder_name=founder_name,
                    entity_name=entity.name,
                    namespace=namespace
                )
    
    def add_relationships(self, relationships: List[Relationship], namespace: str):
        """Add relationships between entities"""
        with self.driver.session() as session:
            for rel in relationships:
                session.run(
                    """
                    MATCH (a {name: $source, namespace: $namespace})
                    MATCH (b {name: $target, namespace: $namespace})
                    CALL apoc.create.relationship(a, $relation_type, {
                        context: $context,
                        confidence: $confidence,
                        created_at: datetime()
                    }, b) YIELD rel
                    RETURN rel
                    """,
                    source=rel.source,
                    target=rel.target,
                    relation_type=rel.relation_type,
                    context=rel.context,
                    confidence=rel.confidence,
                    namespace=namespace
                )
    
    def query_entities_by_founder(self, founder_name: str, namespace: str, entity_type: str = None) -> List[Dict]:
        """Query entities associated with a specific founder"""
        with self.driver.session() as session:
            if entity_type:
                query = f"""
                MATCH (f:Founder {{name: $founder_name, namespace: $namespace}})-[:MENTIONS]->(e:{entity_type})
                RETURN e.name as name, e.context as context, e.confidence as confidence
                """
            else:
                query = """
                MATCH (f:Founder {name: $founder_name, namespace: $namespace})-[:MENTIONS]->(e)
                RETURN labels(e)[0] as type, e.name as name, e.context as context, e.confidence as confidence
                """
            
            result = session.run(query, founder_name=founder_name, namespace=namespace)
            return [record.data() for record in result]
    
    def query_relationships(self, entity_name: str, namespace: str) -> List[Dict]:
        """Query relationships for a specific entity"""
        with self.driver.session() as session:
            result = session.run(
                """
                MATCH (a {name: $entity_name, namespace: $namespace})-[r]->(b)
                RETURN a.name as source, type(r) as relation, b.name as target, r.context as context
                UNION
                MATCH (a)-[r]->(b {name: $entity_name, namespace: $namespace})
                RETURN a.name as source, type(r) as relation, b.name as target, r.context as context
                """,
                entity_name=entity_name,
                namespace=namespace
            )
            return [record.data() for record in result]
    
    def close(self):
        """Close database connection"""
        self.driver.close()
        
 class GraphRAGSystem:
    """Main system that orchestrates graph and vector operations"""
    
    def __init__(self, config: ConfigManager):
        self.config = config
        self.graph_store = Neo4jGraphStore(config)
        self.vector_store = PineconeVectorStore(config)
        self.transcript_processor = TranscriptProcessor()
        self.entity_extractor = EntityExtractor(config)
        
        self.qa_llm = ChatOpenAI(
            temperature=0.2,
            openai_api_key=config.openai_api_key,
            model_name="gpt-4"
        )

    def process_transcript_file(self, file_path: str, founder_name: str):
        """Process a complete transcript file for a founder"""
        logger.info(f"Processing transcript file for {founder_name}")
        
        # Create namespace for this founder
        namespace = founder_name.lower().replace(' ', '_')
        
        # Add founder to graph
        self.graph_store.add_founder(founder_name, namespace)
        
        # Load and process transcript
        transcript_rows = self.transcript_processor.load_csv_transcript(file_path)
        documents = [self.transcript_processor.process_transcript_row(row) for row in transcript_rows]
        
        # Chunk documents for better processing
        chunked_docs = self.transcript_processor.chunk_documents(documents)
        
        # Add to vector store with optimized batching
        # Process in smaller batches to avoid context length issues
        batch_size = 32  # Smaller batch size for upserts
        embedding_chunk_size = 100  # Smaller chunks for embeddings to avoid context length issues
        
        for i in range(0, len(chunked_docs), embedding_chunk_size):
            batch_docs = chunked_docs[i:i + embedding_chunk_size]
            logger.info(f"Processing batch {i//embedding_chunk_size + 1} of {len(chunked_docs)//embedding_chunk_size + 1}")
            self.vector_store.add_documents(
                batch_docs,
                namespace=namespace,
                batch_size=batch_size
            )
        
        # Extract entities and relationships from each chunk
        all_entities = []
        all_relationships = []
        
        for doc in chunked_docs:
            entities, relationships = self.entity_extractor.extract_from_text(
                doc.page_content, 
                founder_name
            )
            all_entities.extend(entities)
            all_relationships.extend(relationships)
        
        # Add to graph store
        if all_entities:
            self.graph_store.add_entities(all_entities, founder_name, namespace)
        
        if all_relationships:
            self.graph_store.add_relationships(all_relationships, namespace)
        
        logger.info(f"Processed {len(documents)} transcript entries, extracted {len(all_entities)} entities and {len(all_relationships)} relationships")

    def hybrid_search(self, query: str, founder_name: str, k: int = 5) -> Dict[str, Any]:
        """Perform hybrid search combining graph and vector results"""
        namespace = founder_name.lower().replace(' ', '_')
        
        # Vector search for semantic similarity
        vector_results = self.vector_store.similarity_search(query, namespace, k)
        
        # Graph search for entity-based queries
        # First, try to identify entities in the query
        query_entities, _ = self.entity_extractor.extract_from_text(query, founder_name)
        
        graph_results = []
        for entity in query_entities:
            # Get related entities and relationships
            related_entities = self.graph_store.query_entities_by_founder(founder_name, namespace)
            relationships = self.graph_store.query_relationships(entity.name, namespace)
            
            graph_results.extend(related_entities)
            graph_results.extend(relationships)
        
        return {
            'vector_results': vector_results,
            'graph_results': graph_results,
            'query_entities': query_entities
        }

    def answer_question(self, question: str, founder_name: str) -> str:
        """Generate answer using hybrid GraphRAG approach"""
        # Get hybrid search results
        search_results = self.hybrid_search(question, founder_name)
        
        # Prepare context from both sources
        context_parts = []
        
        # Add vector search context
        for doc in search_results['vector_results']:
            context_parts.append(f"Transcript: {doc.page_content}")
        
        # Add graph context
        for result in search_results['graph_results']:
            if 'name' in result and 'context' in result:
                context_parts.append(f"Entity: {result['name']} - {result['context']}")
            elif 'source' in result and 'relation' in result:
                context_parts.append(f"Relationship: {result['source']} {result['relation']} {result['target']}")
        
        # Combine context
        combined_context = "\n\n".join(context_parts[:10])  # Limit context size
        
        # Generate answer using LLM with enhanced context
        prompt = f"""
        Based on the following information about {founder_name}'s meetings and business activities, answer the question comprehensively.
        
        Context from transcripts and knowledge graph:
        {combined_context}
        
        Question: {question}
        
        Provide a detailed answer based on the available information. If the information is insufficient, say so clearly.
        """
        
        return self.qa_llm.predict(prompt)
        

 def main(): 
    config = ConfigManager()
    system = GraphRAGSystem(config)
    
    # Example: Process a transcript file
    founder_name = "Ronak"
    transcript_file = "adaptive-anand.csv"  # Your CSV file path
    
    try:
        # Process the transcript
        # system.process_transcript_file(transcript_file, founder_name)
        
        # Ask questions
        questions = [
            "What challenges did the founder discuss?",
            "Who are the key people mentioned in the meetings?",
            "What companies were discussed?",
            "What strategies were mentioned for growth?"
        ]
        
        for question in questions:
            print(f"\nQuestion: {question}")
            answer = system.answer_question(question, founder_name)
            print(f"Answer: {answer}")
            print("-" * 80)
    
    except Exception as e:
        logger.error(f"Error in main execution: {e}")
    
    finally:
        # Clean up connections
        system.graph_store.close()   
    

 if __name__ == "__main__":
    main()
	import os
	import csv
	import logging
	import uuid
	import json
	from typing import Dict, List, Tuple, Any
	from retry import retry
	from dotenv import load_dotenv
	from neo4j import GraphDatabase
	import pandas as pd
	from langchain_openai import ChatOpenAI, OpenAIEmbeddings
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from langchain_core.documents import Document
	from langchain_core.prompts import PromptTemplate
	from pinecone import Pinecone, ServerlessSpec
	import openai
	from dataclasses import dataclass

	# Load environment variables
	load_dotenv(dotenv_path='.env')

	openai.api_key = os.getenv('OPENAI_API_KEY')

	# Set up logging
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
	datefmt="%Y-%m-%d %H:%M:%S",
	)

	logger = logging.getLogger(__name__)

	NODES = [
	"Person", "Company", "Topic", "Founder"
	]

	@dataclass
	class Entity:
	"""Represents an extracted entity from transcript text"""
	name: str
	type: str # PERSON, COMPANY, TOPIC, etc.
	context: str
	confidence: float = 0.8

	@dataclass
	class Relationship:
	"""Represents a relationship between two entities"""
	source: str
	target: str
	relation_type: str
	context: str
	confidence: float = 0.8

	class ConfigManager:
	"""Manages configuration and API keys"""

	def __init__(self):
	self.required_keys = [
	'OPENAI_API_KEY',
	'PINECONE_API_KEY',
	'PINECONE_HOST',
	'NEO4J_URI',
	'NEO4J_USERNAME',
	'NEO4J_PASSWORD'
	]
	self.validate_environment()

	def validate_environment(self):
	"""Ensure all required environment variables are set"""
	missing_keys = [key for key in self.required_keys if not os.getenv(key)]
	if missing_keys:
	raise ValueError(f"Missing required environment variables: {missing_keys}")

	@property
	def openai_api_key(self) -> str:
	return os.getenv('OPENAI_API_KEY')

	@property
	def pinecone_config(self) -> Dict[str, str]:
	return {
	'api_key': os.getenv('PINECONE_API_KEY'),
	'host': os.getenv('PINECONE_HOST')
	}

	@property
	def neo4j_config(self) -> Dict[str, str]:
	return {
	'uri': os.getenv('NEO4J_URI'),
	'user': os.getenv('NEO4J_USERNAME'),
	'password': os.getenv('NEO4J_PASSWORD'),
	'database': os.getenv('NEO4J_DATABASE')
	}


	@retry(tries=10, delay=10)
	def set_uniquness_constraints(tx, node):
	query = f"""CREATE CONSTRAINT IF NOT EXISTS FOR
	(n:{node}) REQUIRE n.id IS UNIQUE;"""

	_ = tx.run(query)

	class EntityExtractor:
	"""Extracts entities and relationships from transcript text using LLM"""

	def __init__(self, config: ConfigManager):
	self.llm = ChatOpenAI(
	temperature=0.1,
	openai_api_key=config.openai_api_key,
	model_name="gpt-4"
	)

	# Prompt template for entity extraction
	self.entity_prompt = PromptTemplate(
	input_variables=["text", "founder_name"],
	template="""
	You are an expert at extracting structured information from business meeting transcripts.

	Founder: {founder_name}

	From the following transcript text, extract:
	1. PEOPLE mentioned (names, roles, companies they work for)
	2. COMPANIES mentioned
	3. TOPICS discussed (products, strategies, challenges, opportunities)
	4. RELATIONSHIPS between entities (who works with whom, what topics relate to what companies, etc.)

	Text: {text}

	Return your response as valid JSON with this structure:
	{{
	"entities": [
	{{"name": "entity_name", "type": "PERSON\|COMPANY\|TOPIC", "context": "relevant_context", "confidence": 0.9}}
	],
	"relationships": [
	{{"source": "entity1", "target": "entity2", "relation_type": "WORKS_WITH\|DISCUSSES\|LEADS\|etc", "context": "context", "confidence": 0.8}}
	]
	}}

	Focus on business-relevant information. Be precise with entity names and relationship types.
	"""
	)

	def extract_from_text(self, text: str, founder_name: str) -> Tuple[List[Entity], List[Relationship]]:
	"""Extract entities and relationships from a text chunk"""
	try:
	# Get LLM response
	prompt = self.entity_prompt.format(text=text, founder_name=founder_name)
	response = self.llm.predict(prompt)

	# Parse JSON response
	data = json.loads(response)

	# Convert to our data structures
	entities = [Entity(**entity) for entity in data.get('entities', [])]
	relationships = [Relationship(**rel) for rel in data.get('relationships', [])]

	return entities, relationships

	except Exception as e:
	logger.error(f"Error extracting entities: {e}")
	return [], []

	class PineconeVectorStore:
	"""Manages Pinecone vector storage operations"""

	def __init__(self, config: ConfigManager, index_name: str = "founder-transcripts"):
	# Initialize Pinecone
	self.pc = Pinecone(
	api_key=config.pinecone_config['api_key']
	)

	self.embeddings = OpenAIEmbeddings(
	openai_api_key=config.openai_api_key,
	model="text-embedding-3-large",
	dimensions=3072
	)

	if not self.pc.has_index(index_name):
	self.index = self.pc.create_index(
	name=index_name,
	dimension=3072,
	metric="cosine",
	spec=ServerlessSpec(
	cloud='aws',
	region='us-east-1'
	)
	)

	self.index = self.pc.Index(index_name)

	def add_documents(self, documents: List[Document], namespace: str = 'ns1', batch_size: int = 32):
	"""Add documents to vector store with namespace"""

	doc_embeds = self.embed([doc.page_content for doc in documents])

	vectors = []
	for doc, embed in zip(documents, doc_embeds):
	metadata = doc.metadata.copy() if doc.metadata else {}
	metadata['text'] = doc.page_content

	vectors.append({
	"id": str(hash(doc.page_content)),
	"values": embed,
	"metadata": metadata
	})

	self.index.upsert(
	vectors=vectors,
	namespace=namespace,
	batch_size=batch_size
	)

	def similarity_search(self, query: str, namespace: str = 'ns1', k: int = 5):
	"""Search for similar documents within a namespace"""
	query_embedding = self.embed([query])[0]

	return self.index.query(
	namespace=namespace,
	vector=query_embedding,
	top_k=k,
	include_values=False,
	include_metadata=True
	)

	def embed(self, texts: list[str]) -> list[list[float]]:
	"""Embed texts using OpenAI's embedding model"""
	res = openai.embeddings.create(
	input=texts,
	model="text-embedding-3-large"
	)
	return [r.embedding for r in res.data]


	class TranscriptProcessor:
	"""Processes transcript files and chunks them appropriately"""

	def __init__(self):
	self.text_splitter = RecursiveCharacterTextSplitter(
	separators=[
	"\n\n",
	"\n",
	" ",
	".",
	",",
	"\u200b", # Zero-width space
	"\uff0c", # Fullwidth comma
	"\u3001", # Ideographic comma
	"\uff0e", # Fullwidth full stop
	"\u3002", # Ideographic full stop
	"",
	],
	chunk_size=1000,
	chunk_overlap=0,
	length_function=len,
	)

	def load_csv_transcript(self, file_path: str) -> List[Dict]:
	"""Load transcript from CSV file"""
	transcripts = []

	with open(file_path, 'r', encoding='utf-8') as file:
	reader = csv.DictReader(file)
	for row in reader:
	transcripts.append(row)

	return transcripts

	def process_transcript_row(self, row: Dict) -> Document:
	"""Convert a transcript row to a Document object"""
	# Combine speaker and text for full context
	content = f"Speaker: {row.get('speaker', 'Unknown')}\nText: {row.get('text', '')}"

	metadata = {
	'speaker': row.get('speaker', 'Unknown'),
	'timestamp': row.get('timestamp', ''),
	'meeting_id': row.get('meeting_id', str(uuid.uuid4())),
	'source': 'transcript'
	}

	return Document(page_content=content, metadata=metadata)

	def chunk_documents(self, documents: List[Document]) -> List[Document]:
	"""Split documents into smaller chunks for better processing"""
	return self.text_splitter.split_documents(documents)


	class Neo4jGraphStore:
	"""Manages Neo4j graph database operations"""

	def __init__(self, config: ConfigManager):
	try:
	self.driver = GraphDatabase.driver(
	config.neo4j_config['uri'],
	auth=(config.neo4j_config['user'], config.neo4j_config['password']),
	database=config.neo4j_config['database']
	)

	# Test the connection
	self.driver.verify_connectivity()
	logger.info("Successfully connected to Neo4j database")
	except Exception as e:
	logger.error(f"Failed to connect to Neo4j: {str(e)}")
	raise ValueError("Could not connect to Neo4j database. Please check your connection URI and credentials.")
	self.setup_constraints()

	def setup_constraints(self):
	"""Set up database constraints and indexes"""
	with self.driver.session() as session:
	for node in NODES:
	session.execute_write(set_uniquness_constraints, node)
	logger.info(f"Set uniqueness constraint for {node}")


	def add_founder(self, founder_name: str, namespace: str):
	"""Add a founder node with namespace isolation"""
	with self.driver.session() as session:
	session.run(
	"""
	MERGE (f:Founder {name: $founder_name, namespace: $namespace})
	SET f.created_at = datetime()
	""",
	founder_name=founder_name,
	namespace=namespace
	)

	def add_entities(self, entities: List[Entity], founder_name: str, namespace: str):
	"""Add entities to the graph with founder association"""
	with self.driver.session() as session:
	for entity in entities:
	# Add the entity node
	session.run(
	f"""
	MERGE (e:{entity.type} {{name: $name, namespace: $namespace}})
	SET e.context = $context,
	e.confidence = $confidence,
	e.updated_at = datetime()
	""",
	name=entity.name,
	namespace=namespace,
	context=entity.context,
	confidence=entity.confidence
	)

	# Link to founder
	session.run(
	f"""
	MATCH (f:Founder {{name: $founder_name, namespace: $namespace}})
	MATCH (e:{entity.type} {{name: $entity_name, namespace: $namespace}})
	MERGE (f)-[:MENTIONS]->(e)
	""",
	founder_name=founder_name,
	entity_name=entity.name,
	namespace=namespace
	)

	def add_relationships(self, relationships: List[Relationship], namespace: str):
	"""Add relationships between entities"""
	with self.driver.session() as session:
	for rel in relationships:
	session.run(
	"""
	MATCH (a {name: $source, namespace: $namespace})
	MATCH (b {name: $target, namespace: $namespace})
	CALL apoc.create.relationship(a, $relation_type, {
	context: $context,
	confidence: $confidence,
	created_at: datetime()
	}, b) YIELD rel
	RETURN rel
	""",
	source=rel.source,
	target=rel.target,
	relation_type=rel.relation_type,
	context=rel.context,
	confidence=rel.confidence,
	namespace=namespace
	)

	def query_entities_by_founder(self, founder_name: str, namespace: str, entity_type: str = None) -> List[Dict]:
	"""Query entities associated with a specific founder"""
	with self.driver.session() as session:
	if entity_type:
	query = f"""
	MATCH (f:Founder {{name: $founder_name, namespace: $namespace}})-[:MENTIONS]->(e:{entity_type})
	RETURN e.name as name, e.context as context, e.confidence as confidence
	"""
	else:
	query = """
	MATCH (f:Founder {name: $founder_name, namespace: $namespace})-[:MENTIONS]->(e)
	RETURN labels(e)[0] as type, e.name as name, e.context as context, e.confidence as confidence
	"""

	result = session.run(query, founder_name=founder_name, namespace=namespace)
	return [record.data() for record in result]

	def query_relationships(self, entity_name: str, namespace: str) -> List[Dict]:
	"""Query relationships for a specific entity"""
	with self.driver.session() as session:
	result = session.run(
	"""
	MATCH (a {name: $entity_name, namespace: $namespace})-[r]->(b)
	RETURN a.name as source, type(r) as relation, b.name as target, r.context as context
	UNION
	MATCH (a)-[r]->(b {name: $entity_name, namespace: $namespace})
	RETURN a.name as source, type(r) as relation, b.name as target, r.context as context
	""",
	entity_name=entity_name,
	namespace=namespace
	)
	return [record.data() for record in result]

	def close(self):
	"""Close database connection"""
	self.driver.close()

	class GraphRAGSystem:
	"""Main system that orchestrates graph and vector operations"""

	def __init__(self, config: ConfigManager):
	self.config = config
	self.graph_store = Neo4jGraphStore(config)
	self.vector_store = PineconeVectorStore(config)
	self.transcript_processor = TranscriptProcessor()
	self.entity_extractor = EntityExtractor(config)

	self.qa_llm = ChatOpenAI(
	temperature=0.2,
	openai_api_key=config.openai_api_key,
	model_name="gpt-4"
	)

	def process_transcript_file(self, file_path: str, founder_name: str):
	"""Process a complete transcript file for a founder"""
	logger.info(f"Processing transcript file for {founder_name}")

	# Create namespace for this founder
	namespace = founder_name.lower().replace(' ', '_')

	# Add founder to graph
	self.graph_store.add_founder(founder_name, namespace)

	# Load and process transcript
	transcript_rows = self.transcript_processor.load_csv_transcript(file_path)
	documents = [self.transcript_processor.process_transcript_row(row) for row in transcript_rows]

	# Chunk documents for better processing
	chunked_docs = self.transcript_processor.chunk_documents(documents)

	# Add to vector store with optimized batching
	# Process in smaller batches to avoid context length issues
	batch_size = 32 # Smaller batch size for upserts
	embedding_chunk_size = 100 # Smaller chunks for embeddings to avoid context length issues

	for i in range(0, len(chunked_docs), embedding_chunk_size):
	batch_docs = chunked_docs[i:i + embedding_chunk_size]
	logger.info(f"Processing batch {i//embedding_chunk_size + 1} of {len(chunked_docs)//embedding_chunk_size + 1}")
	self.vector_store.add_documents(
	batch_docs,
	namespace=namespace,
	batch_size=batch_size
	)

	# Extract entities and relationships from each chunk
	all_entities = []
	all_relationships = []

	for doc in chunked_docs:
	entities, relationships = self.entity_extractor.extract_from_text(
	doc.page_content,
	founder_name
	)
	all_entities.extend(entities)
	all_relationships.extend(relationships)

	# Add to graph store
	if all_entities:
	self.graph_store.add_entities(all_entities, founder_name, namespace)

	if all_relationships:
	self.graph_store.add_relationships(all_relationships, namespace)

	logger.info(f"Processed {len(documents)} transcript entries, extracted {len(all_entities)} entities and {len(all_relationships)} relationships")

	def hybrid_search(self, query: str, founder_name: str, k: int = 5) -> Dict[str, Any]:
	"""Perform hybrid search combining graph and vector results"""
	namespace = founder_name.lower().replace(' ', '_')

	# Vector search for semantic similarity
	vector_results = self.vector_store.similarity_search(query, namespace, k)

	# Graph search for entity-based queries
	# First, try to identify entities in the query
	query_entities, _ = self.entity_extractor.extract_from_text(query, founder_name)

	graph_results = []
	for entity in query_entities:
	# Get related entities and relationships
	related_entities = self.graph_store.query_entities_by_founder(founder_name, namespace)
	relationships = self.graph_store.query_relationships(entity.name, namespace)

	graph_results.extend(related_entities)
	graph_results.extend(relationships)

	return {
	'vector_results': vector_results,
	'graph_results': graph_results,
	'query_entities': query_entities
	}

	def answer_question(self, question: str, founder_name: str) -> str:
	"""Generate answer using hybrid GraphRAG approach"""
	# Get hybrid search results
	search_results = self.hybrid_search(question, founder_name)

	# Prepare context from both sources
	context_parts = []

	# Add vector search context
	for doc in search_results['vector_results']:
	context_parts.append(f"Transcript: {doc.page_content}")

	# Add graph context
	for result in search_results['graph_results']:
	if 'name' in result and 'context' in result:
	context_parts.append(f"Entity: {result['name']} - {result['context']}")
	elif 'source' in result and 'relation' in result:
	context_parts.append(f"Relationship: {result['source']} {result['relation']} {result['target']}")

	# Combine context
	combined_context = "\n\n".join(context_parts[:10]) # Limit context size

	# Generate answer using LLM with enhanced context
	prompt = f"""
	Based on the following information about {founder_name}'s meetings and business activities, answer the question comprehensively.

	Context from transcripts and knowledge graph:
	{combined_context}

	Question: {question}

	Provide a detailed answer based on the available information. If the information is insufficient, say so clearly.
	"""

	return self.qa_llm.predict(prompt)


	def main():
	config = ConfigManager()
	system = GraphRAGSystem(config)

	# Example: Process a transcript file
	founder_name = "Ronak"
	transcript_file = "adaptive-anand.csv" # Your CSV file path

	try:
	# Process the transcript
	# system.process_transcript_file(transcript_file, founder_name)

	# Ask questions
	questions = [
	"What challenges did the founder discuss?",
	"Who are the key people mentioned in the meetings?",
	"What companies were discussed?",
	"What strategies were mentioned for growth?"
	]

	for question in questions:
	print(f"\nQuestion: {question}")
	answer = system.answer_question(question, founder_name)
	print(f"Answer: {answer}")
	print("-" * 80)

	except Exception as e:
	logger.error(f"Error in main execution: {e}")

	finally:
	# Clean up connections
	system.graph_store.close()


	if __name__ == "__main__":
	main()
No results found