Skip to content

Instantly share code, notes, and snippets.

@peterw
Created April 17, 2023 16:30
Show Gist options
  • Save peterw/9fc317e7d88d4d7cc313deda5381294a to your computer and use it in GitHub Desktop.
Save peterw/9fc317e7d88d4d7cc313deda5381294a to your computer and use it in GitHub Desktop.

Revisions

  1. peterw created this gist Apr 17, 2023.
    26 changes: 26 additions & 0 deletions embed.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,26 @@
    import openai
    import streamlit as st
    from streamlit_chat import message
    from dotenv import load_dotenv
    import os
    from langchain.embeddings.openai import OpenAIEmbeddings
    from langchain.vectorstores import Chroma
    import openai
    from langchain.document_loaders import UnstructuredMarkdownLoader
    from langchain.chains.question_answering import load_qa_chain
    from langchain.chat_models import ChatOpenAI
    from langchain.document_loaders import UnstructuredPDFLoader

    # Load settings from a local .env file (if present) into the process
    # environment before anything below reads OPENAI_API_KEY.
    load_dotenv()

    # The key must come from the environment only. A literal key used as a
    # fallback default ends up published with the source and has to be treated
    # as leaked, so fail loudly instead of silently using a baked-in secret.
    api_key = os.environ.get('OPENAI_API_KEY')
    if not api_key:
        raise RuntimeError(
            'OPENAI_API_KEY is not set; export it or add it to a .env file'
        )
    openai.api_key = api_key

    # Directory handed to Chroma as its on-disk persistence location.
    persist_directory = 'ai_paper1'
    embeddings = OpenAIEmbeddings()

    # Only embed when no persisted directory exists yet, so reruns do not
    # re-embed the PDF.
    if not os.path.exists(persist_directory):
        print('embedding the document now')
        loader = UnstructuredPDFLoader('ai_paper.pdf', mode="elements")
        pages = loader.load_and_split()

        vectordb = Chroma.from_documents(
            documents=pages,
            embedding=embeddings,
            persist_directory=persist_directory,
        )
        vectordb.persist()