Created
April 17, 2024 06:44
-
-
Save gustavz/53d6ee25d53e6cdd15070b617afb127e to your computer and use it in GitHub Desktop.
Revisions
-
Gustav von Zitzewitz created this gist
Apr 17, 2024. There are no files selected for viewing.
"""Minimal retrieval example: load a PDF, chunk it, embed the chunks, query them.

Pipeline: PDF loader -> RecursiveCharacterTextSplitter -> OpenAI embeddings ->
Chroma vector store -> similarity search. The best-matching chunk is printed.

NOTE(review): OpenAIEmbeddings presumably needs OpenAI credentials to be
configured in the environment (e.g. OPENAI_API_KEY) -- confirm before running.
"""

from langchain_chroma import Chroma
from langchain_community.document_loaders import PDFMinerLoader, PyMuPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

pdf_path = "https://www.barclaycard.co.uk/content/dam/barclaycard/documents/personal/existing-customers/terms-and-conditions-barclaycard-core-2019.pdf"

# Choose exactly one loader. The original script instantiated PDFMinerLoader and
# then immediately overwrote it with PyMuPDFLoader, so the first instance was
# dead work (and likely a second fetch of the remote PDF -- confirm against the
# loader implementation).
#   True  -> PyMuPDFLoader: loads each page as a separate document (the default,
#            and what the original script effectively used)
#   False -> PDFMinerLoader: loads all text into a single document
load_page_by_page = True
loader_cls = PyMuPDFLoader if load_page_by_page else PDFMinerLoader
loader = loader_cls(pdf_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    # Separators are tried in order: paragraph, line, word, then character.
    separators=["\n\n", "\n", " ", ""],
)
# Distinct names for chunks and search results; the original reused `docs`
# for both, which hid the indexed chunks once the query had run.
chunks = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
db = Chroma.from_documents(documents=chunks, embedding=embeddings)

query = "Why can't max do this by himself?"
results = db.similarity_search(query)

# Guard against an empty result list instead of failing with IndexError.
if results:
    print(results[0].page_content)
else:
    print("No matching chunks found for query:", query)