import json
import uuid

from FlagEmbedding import FlagModel
from qdrant_client import QdrantClient
from qdrant_client.http.models import Batch, Distance, VectorParams

# Initialize the Qdrant client
client = QdrantClient(host="localhost", port=6333)

# Create the collection once; BAAI/bge-base-en-v1.5 produces 768-dimensional
# embeddings, so the vector size must match.
# client.create_collection(
#     collection_name="arxiv",
#     vectors_config=VectorParams(size=768, distance=Distance.COSINE),
# )

# Initialize the embedding model. The query instruction is only prepended to
# queries (via encode_queries), not to the passages embedded below.
model = FlagModel(
    'BAAI/bge-base-en-v1.5',
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True,
)


def generate_uuid_from_fields(json_obj):
    """Derive a deterministic point ID so re-running the script upserts
    (overwrites) existing points instead of duplicating them."""
    unique_string = f"{json_obj['id']}-{json_obj['title']}-{json_obj['authors']}"
    return uuid.uuid5(uuid.NAMESPACE_DNS, unique_string).hex


def embed_sentences(sentences):
    return model.encode(sentences)


def upload_to_qdrant(collection_name, ids, vectors, payloads):
    # Qdrant payload keys must be strings
    formatted_payloads = [{str(k): v for k, v in payload.items()} for payload in payloads]
    client.upsert(
        collection_name=collection_name,
        points=Batch(
            ids=ids,
            payloads=formatted_payloads,
            vectors=vectors,
        ),
    )


def process_file(file_path, start_line=0, batch_size=250):
    """Stream the JSONL snapshot, embedding and uploading `batch_size`
    records at a time. Pass `start_line` to resume an interrupted run."""
    line_counter = 0
    with open(file_path, 'r') as file:
        sentences = []
        metadata = []
        ids = []
        for line in file:
            line_counter += 1
            if line_counter < start_line:
                continue
            json_obj = json.loads(line)
            # Embed title, abstract, and authors together as one passage
            combined_text = f"{json_obj['title']} {json_obj['abstract']} {json_obj['authors']}"
            sentences.append(combined_text)
            metadata.append(dict(json_obj))
            ids.append(generate_uuid_from_fields(json_obj))
            if len(sentences) >= batch_size:
                embeddings = embed_sentences(sentences)
                vectors = [embedding.tolist() for embedding in embeddings]
                upload_to_qdrant("arxiv", ids, vectors, metadata)
                sentences = []
                metadata = []
                ids = []
        if sentences:
            # Process any remaining items in the last, partial batch
            embeddings = embed_sentences(sentences)
            vectors = [embedding.tolist() for embedding in embeddings]
            upload_to_qdrant("arxiv", ids, vectors, metadata)


# Dataset: https://www.kaggle.com/datasets/Cornell-University/arxiv/data
file_path = "../data/arxiv-metadata-oai-snapshot.json"
start_line = 0  # Adjust this to resume from a specific line
process_file(file_path, start_line=start_line)
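
# A minimal retrieval sketch (commented out, like the create_collection block
# above, so the ingestion run is unchanged): once the "arxiv" collection is
# populated, a similarity search against it might look like this. The query
# text and `limit` are illustrative assumptions; `encode_queries` prepends the
# query instruction configured above to each query before embedding.
#
# query_vector = model.encode_queries(["graph neural networks for chemistry"])[0]
# hits = client.search(
#     collection_name="arxiv",
#     query_vector=query_vector.tolist(),
#     limit=5,
# )
# for hit in hits:
#     print(f"{hit.score:.3f}  {hit.payload['title']}")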