import duckdb
from huggingface_hub import get_token
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to vectorize queries; instantiated once at
# import time so repeated searches reuse it.
model = SentenceTransformer("TaylorAI/bge-micro-v2")


def similarity_search(
    query: str,
    k: int = 5,
    dataset_name: str = "smol-blueprint/hf-blogs-text-embeddings",
    embedding_column: str = "embedding",
):
    """Return the ``k`` dataset rows whose embeddings are closest to ``query``.

    The query is embedded with the module-level ``model`` and compared, via
    DuckDB's ``array_cosine_distance``, against ``embedding_column`` in every
    Parquet file matched by ``hf://datasets/<dataset_name>/**/*.parquet``.

    Args:
        query: Text to search for.
        k: Maximum number of rows to return.
        dataset_name: Hugging Face Hub dataset repository id.
        embedding_column: Name of the column holding the stored embeddings.
            It is interpolated into the SQL text as an identifier (identifiers
            cannot be bound as parameters), so it must come from a trusted
            source.

    Returns:
        A DataFrame with all dataset columns plus a ``distance`` column,
        ordered by ascending distance.
    """
    query_vector = model.encode(query)
    embedding_dim = model.get_sentence_embedding_dimension()

    # A single quote in the dataset name would terminate the SQL string
    # literal holding the path; double it per SQL escaping rules.
    dataset_path = f"hf://datasets/{dataset_name}/**/*.parquet".replace(
        "'", "''"
    )

    # The query vector and the row limit are bound as prepared-statement
    # parameters instead of being pasted into the SQL text: this avoids
    # depending on Python's list repr being valid SQL and keeps caller-supplied
    # values out of the statement itself.
    sql = f"""
        SELECT *,
            array_cosine_distance(
                {embedding_column}::float[{embedding_dim}],
                ?::float[{embedding_dim}]
            ) as distance
        FROM '{dataset_path}'
        ORDER BY distance
        LIMIT ?
    """
    return duckdb.execute(sql, [query_vector.tolist(), k]).df()


similarity_search("How can I use the Hub for vector search?")