|
|
@@ -0,0 +1,107 @@ |
|
|
{ |
|
|
"cells": [ |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
"source": [ |
|
|
"# Clone over HTTPS so that no SSH key is needed for this GitHub repository.\n",
"!git clone https://github.com/UKPLab/sentence-transformers.git\n",
"# Every `!` line runs in its own subshell, so a separate `!cd` would not persist into the\n",
"# next line; install directly from the cloned directory instead. `%pip` installs into the\n",
"# environment of the running kernel.\n",
"%pip install ./sentence-transformers"
|
|
] |
|
|
}, |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"execution_count": 1, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
"source": [ |
|
|
"# Import the distance submodule explicitly: a bare `import scipy` is not guaranteed to load\n",
"# `scipy.spatial.distance`, which this notebook later accesses as scipy.spatial.distance.cdist.\n",
"import scipy.spatial.distance\n",
"import numpy as np\n",
"from sentence_transformers import models, SentenceTransformer\n",
"\n",
"# Instantiate the sentence-embedding model identified by this name.\n",
"model = SentenceTransformer('distiluse-base-multilingual-cased')"
|
|
] |
|
|
}, |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"execution_count": 2, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
"source": [ |
|
|
"# Example corpus. A few entries were replaced by French, Italian or German translations;\n",
"# the English original is given in the comment directly above each translated sentence.\n",
"corpus = [\n",
"    # FR: 'A man is eating food.'\n",
"    'Un homme mange de la nourriture.',\n",
"    'A man is eating a piece of bread.',\n",
"    # DE: 'The girl is carrying a baby.'\n",
"    'Das Mädchen trägt ein Baby.',\n",
"    'A man is riding a horse.',\n",
"    'An elderly man is enjoying dinner.',\n",
"    # FR: 'Friends sharing wine at a restaurant.'\n",
"    'Amis partageant du vin dans un restaurant.',\n",
"    'A woman is playing violin.',\n",
"    'A child is learning to play a base guitar.',\n",
"    # IT: 'Two men pushed carts through the woods.'\n",
"    'Due uomini hanno spinto i carrelli attraverso i boschi.',\n",
"    'A man is riding a white horse on an enclosed ground.',\n",
"    # IT: 'A monkey is playing drums.'\n",
"    'Una scimmia suona la batteria.',\n",
"    'A cheetah is running behind its prey.',\n",
"]\n",
"\n",
"# Embed every corpus sentence once so the vectors can be reused for each query.\n",
"corpus_embeddings = model.encode(corpus)"
|
|
] |
|
|
}, |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"execution_count": 3, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
"source": [ |
|
|
"# Example queries whose most similar corpus sentences we want to find.\n",
"# Some were translated to Russian and German; the English original is in the comment above.\n",
"queries = [\n",
"    'A man is eating pasta.',\n",
"    # RU: 'Someone in a gorilla costume is playing a set of drums.'\n",
"    'Кто-то в костюме гориллы играет на барабане',\n",
"    # DE: 'A cheetah chases prey across a field.'\n",
"    'Ein Gepard jagt Beute über ein Feld.',\n",
"]\n",
"\n",
"query_embeddings = model.encode(queries)"
|
|
] |
|
|
}, |
|
|
{ |
|
|
"cell_type": "code", |
|
|
"execution_count": null, |
|
|
"metadata": {}, |
|
|
"outputs": [], |
|
|
"source": [ |
|
|
"# For each query, rank all corpus sentences by cosine similarity and print the best matches.\n",
"closest_n = 3\n",
"for query, query_embedding in zip(queries, query_embeddings):\n",
"    # cdist returns cosine *distances* (1 - cosine similarity); row 0 belongs to this query.\n",
"    distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, \"cosine\")[0]\n",
"\n",
"    # Pair each corpus index with its distance, smallest distance (most similar) first.\n",
"    results = sorted(enumerate(distances), key=lambda pair: pair[1])\n",
"\n",
"    print(\"\\n======================\\n\")\n",
"    print(\"Query:\", query)\n",
"    # Derive the headline from closest_n so it cannot drift from the slice below.\n",
"    print(\"\\nTop %d most similar sentences in corpus:\" % closest_n)\n",
"\n",
"    for idx, distance in results[:closest_n]:\n",
"        # Convert the distance back to a similarity score for display.\n",
"        print(corpus[idx].strip(), \"(Score: %.4f)\" % (1 - distance))"
|
|
] |
|
|
} |
|
|
], |
|
|
"metadata": { |
|
|
"kernelspec": { |
|
|
"display_name": "transformers", |
|
|
"language": "python", |
|
|
"name": "transformers" |
|
|
}, |
|
|
"language_info": { |
|
|
"codemirror_mode": { |
|
|
"name": "ipython", |
|
|
"version": 3 |
|
|
}, |
|
|
"file_extension": ".py", |
|
|
"mimetype": "text/x-python", |
|
|
"name": "python", |
|
|
"nbconvert_exporter": "python", |
|
|
"pygments_lexer": "ipython3", |
|
|
"version": "3.6.10" |
|
|
} |
|
|
}, |
|
|
"nbformat": 4, |
|
|
"nbformat_minor": 4 |
|
|
} |