Last active: February 18, 2022 13:51
Revisions
MathiasGruber revised this gist on Apr 19, 2021 (2 changed files with 26 additions and 48 deletions). The revision replaces the original batched, token-level script (shown under the initial revision below) with a simplified single-batch version:
import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    """
    Mean pooling to get sentence embeddings. See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over the token axis
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Fetch the model & tokenizer from the transformers library
model_name = 'sentence-transformers/stsb-roberta-large'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize input (`sentences` is expected to be a list of input strings to embed)
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Create word embeddings
model_output = model(**encoded_input)

# Pool to get sentence embeddings; i.e. generate one 1024-dimensional vector for the entire sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
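As a sanity check on the pooling logic, here is a minimal self-contained sketch showing that masked (padding) positions are excluded from the mean. The toy tensors are illustrative assumptions, not part of the gist; the arithmetic mirrors the body of mean_pooling above.

import torch

# Toy example: 1 sentence, 4 token positions, embedding dim 3,
# where the last 2 positions are padding (attention mask = 0)
token_embeddings = torch.tensor([[[1.0, 2.0, 3.0],
                                  [3.0, 4.0, 5.0],
                                  [0.0, 0.0, 0.0],   # padding
                                  [0.0, 0.0, 0.0]]]) # padding
attention_mask = torch.tensor([[1, 1, 0, 0]])

mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
summed = torch.sum(token_embeddings * mask, 1)
counts = torch.clamp(mask.sum(1), min=1e-9)
print(summed / counts)  # tensor([[2., 3., 4.]]): the mean over the 2 real tokens only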
MathiasGruber created this gist on Apr 19, 2021. The initial file embeds a DataFrame of questions in batches of 8, storing both per-token embeddings and pooled sentence embeddings:
import torch
import numpy as np
from itertools import zip_longest
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

def grouper(iterable, n, fillvalue=None):
    # Assumed helper (not defined in the original gist): the itertools-recipes
    # `grouper`, which collects data into fixed-length chunks padded with fillvalue
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

def mean_pooling(model_output, attention_mask):
    """
    Mean pooling to get sentence embeddings. See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over the token axis
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

# Fetch the model & tokenizer from the transformers library
model_name = 'sentence-transformers/stsb-roberta-large'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

sentence_embeddings = []
word_embeddings = []

# Embed 8 sentences at a time; `df` is a pandas DataFrame with a `question` column
for sentences in tqdm(grouper(df.question.tolist(), 8, None)):

    # Ignore the None entries used by grouper to pad the last batch
    valid_sentences = [s for s in sentences if s]

    # Tokenize input
    encoded_input = tokenizer(valid_sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Create word embeddings
    model_output = model(**encoded_input)

    # For each sentence, store a list of token embeddings; i.e. a 1024-dimensional vector for each token
    for i, sentence in enumerate(valid_sentences):
        tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][i])
        embeddings = model_output[0][i]
        word_embeddings.append(
            [{"token": token, "embedding": embedding.detach().numpy()}
             for token, embedding in zip(tokens, embeddings)]
        )

    # Pool to get sentence embeddings; i.e. generate one 1024-dimensional vector for the entire sentence
    sentence_embeddings.append(
        mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
    )

# Concatenate all of the embeddings into one numpy array of shape (n_sentences, 1024)
sentence_embeddings = np.concatenate(sentence_embeddings)
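For context, a minimal sketch of one way the resulting (n_sentences, 1024) array could be used for semantic search over the questions. The random stand-in data and the cosine-similarity ranking are assumptions for illustration, not part of the original gist.

import numpy as np

# Stand-in for the pooled output above: (n_sentences, 1024), plus one query vector
rng = np.random.default_rng(0)
sentence_embeddings = rng.normal(size=(100, 1024))
query_embedding = rng.normal(size=(1024,))

# Cosine similarity of the query against every sentence embedding
norms = np.linalg.norm(sentence_embeddings, axis=1) * np.linalg.norm(query_embedding)
scores = sentence_embeddings @ query_embedding / norms

# Indices of the 5 most similar sentences, best first
top5 = np.argsort(-scores)[:5]
print(top5, scores[top5])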