Embedding questions with a sentence-transformer model
  
        
  
    
    
  
  
    
import torch
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    """
    Mean pooling to get sentence embeddings. See:
    https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
    """
    token_embeddings = model_output[0]  # First element: token-level embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over tokens
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # Avoid division by zero
    return sum_embeddings / sum_mask

# Fetch the model & tokenizer from the transformers library
model_name = 'sentence-transformers/stsb-roberta-large'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Questions to embed (example input; the original snippet assumed `sentences` was defined upstream)
sentences = ["How do I reset my password?", "What is the refund policy?"]

# Tokenize input
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

# Create word embeddings (no gradients needed for inference)
with torch.no_grad():
    model_output = model(**encoded_input)

# Pool to get sentence embeddings, i.e. one 1024-dimensional vector per sentence
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
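As a quick sanity check of the resulting embeddings, pairwise cosine similarity can be computed over the pooled vectors; semantically similar questions should score close to 1.0. This is a minimal sketch assuming scikit-learn is available (it is not used in the original snippet).

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the embedded questions;
# entry [i, j] compares sentences[i] and sentences[j].
similarity_matrix = cosine_similarity(sentence_embeddings)
print(similarity_matrix)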
  