@MathiasGruber
Last active February 18, 2022 13:51

Revisions

  1. MathiasGruber revised this gist Apr 19, 2021. 2 changed files with 26 additions and 48 deletions.
    48 changes: 0 additions & 48 deletions gistfile1.txt
    @@ -1,48 +0,0 @@
    import numpy as np
    import torch
    from tqdm import tqdm
    from transformers import AutoTokenizer, AutoModel

    def mean_pooling(model_output, attention_mask):
        """
        Mean pooling to get sentence embeddings. See:
        https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over the token dimension
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # Avoid division by zero
        return sum_embeddings / sum_mask

    # Fetch the model & tokenizer from the transformers library
    model_name = 'sentence-transformers/stsb-roberta-large'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    sentence_embeddings = []
    word_embeddings = []

    # Embed 8 sentences at a time. Assumes df is a pandas DataFrame with a "question"
    # column and grouper is a batching helper (see the sketch after this listing).
    for sentences in tqdm(grouper(df.question.tolist(), 8, None)):

        # Ignore the None padding added by grouper
        valid_sentences = [s for s in sentences if s]

        # Tokenize input
        encoded_input = tokenizer(valid_sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

        # Create word embeddings
        model_output = model(**encoded_input)

        # For each sentence, store a list of token embeddings; i.e. a 1024-dimensional vector for each token
        for i, sentence in enumerate(valid_sentences):
            tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][i])
            embeddings = model_output[0][i]
            word_embeddings.append(
                [{"token": token, "embedding": embedding.detach().numpy()} for token, embedding in zip(tokens, embeddings)]
            )

        # Pool to get sentence embeddings; i.e. generate one 1024-dimensional vector for the entire sentence
        sentence_embeddings.append(
            mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
        )

    # Concatenate all of the embeddings into one numpy array of shape (n_sentences, 1024)
    sentence_embeddings = np.concatenate(sentence_embeddings)
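
    The loop above relies on a grouper helper (and a pandas DataFrame df) that are not defined in the gist. A minimal sketch, assuming grouper follows the standard itertools recipe for batching an iterable into fixed-size, None-padded chunks:

    from itertools import zip_longest

    def grouper(iterable, n, fillvalue=None):
        """Collect data into fixed-length chunks, e.g. grouper('ABCDEFG', 3, 'x') -> ABC DEF Gxx."""
        args = [iter(iterable)] * n
        return zip_longest(*args, fillvalue=fillvalue)
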
    26 changes: 26 additions & 0 deletions sts_sentence_embedding.py
    @@ -0,0 +1,26 @@
    import torch
    from transformers import AutoTokenizer, AutoModel

    def mean_pooling(model_output, attention_mask):
        """
        Mean pooling to get sentence embeddings. See:
        https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over the token dimension
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # Avoid division by zero
        return sum_embeddings / sum_mask

    # Fetch the model & tokenizer from the transformers library
    model_name = 'sentence-transformers/stsb-roberta-large'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Sentences to embed (illustrative placeholder; supply your own list of strings)
    sentences = ["How do I reset my password?", "Where can I find the documentation?"]

    # Tokenize input
    encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

    # Create word embeddings
    model_output = model(**encoded_input)

    # Pool to get sentence embeddings; i.e. generate one 1024-dimensional vector for the entire sentence
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
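
    As a usage sketch (not part of the original gist), the pooled embeddings can be compared with cosine similarity, assuming scikit-learn is installed:

    from sklearn.metrics.pairwise import cosine_similarity

    # Pairwise similarity matrix of shape (n_sentences, n_sentences)
    similarity = cosine_similarity(sentence_embeddings)
    print(similarity[0, 1])  # similarity between the first and second sentence
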
  2. MathiasGruber created this gist Apr 19, 2021.
    48 changes: 48 additions & 0 deletions gistfile1.txt
    @@ -0,0 +1,48 @@
    import numpy as np
    import torch
    from tqdm import tqdm
    from transformers import AutoTokenizer, AutoModel

    def mean_pooling(model_output, attention_mask):
        """
        Mean pooling to get sentence embeddings. See:
        https://huggingface.co/sentence-transformers/paraphrase-distilroberta-base-v1
        """
        token_embeddings = model_output[0]
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)  # Sum over the token dimension
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # Avoid division by zero
        return sum_embeddings / sum_mask

    # Fetch the model & tokenizer from the transformers library
    model_name = 'sentence-transformers/stsb-roberta-large'
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    sentence_embeddings = []
    word_embeddings = []

    # Embed 8 sentences at a time. Assumes df is a pandas DataFrame with a "question"
    # column and grouper is the batching helper sketched under the revision above.
    for sentences in tqdm(grouper(df.question.tolist(), 8, None)):

        # Ignore the None padding added by grouper
        valid_sentences = [s for s in sentences if s]

        # Tokenize input
        encoded_input = tokenizer(valid_sentences, padding=True, truncation=True, max_length=512, return_tensors="pt")

        # Create word embeddings
        model_output = model(**encoded_input)

        # For each sentence, store a list of token embeddings; i.e. a 1024-dimensional vector for each token
        for i, sentence in enumerate(valid_sentences):
            tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][i])
            embeddings = model_output[0][i]
            word_embeddings.append(
                [{"token": token, "embedding": embedding.detach().numpy()} for token, embedding in zip(tokens, embeddings)]
            )

        # Pool to get sentence embeddings; i.e. generate one 1024-dimensional vector for the entire sentence
        sentence_embeddings.append(
            mean_pooling(model_output, encoded_input['attention_mask']).detach().numpy()
        )

    # Concatenate all of the embeddings into one numpy array of shape (n_sentences, 1024)
    sentence_embeddings = np.concatenate(sentence_embeddings)
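
    A small usage sketch (not in the original gist): each entry of word_embeddings is a list of token/embedding pairs for one sentence, which can be inspected like this:

    # Print the first few tokens of the first sentence together with their vector shapes
    for entry in word_embeddings[0][:5]:
        print(entry["token"], entry["embedding"].shape)  # e.g. '<s>' (1024,)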