Last active: September 6, 2018 21:27
Revisions
tokestermw revised this gist
Sep 6, 2018. 1 changed file with 4 additions and 0 deletions.
```
@@ -26,6 +26,10 @@
    grp.create_dataset('W', W.shape, dtype='float32', data=W)
    grp.create_dataset('b', b.shape, dtype='float32', data=b)

To test

pytest allennlp/tests/modules/elmo_test.py
"""

from allennlp.modules.elmo import _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo, batch_to_ids
```
tokestermw revised this gist
Sep 6, 2018. 1 changed file with 58 additions and 0 deletions.
```
@@ -1,3 +1,61 @@
"""
To use it inside ELMo script

To get the embeddings:

allennlp elmo sample_sents.txt out1.hdf5 --top
python -c "import h5py; f = h5py.File('out1.hdf5'); print(f['0'][:], f['0'].shape)"

To get probabilities:

allennlp elmo sample_sents.txt out2.hdf5 --top \
    --softmax-weight-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_softmax_weights.hdf5 \
    --softmax-vocab-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt
python -c "import h5py; f = h5py.File('out2.hdf5'); print(f['0'][:], f['0'].shape)"

Save new example softmax file

import h5py
import numpy as np

e = h5py.File('./allennlp/tests/fixtures/elmo/elmo_token_embeddings.hdf5')
W = e['embedding'][:]
b = np.zeros_like(W[:, 0])

with h5py.File('elmo_softmax_weights.hdf5', 'w') as f:
    grp = f.create_group('softmax')
    grp.create_dataset('W', W.shape, dtype='float32', data=W)
    grp.create_dataset('b', b.shape, dtype='float32', data=b)
"""

from allennlp.modules.elmo import _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo, batch_to_ids
from allennlp.commands.elmo import DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE, DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE


def _tokenize(text):
    return text.split()


if __name__ == '__main__':
    # elmo_char_encoder - _ElmoCharacterEncoder

    elmo_bilm = _ElmoBiLm(DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE)
    elmo_softmax = _ElmoSoftmax(DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE)

    sentences = [
        'How are you ?',
        'how are you ?',
        'How are you .',
        'You are how ?',
    ]
    sentences = [_tokenize(i) for i in sentences]

    char_ids, word_ids = batch_to_ids(sentences, elmo_softmax.vocab)

    bilm_outputs = elmo_bilm(char_ids)
    softmax_log_probs, softmax_mask = elmo_softmax(
        bilm_outputs, word_ids, aggregation_fun='mean')  # average backward and forward log probs

    print(softmax_log_probs.shape)
    print(softmax_mask.shape)
```
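For reference, here is a minimal sketch (not part of the gist) of how the per-token outputs printed above could be turned into one score per sentence. It assumes `softmax_log_probs` is a `(batch, timesteps)` tensor of per-token log probabilities and `softmax_mask` is a matching 0/1 mask; the actual shapes returned by `_ElmoSoftmax` may differ.

```python
import torch

def sentence_log_prob(log_probs: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # Sum token log probabilities over valid (unmasked) positions only.
    return (log_probs * mask.float()).sum(dim=-1)

def sentence_perplexity(log_probs: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
    # exp of the negative mean per-token log probability, per sentence.
    lengths = mask.float().sum(dim=-1).clamp(min=1.0)
    return torch.exp(-sentence_log_prob(log_probs, mask) / lengths)
```

With this, the four example sentences could be ranked by likelihood, which is the natural use of a softmax head on top of the biLM.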
tokestermw revised this gist
Sep 6, 2018. 1 changed file with 1 addition and 59 deletions.
```
@@ -1,61 +1,3 @@
allennlp elmo sample_sents.txt out2.hdf5 --top \
    --softmax-weight-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_softmax_weights.hdf5 \
    --softmax-vocab-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt
```
tokestermw revised this gist
Sep 6, 2018. 1 changed file with 15 additions and 2 deletions.
```
@@ -13,6 +13,19 @@
    --vocab-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt
python -c "import h5py; f = h5py.File('out2.hdf5'); print(f['0'][:], f['0'].shape)"

Save new example softmax file

import h5py
import numpy as np

e = h5py.File('./allennlp/tests/fixtures/elmo/elmo_token_embeddings.hdf5')
W = e['embedding'][:]
b = np.zeros_like(W[:, 0])

with h5py.File('elmo_softmax_weights.hdf5', 'w') as f:
    grp = f.create_group('softmax')
    grp.create_dataset('W', W.shape, dtype='float32', data=W)
    grp.create_dataset('b', b.shape, dtype='float32', data=b)
"""

from allennlp.modules.elmo import _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo, batch_to_ids

@@ -44,5 +57,5 @@ def _tokenize(text):
        bilm_outputs, word_ids, aggregation_fun='mean')  # average backward and forward log probs

    print(softmax_log_probs.shape)
    print(softmax_mask.shape)
```
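A quick sanity check for the fixture written above (a sketch, not from the gist; the `softmax/W` and `softmax/b` paths mirror the group and datasets created by the snippet):

```python
import h5py

with h5py.File('elmo_softmax_weights.hdf5', 'r') as f:
    W = f['softmax/W'][:]  # embedding matrix copied from the token-embeddings fixture
    b = f['softmax/b'][:]  # zero bias, one entry per vocabulary row
    assert b.shape[0] == W.shape[0]
    print(W.shape, W.dtype)  # (vocab rows, embedding dim), float32
    print(b.shape, b.dtype)  # (vocab rows,), float32
```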
tokestermw revised this gist
Sep 6, 2018. 1 changed file with 3 additions and 1 deletion.
```
@@ -8,7 +8,9 @@
To get probabilities:

allennlp elmo sample_sents.txt out2.hdf5 --top \
    --softmax-weight-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_softmax_weights.hdf5 \
    --vocab-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt
python -c "import h5py; f = h5py.File('out2.hdf5'); print(f['0'][:], f['0'].shape)"
"""
```
tokestermw revised this gist
Sep 5, 2018. 1 changed file with 2 additions and 55 deletions.
```
@@ -13,61 +13,8 @@
"""

from allennlp.modules.elmo import _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo, batch_to_ids
from allennlp.commands.elmo import DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE, DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE


def _tokenize(text):
```
tokestermw revised this gist
Sep 5, 2018. 1 changed file with 62 additions and 55 deletions.
```
@@ -1,13 +1,20 @@
"""
To use it inside ELMo script

To get the embeddings:

allennlp elmo sample_sents.txt out1.hdf5 --top
python -c "import h5py; f = h5py.File('out1.hdf5'); print(f['0'][:], f['0'].shape)"

To get probabilities:

allennlp elmo sample_sents.txt out2.hdf5 --top --softmax-weight-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_softmax_weights.hdf5 --vocab-file https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt
python -c "import h5py; f = h5py.File('out2.hdf5'); print(f['0'][:], f['0'].shape)"
"""

from allennlp.modules.elmo import (
    _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo, batch_to_ids
)

DEFAULT_OPTIONS_FILE = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"  # pylint: disable=line-too-long

@@ -17,57 +24,61 @@ DEFAULT_VOCAB_FILE = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt"  # pylint: disable=line-too-long
#
# def batch_to_ids(batch: List[List[str]], vocab: Vocabulary = None) -> Tuple[torch.Tensor, torch.Tensor]:
#     """
#     Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
#     (len(batch), max sentence length, max word length).
#
#     Parameters
#     ----------
#     batch : ``List[List[str]]``, required
#         A list of tokenized sentences.
#     vocab : ``Vocabulary``, optional
#         A vocab of words if you need to return word ids.
#
#     Returns
#     -------
#     If vocab is present, returns a tuple of char ids and word ids.
#     If not, it returns a tensor of char ids.
#     """
#     instances = []
#     char_indexer = ELMoTokenCharactersIndexer()
#     if vocab:
#         token_indexer = SingleIdTokenIndexer(
#             namespace='tokens', lowercase_tokens=False)
#     else:
#         token_indexer = None
#     for sentence in batch:
#         tokens = [Token(token) for token in sentence]
#         if vocab:
#             field = TextField(tokens, {
#                 'character_ids': char_indexer,
#                 'word_ids': token_indexer,
#             })
#         else:
#             field = TextField(tokens, {'character_ids': char_indexer})
#         instance = Instance({"elmo": field})
#         instances.append(instance)
#
#     dataset = Batch(instances)
#     dataset.index_instances(vocab)
#     elmo_tensor_dict = dataset.as_tensor_dict()['elmo']
#     if vocab:
#         return elmo_tensor_dict['character_ids'], elmo_tensor_dict['word_ids']
#     else:
#         return elmo_tensor_dict['character_ids']


def _tokenize(text):
    return text.split()


if __name__ == '__main__':
    # elmo_char_encoder - _ElmoCharacterEncoder

    elmo_bilm = _ElmoBiLm(DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE)
    elmo_softmax = _ElmoSoftmax(DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE)

    sentences = [
        'How are you ?',
        'how are you ?',

@@ -76,10 +87,6 @@ def _tokenize(text):
    ]
    sentences = [_tokenize(i) for i in sentences]

    char_ids, word_ids = batch_to_ids(sentences, elmo_softmax.vocab)

    bilm_outputs = elmo_bilm(char_ids)
```
tokestermw revised this gist
Sep 5, 2018. 1 changed file with 7 additions and 4 deletions.
```
@@ -2,10 +2,6 @@
import torch

from allennlp.data import Token, Instance, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField

@@ -14,6 +10,13 @@
    _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo,
    #batch_to_ids
)

DEFAULT_OPTIONS_FILE = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"  # pylint: disable=line-too-long
DEFAULT_WEIGHT_FILE = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"  # pylint: disable=line-too-long
# TODO: add softmax as an option to the elmo command
DEFAULT_SOFTMAX_FILE = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_softmax_weights.hdf5"  # pylint: disable=line-too-long
DEFAULT_VOCAB_FILE = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/vocab-2016-09-10.txt"  # pylint: disable=line-too-long

def batch_to_ids(batch: List[List[str]], vocab: Vocabulary = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
```
tokestermw created this gist
Sep 5, 2018.
```
@@ -0,0 +1,89 @@
from typing import List, Tuple

import torch

from allennlp.commands.elmo import (
    DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE, DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE,
    ElmoEmbedder,
)
from allennlp.data import Token, Instance, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer, SingleIdTokenIndexer
from allennlp.modules.elmo import (
    _ElmoCharacterEncoder, _ElmoBiLm, _ElmoSoftmax, Elmo,
    #batch_to_ids
)


def batch_to_ids(batch: List[List[str]], vocab: Vocabulary = None) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.
    vocab : ``Vocabulary``, optional
        A vocab of words if you need to return word ids.

    Returns
    -------
    If vocab is present, returns a tuple of char ids and word ids.
    If not, it returns a tensor of char ids.
    """
    instances = []
    char_indexer = ELMoTokenCharactersIndexer()
    if vocab:
        token_indexer = SingleIdTokenIndexer(
            namespace='tokens', lowercase_tokens=False)
    else:
        token_indexer = None
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        if vocab:
            field = TextField(tokens, {
                'character_ids': char_indexer,
                'word_ids': token_indexer,
            })
        else:
            field = TextField(tokens, {'character_ids': char_indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    dataset.index_instances(vocab)
    elmo_tensor_dict = dataset.as_tensor_dict()['elmo']
    if vocab:
        return elmo_tensor_dict['character_ids'], elmo_tensor_dict['word_ids']
    else:
        return elmo_tensor_dict['character_ids']


def _tokenize(text):
    return text.split()


if __name__ == '__main__':
    sentences = [
        'How are you ?',
        'how are you ?',
        'How are you .',
        'You are how ?',
    ]
    sentences = [_tokenize(i) for i in sentences]

    # elmo_char_encoder - _ElmoCharacterEncoder

    elmo_bilm = _ElmoBiLm(DEFAULT_OPTIONS_FILE, DEFAULT_WEIGHT_FILE)
    elmo_softmax = _ElmoSoftmax(DEFAULT_SOFTMAX_FILE, DEFAULT_VOCAB_FILE)

    char_ids, word_ids = batch_to_ids(sentences, elmo_softmax.vocab)

    bilm_outputs = elmo_bilm(char_ids)
    softmax_log_probs, softmax_mask = elmo_softmax(
        bilm_outputs, word_ids, aggregation_fun='mean')  # average backward and forward log probs

    print(softmax_log_probs)
    print(softmax_mask)
```
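A small usage sketch for the `batch_to_ids` defined above, mirroring the call in the `__main__` block (the example sentence is illustrative; it assumes `elmo_softmax` has already been constructed as shown):

```python
# With a vocab, both character ids and word ids come back.
char_ids, word_ids = batch_to_ids([['Hello', 'world', '!']], elmo_softmax.vocab)
print(char_ids.shape)  # e.g. torch.Size([1, 3, 50]): batch, tokens, chars per token (ELMo pads tokens to 50 char ids)
print(word_ids.shape)  # e.g. torch.Size([1, 3]): batch, tokens
```

The char ids feed the character CNN encoder, while the word ids are only needed to look up target tokens in the softmax vocabulary, which is why the stock `batch_to_ids` (without the vocab argument) suffices for plain embedding extraction.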