Skip to content

Instantly share code, notes, and snippets.

@weedge
Forked from awni/ctc_decoder.py
Created December 21, 2024 16:25
Show Gist options
  • Select an option

  • Save weedge/afff21de35548298ef156dbcef90a711 to your computer and use it in GitHub Desktop.

Select an option

Save weedge/afff21de35548298ef156dbcef90a711 to your computer and use it in GitHub Desktop.

Revisions

  1. @awni awni revised this gist Feb 28, 2018. 1 changed file with 6 additions and 6 deletions.
    12 changes: 6 additions & 6 deletions ctc_decoder.py
    Original file line number Diff line number Diff line change
    @@ -102,12 +102,12 @@ def decode(probs, beam_size=100, blank=0):
    n_p_nb = logsumexp(n_p_nb, p_nb + p)
    next_beam[prefix] = (n_p_b, n_p_nb)

    # Sort and trim the beam before moving on to the
    # next time-step.
    beam = sorted(next_beam.items(),
    key=lambda x : logsumexp(*x[1]),
    reverse=True)
    beam = beam[:beam_size]
    # Sort and trim the beam before moving on to the
    # next time-step.
    beam = sorted(next_beam.items(),
    key=lambda x : logsumexp(*x[1]),
    reverse=True)
    beam = beam[:beam_size]

    best = beam[0]
    return best[0], -logsumexp(*best[1])
  2. @awni awni revised this gist Nov 30, 2017. 1 changed file with 4 additions and 1 deletion.
    5 changes: 4 additions & 1 deletion ctc_decoder.py
    Original file line number Diff line number Diff line change
    @@ -8,7 +8,10 @@
    The algorithm is a prefix beam search for a model trained
    with the CTC loss function.
    See https://arxiv.org/pdf/1408.2873.pdf for more details.
    For more details checkout either of these references:
    https://distill.pub/2017/ctc/#inference
    https://arxiv.org/abs/1408.2873
    """

    import numpy as np
  3. @awni awni revised this gist Nov 5, 2017. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions ctc_decoder.py
    Original file line number Diff line number Diff line change
    @@ -75,9 +75,9 @@ def decode(probs, beam_size=100, blank=0):
    next_beam[prefix] = (n_p_b, n_p_nb)
    continue

    # Extend the prefix by the new character s and it to the
    # beam. Only the probability of not ending in blank gets
    # updated.
    # Extend the prefix by the new character s and add it to
    # the beam. Only the probability of not ending in blank
    # gets updated.
    end_t = prefix[-1] if prefix else None
    n_prefix = prefix + (s,)
    n_p_b, n_p_nb = next_beam[n_prefix]
  4. @awni awni revised this gist Nov 5, 2017. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions ctc_decoder.py
    Original file line number Diff line number Diff line change
    @@ -106,8 +106,8 @@ def decode(probs, beam_size=100, blank=0):
    reverse=True)
    beam = beam[:beam_size]

    best = beam[0]
    return best[0], -logsumexp(*best[1])
    best = beam[0]
    return best[0], -logsumexp(*best[1])

    if __name__ == "__main__":
    np.random.seed(3)
  5. @awni awni revised this gist Sep 29, 2017. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions ctc_decoder.py
    Original file line number Diff line number Diff line change
    @@ -2,11 +2,11 @@
    Author: Awni Hannun
    This is an example CTC decoder written in Python. The code is
    intended to be a simple example of a CTC decoder and is not
    designed to be especially efficient.
    intended to be a simple example and is not designed to be
    especially efficient.
    The algorithm is a prefix beam search intended for a model
    trained with the CTC loss function.
    The algorithm is a prefix beam search for a model trained
    with the CTC loss function.
    See https://arxiv.org/pdf/1408.2873.pdf for more details.
    """
  6. @awni awni revised this gist Aug 7, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion ctc_decoder.py
    Original file line number Diff line number Diff line change
    @@ -89,7 +89,7 @@ def decode(probs, beam_size=100, blank=0):
    # algorithm merges characters not separated by a blank.
    n_p_nb = logsumexp(n_p_nb, p_b + p)

    # *NB* this would be a good place to insert an LM score.
    # *NB* this would be a good place to include an LM score.
    next_beam[n_prefix] = (n_p_b, n_p_nb)

    # If s is repeated at the end we also update the unchanged
  7. @awni awni created this gist Aug 7, 2017.
    122 changes: 122 additions & 0 deletions ctc_decoder.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,122 @@
    """
    Author: Awni Hannun
    This is an example CTC decoder written in Python. The code is
    intended to be a simple example of a CTC decoder and is not
    designed to be especially efficient.
    The algorithm is a prefix beam search intended for a model
    trained with the CTC loss function.
    See https://arxiv.org/pdf/1408.2873.pdf for more details.
    """

    import numpy as np
    import math
    import collections

    NEG_INF = -float("inf")

    def make_new_beam():
    fn = lambda : (NEG_INF, NEG_INF)
    return collections.defaultdict(fn)

    def logsumexp(*args):
    """
    Stable log sum exp.
    """
    if all(a == NEG_INF for a in args):
    return NEG_INF
    a_max = max(args)
    lsp = math.log(sum(math.exp(a - a_max)
    for a in args))
    return a_max + lsp

    def decode(probs, beam_size=100, blank=0):
    """
    Performs inference for the given output probabilities.
    Arguments:
    probs: The output probabilities (e.g. post-softmax) for each
    time step. Should be an array of shape (time x output dim).
    beam_size (int): Size of the beam to use during inference.
    blank (int): Index of the CTC blank label.
    Returns the output label sequence and the corresponding negative
    log-likelihood estimated by the decoder.
    """
    T, S = probs.shape
    probs = np.log(probs)

    # Elements in the beam are (prefix, (p_blank, p_no_blank))
    # Initialize the beam with the empty sequence, a probability of
    # 1 for ending in blank and zero for ending in non-blank
    # (in log space).
    beam = [(tuple(), (0.0, NEG_INF))]

    for t in range(T): # Loop over time

    # A default dictionary to store the next step candidates.
    next_beam = make_new_beam()

    for s in range(S): # Loop over vocab
    p = probs[t, s]

    # The variables p_b and p_nb are respectively the
    # probabilities for the prefix given that it ends in a
    # blank and does not end in a blank at this time step.
    for prefix, (p_b, p_nb) in beam: # Loop over beam

    # If we propose a blank the prefix doesn't change.
    # Only the probability of ending in blank gets updated.
    if s == blank:
    n_p_b, n_p_nb = next_beam[prefix]
    n_p_b = logsumexp(n_p_b, p_b + p, p_nb + p)
    next_beam[prefix] = (n_p_b, n_p_nb)
    continue

    # Extend the prefix by the new character s and it to the
    # beam. Only the probability of not ending in blank gets
    # updated.
    end_t = prefix[-1] if prefix else None
    n_prefix = prefix + (s,)
    n_p_b, n_p_nb = next_beam[n_prefix]
    if s != end_t:
    n_p_nb = logsumexp(n_p_nb, p_b + p, p_nb + p)
    else:
    # We don't include the previous probability of not ending
    # in blank (p_nb) if s is repeated at the end. The CTC
    # algorithm merges characters not separated by a blank.
    n_p_nb = logsumexp(n_p_nb, p_b + p)

    # *NB* this would be a good place to insert an LM score.
    next_beam[n_prefix] = (n_p_b, n_p_nb)

    # If s is repeated at the end we also update the unchanged
    # prefix. This is the merging case.
    if s == end_t:
    n_p_b, n_p_nb = next_beam[prefix]
    n_p_nb = logsumexp(n_p_nb, p_nb + p)
    next_beam[prefix] = (n_p_b, n_p_nb)

    # Sort and trim the beam before moving on to the
    # next time-step.
    beam = sorted(next_beam.items(),
    key=lambda x : logsumexp(*x[1]),
    reverse=True)
    beam = beam[:beam_size]

    best = beam[0]
    return best[0], -logsumexp(*best[1])

    if __name__ == "__main__":
    np.random.seed(3)

    time = 50
    output_dim = 20

    probs = np.random.rand(time, output_dim)
    probs = probs / np.sum(probs, axis=1, keepdims=True)

    labels, score = decode(probs)
    print("Score {:.3f}".format(score))