streamlit_prodigy.py
@ines, created October 3, 2019
    """
    Example of a Streamlit app for an interactive Prodigy dataset viewer that also lets you
    run simple training experiments for NER and text classification.
    Requires the Prodigy annotation tool to be installed: https://prodi.gy
    See here for details on Streamlit: https://streamlit.io.
    """
import streamlit as st
from prodigy.components.db import connect
from prodigy.models.ner import EntityRecognizer, merge_spans, guess_batch_size
from prodigy.models.textcat import TextClassifier
from prodigy.util import split_evals
import pandas as pd
import spacy
from spacy import displacy
from spacy.util import filter_spans, minibatch
import random


SPACY_MODEL_NAMES = ["en_core_web_sm"]
EXC_FIELDS = ["meta", "priority", "score"]
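# Scrollable wrapper so wide displaCy visualizations don't overflow the page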
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
COLOR_ACCEPT = "#93eaa1"
COLOR_REJECT = "#ff8f8e"


def guess_dataset_type(first_eg):
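    # Guess the annotation type from the keys present in the first example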
    if "image" in first_eg:
    return "image"
    if "arc" in first_eg:
    return "dep"
    if "options" in first_eg or "label" in first_eg:
    return "textcat"
    if "spans" in first_eg:
    return "ner"
    return "other"


def get_answer_counts(examples):
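    # Count accept / reject / ignore answers across all examples in the dataset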
    result = {"accept": 0, "reject": 0, "ignore": 0}
    for eg in examples:
    answer = eg.get("answer")
    if answer:
    result[answer] += 1
    return result


def format_label(label, answer="accept"):
    # Hack to use different colors for the label (by adding a zero-width space)
    return f"{label}\u200B" if answer == "reject" else label


st.sidebar.title("Prodigy Data Explorer")
db = connect()
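# connect() uses the database settings from your Prodigy configuration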
db_sets = db.datasets
placeholder = "Select dataset..."
dataset = st.sidebar.selectbox(f"Datasets ({len(db_sets)})", [placeholder] + db_sets)
if dataset != placeholder:
    examples = db.get_dataset(dataset)
    st.header(f"{dataset} ({len(examples)})")
    if not len(examples):
        st.markdown("_Empty dataset._")
    else:
        counts = get_answer_counts(examples)
        st.markdown(", ".join(f"**{c}** {a}" for a, c in counts.items()))
        dataset_types = ["ner", "textcat", "dep", "pos", "image", "other"]
        guessed_index = dataset_types.index(guess_dataset_type(examples[0]))
        set_type = st.sidebar.selectbox("Dataset type", dataset_types, guessed_index)
        fields = list(examples[0].keys())
        default_fields = [f for f in fields if f[0] != "_" and f not in EXC_FIELDS]
        task_fields = st.sidebar.multiselect("Visible fields", fields, default_fields)
        st.dataframe(pd.DataFrame(examples).filter(task_fields), height=500)

        if set_type in ["ner", "textcat"]:
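            # Shared sidebar controls: spaCy model and training hyperparameters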
            st.sidebar.header("Viewer options")
            purpose = "tokenization & training" if set_type == "ner" else "training"
            spacy_model_title = f"spaCy model for {purpose}"
            spacy_model = st.sidebar.selectbox(spacy_model_title, SPACY_MODEL_NAMES)
            st.sidebar.subheader("Training configuration")
            n_iter = st.sidebar.slider("Number of iterations", 1, 100, 5, 1)
            dropout = st.sidebar.slider("Dropout rate", 0.0, 1.0, 0.2, 0.05)
            eval_split_label = "% of examples held back for evaluation"
            eval_split = st.sidebar.slider(eval_split_label, 0.0, 1.0, 0.2, 0.05)

        if set_type == "ner":
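            # Merge all annotations on the same text and render them with displaCy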
    st.subheader("Named entity viewer")
    nlp = spacy.load(spacy_model)
    merged_examples = merge_spans(list(examples))
    all_labels = set()
    for eg in merged_examples:
    for span in eg["spans"]:
    all_labels.add(span["label"])
    colors = {}
    for label in all_labels:
    colors[label] = COLOR_ACCEPT
    colors[format_label(label, "reject")] = COLOR_REJECT
    ner_example_i = st.selectbox(
    f"Merged examples ({len(merged_examples)})",
    range(len(merged_examples)),
    format_func=lambda i: merged_examples[int(i)]["text"][:400],
    )
    ner_example = merged_examples[int(ner_example_i)]
    doc = nlp.make_doc(ner_example["text"])
    ents = []
            for span in ner_example.get("spans", []):
                label = format_label(span["label"], span["answer"])
                ent = doc.char_span(span["start"], span["end"], label=label)
                if ent is not None:  # char_span returns None for misaligned offsets
                    ents.append(ent)
            doc.ents = filter_spans(ents)
            html = displacy.render(doc, style="ent", options={"colors": colors})
            html = html.replace("\n", " ")  # Newlines seem to mess with the rendering
            st.write(HTML_WRAPPER.format(html), unsafe_allow_html=True)
            show_ner_example_json = st.checkbox("Show JSON example")
            if show_ner_example_json:
                st.json(ner_example)

    st.subheader("Train a model (experimental)")
    no_missing = st.checkbox(
    "Data is gold-standard and contains no missing values", False
    )
    start_blank = st.checkbox("Start with blank NER model", True)
    if st.button("🚀 Start training"):
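                # Either reset the NER component or fine-tune the loaded model's weights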
                if start_blank:
                    ner = nlp.create_pipe("ner")
                    if "ner" in nlp.pipe_names:
                        nlp.replace_pipe("ner", ner)
                    else:
                        nlp.add_pipe(ner)
                    ner.begin_training([])
                else:
                    ner = nlp.get_pipe("ner")
                for label in all_labels:
                    ner.add_label(label)
                # Shuffle the merged examples before splitting off the evaluation set
                random.shuffle(merged_examples)
                train_examples, evals, eval_split = split_evals(
                    merged_examples, eval_split
                )
                st.success(
                    f"✅ Using **{len(train_examples)}** training examples "
                    f"and **{len(evals)}** evaluation examples with "
                    f"**{len(all_labels)}** label(s)"
                )
                annot_model = EntityRecognizer(
                    nlp, label=all_labels, no_missing=no_missing
                )
                batch_size = guess_batch_size(len(train_examples))
                baseline = annot_model.evaluate(evals)
                st.info(
                    f"ℹ️ **Baseline**\n**{baseline['right']:.0f}** right "
                    f"entities, **{baseline['wrong']:.0f}** wrong entities, "
                    f"**{baseline['unk']:.0f}** unknown entities, "
                    f"**{baseline['ents']:.0f}** total predicted, "
                    f"**{baseline['acc']:.2f}** accuracy"
                )
                progress = st.progress(0)
                results = []
                result_table = st.empty()
                best_acc = 0.0
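                # Train for n_iter epochs, re-evaluating and updating the live table each time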
                for i in range(n_iter):
                    random.shuffle(train_examples)
                    losses = annot_model.batch_train(
                        train_examples,
                        batch_size=batch_size,
                        drop=dropout,
                        beam_width=16,
                    )
                    stats = annot_model.evaluate(evals)
                    stats = {
                        "Right": stats["right"],
                        "Wrong": stats["wrong"],
                        "Unknown": stats["unk"],
                        "Predicted Ents": stats["ents"],
                        "Loss": losses["ner"],
                        "Accuracy": round(stats["acc"], 3),
                    }
                    best_acc = max(best_acc, stats["Accuracy"])

                    def highlight(v):
                        # Highlight the best accuracy seen so far in the results table
                        is_best = v != 0 and v == best_acc
                        return f"background: {'yellow' if is_best else 'white'}"

                    results.append(stats)
                    results_df = pd.DataFrame(results, dtype="float")
                    result_table.dataframe(results_df.style.applymap(highlight))
                    progress.progress(int((i + 1) / n_iter * 100))

        elif set_type == "textcat":
            st.subheader("Train a model (experimental)")
            exclusive = st.checkbox("Labels are mutually exclusive", False)
            if st.button("🚀 Start training"):
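                # Collect the label set from both multiple-choice and binary examples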
                nlp = spacy.load(spacy_model)
                examples = list(examples)
                all_labels = set()
                for eg in examples:
                    all_labels.update(eg.get("accept", []))
                    if "label" in eg:
                        all_labels.add(eg["label"])
                textcat = nlp.create_pipe("textcat")
                for label in all_labels:
                    textcat.add_label(label)
                textcat.begin_training()
                nlp.add_pipe(textcat)
                random.shuffle(examples)
                train_examples, evals, eval_split = split_evals(examples, eval_split)
                st.success(
                    f"✅ Using **{len(train_examples)}** training examples "
                    f"and **{len(evals)}** evaluation examples with "
                    f"**{len(all_labels)}** label(s)"
                )
                annot_model = TextClassifier(
                    nlp,
                    all_labels,
                    low_data=len(train_examples) < 1000,
                    exclusive_classes=exclusive,
                )
                progress = st.progress(0)
                results = []
                result_table = st.empty()
                best_acc = 0.0
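                # Train for n_iter epochs; evaluate with averaged parameters each epoch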
                for i in range(n_iter):
                    loss = 0.0
                    random.shuffle(train_examples)
                    for batch in minibatch(train_examples, size=10):
                        batch = list(batch)
                        loss += annot_model.update(batch, revise=False, drop=dropout)
                    with nlp.use_params(annot_model.optimizer.averages):
                        stats = annot_model.evaluate(evals)
                    stats = {
                        "Loss": loss,
                        "F-Score": stats["fscore"],
                        "Accuracy": round(stats["accuracy"], 3),
                    }
                    best_acc = max(best_acc, stats["Accuracy"])

                    def highlight(v):
                        is_best = v != 0 and v == best_acc
                        return f"background: {'yellow' if is_best else 'white'}"

                    results.append(stats)
                    results_df = pd.DataFrame(results, dtype="float").round(3)
                    result_table.dataframe(results_df.style.applymap(highlight))
                    progress.progress(int((i + 1) / n_iter * 100))