## nlp.py (forked from shantanuo/nlp.py)

## install system packages and Python build dependencies
# apt-get update && apt-get install -y \
#     build-essential \
#     wget \
#     git \
#     python-dev \
#     unzip \
#     python-numpy \
#     python-scipy \
#     && rm -rf /var/lib/apt/lists/*


# git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
#     rm -rf /tmp/fastText/.git* && \
#     mv /tmp/fastText/* / && \
#     cd / && \
#     make

    # wget https://s3.amazonaws.com/datameetgeo/datameetgeo/yelp/yelp_dataset.tar

    import json
    from pathlib import Path
    import re
    import random

    reviews_data = Path("dataset") / "review.json"
    training_data = Path("fasttext_dataset_training.txt")
    test_data = Path("fasttext_dataset_test.txt")

    # What percent of data to save separately as test data
    percent_test_data = 0.10

def strip_formatting(string):
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string
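
# Quick illustration of what strip_formatting does (hypothetical sample
# sentence, not taken from the Yelp data): the text is lower-cased and
# punctuation is padded with spaces so fastText treats it as separate tokens.
print(strip_formatting("Great food, terrible service!"))
# -> "great food ,  terrible service ! "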

with reviews_data.open() as input, \
     training_data.open("w") as train_output, \
     test_data.open("w") as test_output:

    for line in input:
        review_data = json.loads(line)

        rating = review_data['stars']
        text = review_data['text'].replace("\n", " ")
        text = strip_formatting(text)

        fasttext_line = "__label__{} {}".format(rating, text)

        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")


## train the classifier, evaluate it on the held-out split, then predict a
## label for a review typed on stdin (shell commands)
# /fasttext supervised -input fasttext_dataset_training.txt -output reviews_model

# /fasttext test reviews_model.bin fasttext_dataset_test.txt

# /fasttext predict reviews_model.bin -
# this is a terrible restaurant . i hate it so much .
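
## The same train / test / predict steps from Python: only a sketch, assuming
## the official `fasttext` package (pip install fasttext) is installed; it
## mirrors the CLI calls above and is not part of the original gist.
import fasttext

# train on the prepared training file (same as `/fasttext supervised ...`)
model = fasttext.train_supervised(input="fasttext_dataset_training.txt")
model.save_model("reviews_model.bin")

# evaluate: returns (number of examples, precision@1, recall@1)
print(model.test("fasttext_dataset_test.txt"))

# predict a label for one review (same as piping text to `/fasttext predict`)
print(model.predict("this is a terrible restaurant . i hate it so much ."))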




    # conda install -c conda-forge spacy

    # python -m spacy download en_core_web_lg

    # pip install -U textacy

    import spacy
    import textacy

    # Load the large English NLP model
    nlp = spacy.load('en_core_web_lg')

    # The text we want to examine
    text = """London is the capital and most populous city of England and
    the United Kingdom. Standing on the River Thames in the south east
    of the island of Great Britain, London has been a major settlement
    for two millennia. It was founded by the Romans, who named it Londinium.
    """

    # Parse the text with spaCy. This runs the entire pipeline.
    doc = nlp(text)

    # 'doc' now contains a parsed version of text. We can use it to do anything we want!
    # For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

    # Extract semi-structured statements
    statements = textacy.extract.semistructured_statements(doc, "London")

    # Print the results
    print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")


# pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz

    import en_coref_md

    nlp = en_coref_md.load()
    doc = nlp(u'My sister has a dog. She loves him.')

# True if any coreference clusters were found in the document
print(doc._.has_coref)
# The clusters themselves: "She" refers back to "My sister", "him" to "a dog"
print(doc._.coref_clusters)
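
# Possible follow-up (assuming this neuralcoref release exposes the
# `coref_resolved` extension attribute): rewrite the text with every
# mention replaced by the main mention of its cluster.
print(doc._.coref_resolved)
# -> roughly "My sister has a dog. My sister loves a dog."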