Skip to content

Instantly share code, notes, and snippets.

@wang-zifu
Forked from shantanuo/nlp.py
Created May 9, 2020 04:48
Show Gist options
  • Select an option

  • Save wang-zifu/03937cd9cd506c8aec6e6b5c1f305611 to your computer and use it in GitHub Desktop.

Select an option

Save wang-zifu/03937cd9cd506c8aec6e6b5c1f305611 to your computer and use it in GitHub Desktop.
NLP using spacy and other modules
## install
# apt-get update && apt-get install -y \
#     build-essential \
#     wget \
#     git \
#     python-dev \
#     unzip \
#     python-numpy \
#     python-scipy \
#     && rm -rf /var/cache/apk/*
# git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
#     rm -rf /tmp/fastText/.git* && \
#     mv /tmp/fastText/* / && \
#     cd / && \
#     make
# wget https://s3.amazonaws.com/datameetgeo/datameetgeo/yelp/yelp_dataset.tar
import json
from pathlib import Path
import re
import random
# Input: Yelp academic dataset, one JSON review object per line.
reviews_data = Path("dataset") / "review.json"
# Outputs: fastText-formatted training file and held-out test file.
training_data = Path("fasttext_dataset_training.txt")
test_data = Path("fasttext_dataset_test.txt")
# What percent of data to save separately as test data
percent_test_data = 0.10
def strip_formatting(string):
    """Normalize *string* for fastText: lowercase it and pad punctuation.

    Each punctuation mark in .!?,'/() is surrounded with spaces so it
    becomes its own whitespace-separated token.
    """
    string = string.lower()
    # \1 re-inserts the captured punctuation mark between the padding spaces.
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string
# Stream the Yelp reviews and split them into fastText train/test files.
# Each output line has the form "__label__<stars> <normalized text>".
# NOTE: renamed the file handle from `input` — it shadowed the builtin.
with reviews_data.open() as reviews_file, \
        training_data.open("w") as train_output, \
        test_data.open("w") as test_output:
    for line in reviews_file:
        review_data = json.loads(line)
        rating = review_data['stars']
        # Flatten newlines so each review occupies exactly one output line.
        text = review_data['text'].replace("\n", " ")
        text = strip_formatting(text)
        fasttext_line = "__label__{} {}".format(rating, text)
        # Randomly route ~percent_test_data of reviews to the test file.
        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")
# ./fasttext supervised -input fasttext_dataset_training.txt -output reviews_model
# ./fasttext test reviews_model.bin fasttext_dataset_test.txt
# ./fasttext predict reviews_model.bin -
#   this is a terrible restaurant . i hate it so much .
# conda install -c conda-forge spacy
# python -m spacy download en_core_web_lg
# pip install -U textacy
# Named-entity extraction and semi-structured fact extraction demo.
# Restored the for-loop body indentation that was lost in the paste.
import spacy
import textacy

# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine
text = """London is the capital and most populous city of England and
the United Kingdom. Standing on the River Thames in the south east
of the island of Great Britain, London has been a major settlement
for two millennia. It was founded by the Romans, who named it Londinium.
"""

# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

# Extract semi-structured statements about "London".
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")
for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")
#pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz
# Coreference resolution with the neuralcoref spaCy extension model.
import en_coref_md
# Load the spaCy model that bundles the coreference weights.
nlp = en_coref_md.load()
doc = nlp(u'My sister has a dog. She loves him.')
# Bare expressions: meaningful in a REPL/notebook, no effect in a script.
# Whether any coreference cluster was detected in the parsed doc.
doc._.has_coref
# The detected coreference clusters (mention groups referring to one entity).
doc._.coref_clusters
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment