## install

# apt-get update && apt-get install -y \
    build-essential \
    wget \
    git \
    python-dev \
    unzip \
    python-numpy \
    python-scipy \
    && rm -rf /var/lib/apt/lists/*

## build fastText

# git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
    rm -rf /tmp/fastText/.git* && \
    mv /tmp/fastText/* / && \
    cd / && \
    make
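
Because of the mv above, the binary ends up at the filesystem root. As a quick sanity check (not part of the original steps), running it with no arguments should print its list of supported commands:

/fasttext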

## prepare the Yelp review data

# wget https://s3.amazonaws.com/datameetgeo/datameetgeo/yelp/yelp_dataset.tar
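
The script below reads dataset/review.json, so the download has to be unpacked first. A sketch, assuming the tarball actually contains review.json (the file names inside the Yelp download have varied between releases, so adjust the path to match whatever unpacks):

mkdir -p dataset
tar -xvf yelp_dataset.tar -C dataset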

import json
from pathlib import Path
import re
import random

reviews_data = Path("dataset") / "review.json"
training_data = Path("fasttext_dataset_training.txt")
test_data = Path("fasttext_dataset_test.txt")

# What percent of data to save separately as test data
percent_test_data = 0.10

def strip_formatting(string):
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string

with reviews_data.open() as input, \
     training_data.open("w") as train_output, \
     test_data.open("w") as test_output:

    for line in input:
        review_data = json.loads(line)

        rating = review_data['stars']
        text = review_data['text'].replace("\n", " ")
        text = strip_formatting(text)

        fasttext_line = "__label__{} {}".format(rating, text)

        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")
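
Each line in the generated files is a fastText label (the star rating) followed by the cleaned review text. The two lines below are made-up examples purely to show the format, not real output:

__label__5 i love this place ! the food is great .
__label__1 terrible service . never going back .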

## train and test

/fasttext supervised -input fasttext_dataset_training.txt -output reviews_model

/fasttext test reviews_model.bin fasttext_dataset_test.txt
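
If accuracy on the test file is low with the defaults, fastText's standard tuning flags (learning rate, number of epochs, word n-grams) usually help. The values below are only a common starting point, not something taken from these notes:

/fasttext supervised -input fasttext_dataset_training.txt -output reviews_model \
    -lr 1.0 -epoch 25 -wordNgrams 2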

/fasttext predict reviews_model.bin -
this is a terrible restaurant . i hate it so much .
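
The trailing - makes predict read from stdin, so the model can also be queried non-interactively by piping text in. Whatever you feed it should be preprocessed the same way as the training data (lowercased, punctuation split into separate tokens), which the sample sentence above already is:

echo "this is a terrible restaurant . i hate it so much ." | /fasttext predict reviews_model.bin -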

## spaCy and textacy

# conda install -c conda-forge spacy
# python -m spacy download en_core_web_lg
# pip install -U textacy

import spacy
import textacy

# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine
text = """London is the capital and most populous city of England and
the United Kingdom. Standing on the River Thames in the south east
of the island of Great Britain, London has been a major settlement
for two millennia. It was founded by the Romans, who named it Londinium.
"""

# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")
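
As a small extension that is not in the original snippet, the same parsed doc can be reused for other lookups. For example, spaCy labels countries, cities and states as GPE entities, so this would list only the place names it found:

# Uses the 'doc' object from the snippet above.
# GPE is spaCy's label for geopolitical entities (countries, cities, states).
place_names = set(entity.text for entity in doc.ents if entity.label_ == "GPE")
print(place_names)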

## coreference resolution with neuralcoref

# pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz

import en_coref_md

nlp = en_coref_md.load()
doc = nlp(u'My sister has a dog. She loves him.')

# Bare expressions only display in a REPL, so print them to see the results:
print(doc._.has_coref)       # True if any coreference was found
print(doc._.coref_clusters)  # Clusters of mentions that refer to the same thing
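
neuralcoref also exposes the text with each pronoun replaced by the mention it resolves to, which is often the most useful output. A minimal follow-up, assuming the same doc as above:

# Prints the input with pronouns swapped for their referents,
# e.g. roughly "My sister has a dog. My sister loves a dog."
print(doc._.coref_resolved)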