## install

# apt-get update && apt-get install -y \
    build-essential \
    wget \
    git \
    python-dev \
    unzip \
    python-numpy \
    python-scipy \
    && rm -rf /var/lib/apt/lists/*

## build fastText

# git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
    rm -rf /tmp/fastText/.git* && \
    mv /tmp/fastText/* / && \
    cd / && \
    make
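
Because of the mv above, the binary ends up at the filesystem root. As a quick sanity check (not part of the original steps), running it with no arguments should print its list of supported commands:

/fasttext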

## prepare the Yelp review data

# wget https://s3.amazonaws.com/datameetgeo/datameetgeo/yelp/yelp_dataset.tar
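
The script below reads dataset/review.json, so the download has to be unpacked first. A sketch, assuming the tarball actually contains review.json (the file names inside the Yelp download have varied between releases, so adjust the path to match whatever unpacks):

mkdir -p dataset
tar -xvf yelp_dataset.tar -C dataset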

import json
from pathlib import Path
import re
import random

reviews_data = Path("dataset") / "review.json"
training_data = Path("fasttext_dataset_training.txt")
test_data = Path("fasttext_dataset_test.txt")

# What percent of data to save separately as test data
percent_test_data = 0.10

def strip_formatting(string):
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string

with reviews_data.open() as input, \
     training_data.open("w") as train_output, \
     test_data.open("w") as test_output:

    for line in input:
        review_data = json.loads(line)

        rating = review_data['stars']
        text = review_data['text'].replace("\n", " ")
        text = strip_formatting(text)

        fasttext_line = "__label__{} {}".format(rating, text)

        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")
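
Each line in the generated files is a fastText label (the star rating) followed by the cleaned review text. The two lines below are made-up examples purely to show the format, not real output:

__label__5 i love this place ! the food is great .
__label__1 terrible service . never going back .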

## train and test

/fasttext supervised -input fasttext_dataset_training.txt -output reviews_model

/fasttext test reviews_model.bin fasttext_dataset_test.txt
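
If accuracy on the test file is low with the defaults, fastText's standard tuning flags (learning rate, number of epochs, word n-grams) usually help. The values below are only a common starting point, not something taken from these notes:

/fasttext supervised -input fasttext_dataset_training.txt -output reviews_model \
    -lr 1.0 -epoch 25 -wordNgrams 2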

/fasttext predict reviews_model.bin -
this is a terrible restaurant . i hate it so much .
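
The trailing - makes predict read from stdin, so the model can also be queried non-interactively by piping text in. Whatever you feed it should be preprocessed the same way as the training data (lowercased, punctuation split into separate tokens), which the sample sentence above already is:

echo "this is a terrible restaurant . i hate it so much ." | /fasttext predict reviews_model.bin -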

## spaCy and textacy

# conda install -c conda-forge spacy
# python -m spacy download en_core_web_lg
# pip install -U textacy

import spacy
import textacy

# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine
text = """London is the capital and most populous city of England and
the United Kingdom. Standing on the River Thames in the south east
of the island of Great Britain, London has been a major settlement
for two millennia. It was founded by the Romans, who named it Londinium.
"""

# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")
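
As a small extension that is not in the original snippet, the same parsed doc can be reused for other lookups. For example, spaCy labels countries, cities and states as GPE entities, so this would list only the place names it found:

# Uses the 'doc' object from the snippet above.
# GPE is spaCy's label for geopolitical entities (countries, cities, states).
place_names = set(entity.text for entity in doc.ents if entity.label_ == "GPE")
print(place_names)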

## coreference resolution with neuralcoref

# pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz

import en_coref_md

nlp = en_coref_md.load()
doc = nlp(u'My sister has a dog. She loves him.')

# Bare expressions only display in a REPL, so print them to see the results:
print(doc._.has_coref)       # True if any coreference was found
print(doc._.coref_clusters)  # Clusters of mentions that refer to the same thing
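
neuralcoref also exposes the text with each pronoun replaced by the mention it resolves to, which is often the most useful output. A minimal follow-up, assuming the same doc as above:

# Prints the input with pronouns swapped for their referents,
# e.g. roughly "My sister has a dog. My sister loves a dog."
print(doc._.coref_resolved)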