## install

Install the build dependencies, compile fastText, and download the Yelp reviews dataset:

```bash
apt-get update && apt-get install -y \
    build-essential \
    wget \
    git \
    python-dev \
    unzip \
    python-numpy \
    python-scipy \
 && rm -rf /var/lib/apt/lists/*

git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
    rm -rf /tmp/fastText/.git* && \
    mv /tmp/fastText/* / && \
    cd / && \
    make

wget https://s3.amazonaws.com/datameetgeo/datameetgeo/yelp/yelp_dataset.tar
```

After downloading, extract the archive so that `review.json` ends up in a `dataset/` directory, which is where the script below expects to find it.

## prepare the training data

Each line of `review.json` is a single JSON-encoded review. The script below normalizes the review text, formats each review as a fastText example (`__label__<stars>` followed by the text), and holds out roughly 10% of the reviews as test data:

```python
import json
import random
import re
from pathlib import Path

reviews_data = Path("dataset") / "review.json"
training_data = Path("fasttext_dataset_training.txt")
test_data = Path("fasttext_dataset_test.txt")

# What fraction of the data to hold out as test data
percent_test_data = 0.10

def strip_formatting(string):
    # Lowercase the text and put spaces around punctuation so that
    # fastText sees punctuation marks as separate tokens
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string

with reviews_data.open() as input_file, \
     training_data.open("w") as train_output, \
     test_data.open("w") as test_output:

    for line in input_file:
        review_data = json.loads(line)

        rating = review_data["stars"]
        text = review_data["text"].replace("\n", " ")
        text = strip_formatting(text)

        # fastText expects one example per line: "__label__<class> <text>"
        fasttext_line = "__label__{} {}".format(rating, text)

        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")
```

## train and test the model

Train a classifier on the training file, measure its accuracy on the held-out test file, and then try it interactively. Passing `-` to `predict` makes fastText read from standard input, so type in a review formatted the same way as the training data (lower-cased, with spaces around punctuation):

```bash
/fasttext supervised -input fasttext_dataset_training.txt -output reviews_model
/fasttext test reviews_model.bin fasttext_dataset_test.txt
/fasttext predict reviews_model.bin -
this is a terrible restaurant . i hate it so much .
```

## extract facts with spaCy and textacy

Install spaCy, its large English model, and textacy:

```bash
conda install -c conda-forge spacy
python -m spacy download en_core_web_lg
pip install -U textacy
```

Parsing a document with spaCy runs the whole NLP pipeline in one step; textacy can then pull simple "X is Y"-style statements out of the parsed document:

```python
import spacy
import textacy

# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine
text = """London is the capital and most populous city of England and the
United Kingdom. Standing on the River Thames in the south east of the
island of Great Britain, London has been a major settlement for two
millennia. It was founded by the Romans, who named it Londinium.
"""

# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")

# Extract semi-structured statements about London
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")
for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")
```

## resolve coreferences with neuralcoref

Finally, install one of the pre-trained coreference models from neuralcoref and run it. The model adds extension attributes to the parsed document that tell you whether any coreferences were found and which mentions refer to the same thing:

```bash
pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz
```

```python
import en_coref_md

nlp = en_coref_md.load()
doc = nlp(u'My sister has a dog. She loves him.')

print(doc._.has_coref)
print(doc._.coref_clusters)
```
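To see what the resolver actually found, you can walk the clusters yourself. The sketch below assumes the cluster objects expose `main` (the most representative mention) and `mentions` (every span in the cluster), as documented in the neuralcoref README; if your installed model version differs, check its attributes first.

```python
import en_coref_md

nlp = en_coref_md.load()
doc = nlp(u'My sister has a dog. She loves him.')

if doc._.has_coref:
    for cluster in doc._.coref_clusters:
        # Assumed API: cluster.main is the head mention and
        # cluster.mentions lists every span that refers to it
        print(cluster.main, "->", [mention.text for mention in cluster.mentions])
```

For this sentence you would expect two clusters: one pairing "My sister" with "She", and one pairing "a dog" with "him".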