NLP using spaCy and other modules
## install
# apt-get update && apt-get install -y \
#     build-essential \
#     wget \
#     git \
#     python-dev \
#     unzip \
#     python-numpy \
#     python-scipy \
#     && rm -rf /var/lib/apt/lists/*

# git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
#     rm -rf /tmp/fastText/.git* && \
#     mv /tmp/fastText/* / && \
#     cd / && \
#     make

# wget https://s3.amazonaws.com/datameetgeo/datameetgeo/yelp/yelp_dataset.tar
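# (Assumed extraction step, not in the original notes: unpack the archive so that
#  review.json ends up under dataset/, e.g. `tar -xvf yelp_dataset.tar -C dataset`;
#  adjust paths/filenames to match what the archive actually contains.)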
import json
from pathlib import Path
import re
import random

reviews_data = Path("dataset") / "review.json"
training_data = Path("fasttext_dataset_training.txt")
test_data = Path("fasttext_dataset_test.txt")

# What percent of the data to hold out as test data
percent_test_data = 0.10

def strip_formatting(string):
    # Lowercase the text and put spaces around punctuation so fastText
    # sees punctuation marks as separate tokens
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string
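# Illustrative example of what strip_formatting does:
#   strip_formatting("Great food!")  ->  "great food ! "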
with reviews_data.open() as input_file, \
     training_data.open("w") as train_output, \
     test_data.open("w") as test_output:

    for line in input_file:
        review_data = json.loads(line)

        rating = review_data['stars']
        text = review_data['text'].replace("\n", " ")
        text = strip_formatting(text)

        fasttext_line = "__label__{} {}".format(rating, text)

        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")
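# Each line written above has the fastText label format, e.g. (illustrative,
# not actual dataset output):
#   __label__5 i love this place .  the food was amazing !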
# Train, evaluate, and interactively test the model with the fastText
# command-line tool (the binary was built at / during the install step above):
/fasttext supervised -input fasttext_dataset_training.txt -output reviews_model
/fasttext test reviews_model.bin fasttext_dataset_test.txt
/fasttext predict reviews_model.bin -
# Example review typed into predict's stdin:
this is a terrible restaurant . i hate it so much .
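# The same train/test/predict workflow can also be driven from Python with the
# official fastText bindings. A rough sketch (assumes `pip install fasttext` and
# the dataset files generated above; not part of the original notes):
import fasttext

model = fasttext.train_supervised(input="fasttext_dataset_training.txt")

# test() returns (number of examples, precision@1, recall@1)
print(model.test("fasttext_dataset_test.txt"))

# predict() returns the predicted label(s) and their probabilities
print(model.predict("this is a terrible restaurant . i hate it so much ."))

model.save_model("reviews_model.bin")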
# conda install -c conda-forge spacy
# python -m spacy download en_core_web_lg
# pip install -U textacy

import spacy
import textacy

# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine
text = """London is the capital and most populous city of England and
the United Kingdom. Standing on the River Thames in the south east
of the island of Great Britain, London has been a major settlement
for two millennia. It was founded by the Romans, who named it Londinium.
"""

# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")
# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")
# pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz

import en_coref_md

nlp = en_coref_md.load()
doc = nlp(u'My sister has a dog. She loves him.')

doc._.has_coref        # True if any coreference was resolved in the doc
doc._.coref_clusters   # the clusters of co-referring mentions
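# For this sentence the model is expected to cluster "She" with "My sister" and
# "him" with "a dog" (the standard neuralcoref example; exact spans can vary by
# model version).
# (Assumption: the en_coref models also expose doc._.coref_resolved, which
#  returns the text with each mention replaced by the main mention of its cluster.)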