NLP using spaCy and other modules
## install
# apt-get update && apt-get install -y \
#     build-essential \
#     wget \
#     git \
#     python-dev \
#     unzip \
#     python-numpy \
#     python-scipy \
#     && rm -rf /var/lib/apt/lists/*

# git clone https://github.com/facebookresearch/fastText.git /tmp/fastText && \
#     rm -rf /tmp/fastText/.git* && \
#     mv /tmp/fastText/* / && \
#     cd / && \
#     make

# wget https://s3.amazonaws.com/datameetgeo/datameetgeo/yelp/yelp_dataset.tar
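# (Assumed extraction step, not in the original notes: unpack the archive so that
#  review.json ends up under dataset/, e.g. `tar -xvf yelp_dataset.tar -C dataset`;
#  adjust paths/filenames to match what the archive actually contains.)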
import json
from pathlib import Path
import re
import random

reviews_data = Path("dataset") / "review.json"
training_data = Path("fasttext_dataset_training.txt")
test_data = Path("fasttext_dataset_test.txt")

# What percent of the data to hold out as test data
percent_test_data = 0.10

def strip_formatting(string):
    # Lowercase the text and put spaces around punctuation so fastText
    # sees punctuation marks as separate tokens
    string = string.lower()
    string = re.sub(r"([.!?,'/()])", r" \1 ", string)
    return string
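# Illustrative example of what strip_formatting does:
#   strip_formatting("Great food!")  ->  "great food ! "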
with reviews_data.open() as input_file, \
     training_data.open("w") as train_output, \
     test_data.open("w") as test_output:

    for line in input_file:
        review_data = json.loads(line)

        rating = review_data['stars']
        text = review_data['text'].replace("\n", " ")
        text = strip_formatting(text)

        fasttext_line = "__label__{} {}".format(rating, text)

        if random.random() <= percent_test_data:
            test_output.write(fasttext_line + "\n")
        else:
            train_output.write(fasttext_line + "\n")
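# Each line written above has the fastText label format, e.g. (illustrative,
# not actual dataset output):
#   __label__5 i love this place .  the food was amazing !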
# Train, evaluate, and interactively test the model with the fastText
# command-line tool (the binary was built at / during the install step above):
/fasttext supervised -input fasttext_dataset_training.txt -output reviews_model
/fasttext test reviews_model.bin fasttext_dataset_test.txt
/fasttext predict reviews_model.bin -
# Example review typed into predict's stdin:
this is a terrible restaurant . i hate it so much .
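# The same train/test/predict workflow can also be driven from Python with the
# official fastText bindings. A rough sketch (assumes `pip install fasttext` and
# the dataset files generated above; not part of the original notes):
import fasttext

model = fasttext.train_supervised(input="fasttext_dataset_training.txt")

# test() returns (number of examples, precision@1, recall@1)
print(model.test("fasttext_dataset_test.txt"))

# predict() returns the predicted label(s) and their probabilities
print(model.predict("this is a terrible restaurant . i hate it so much ."))

model.save_model("reviews_model.bin")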
# conda install -c conda-forge spacy
# python -m spacy download en_core_web_lg
# pip install -U textacy

import spacy
import textacy

# Load the large English NLP model
nlp = spacy.load('en_core_web_lg')

# The text we want to examine
text = """London is the capital and most populous city of England and
the United Kingdom. Standing on the River Thames in the south east
of the island of Great Britain, London has been a major settlement
for two millennia. It was founded by the Romans, who named it Londinium.
"""

# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this will print out all the named entities that were detected:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")
# Extract semi-structured statements
statements = textacy.extract.semistructured_statements(doc, "London")

# Print the results
print("Here are the things I know about London:")

for statement in statements:
    subject, verb, fact = statement
    print(f" - {fact}")
# pip install https://github.com/huggingface/neuralcoref-models/releases/download/en_coref_md-3.0.0/en_coref_md-3.0.0.tar.gz

import en_coref_md

nlp = en_coref_md.load()
doc = nlp(u'My sister has a dog. She loves him.')

doc._.has_coref        # True if any coreference was resolved in the doc
doc._.coref_clusters   # the clusters of co-referring mentions
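# For this sentence the model is expected to cluster "She" with "My sister" and
# "him" with "a dog" (the standard neuralcoref example; exact spans can vary by
# model version).
# (Assumption: the en_coref models also expose doc._.coref_resolved, which
#  returns the text with each mention replaced by the main mention of its cluster.)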