@ines
Last active July 23, 2023 12:25
Revisions

  1. ines revised this gist Jan 19, 2019. 1 changed file with 3 additions and 3 deletions.
  2. ines created this gist Jan 19, 2019.

    those_generously_buttered_noodles.py:

    # Using spaCy & NLP to create variations of "those generously buttered noodles"
    # See here: https://twitter.com/ArielDumas/status/1086294656957272065
    #
    # Disclaimer 1: This is a quick, simplified example focusing on one particular
    # sentence. There are obviously many other constructions and types of
    # dependencies you'd want to cover. Some aspects also become
    # significantly more difficult if you're working with, say, German instead of
    # English.
    #
    # Disclaimer 2: Creating spam comments is a very bad use case for NLP and
    # something I / we do not endorse. The point of this little script is to show
    # how to use spaCy to analyse syntax and other fun things you can do with the
    # dependency parse. So feel free to use this in your projects, but please don't
    # build spam bots ;)

    # Prerequisites:
    # pip install spacy
    # python -m spacy download en_core_web_sm
    import spacy

    # To process the text, we're using the small English model, which was trained
    # on a corpus of general-purpose news and web text. See here for details:
    # https://spacy.io/models/en#en_core_web_sm
    nlp = spacy.load("en_core_web_sm")

    # Here's our original text that we want to rephrase.
    text = "These generously buttered noodles, sprinkled with just a quarter cup of parsley for color and freshness, are the perfect blank canvas for practically any stew or braise."

    # This is the template we want to fill based on the original text. The subject
    # of the original sentence becomes an object attached to "love".
    template = "Couldn't agree more, but I would add that I sincerely love {subject}, because {pronoun} {verb} {obj}."

    # Calling the nlp object on a string of text returns a processed Doc object,
    # which gives us access to the individual tokens (words, punctuation) and their
    # linguistic annotations (part-of-speech tags, dependency labels) predicted
    # by the statistical model. See here for the visualization:
    # https://explosion.ai/demos/displacy?text=These%20generously%20buttered%20noodles%2C%20sprinkled%20with%20just%20a%20quarter%20cup%20of%20parsley%20for%20color%20and%20freshness%2C%20are%20the%20perfect%20blank%20canvas%20for%20practically%20any%20stew%20or%20braise&model=en_core_web_sm&cpu=0&cph=0
    doc = nlp(text)
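
    # For a quick sanity check of what the parser predicted, you could print
    # each token with its dependency label and syntactic head (uncomment to try):
    # for token in doc:
    #     print(token.text, token.dep_, token.head.text)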


    def get_root(doc):
        # Based on a processed document, we want to find the syntactic root of the
        # sentence. For this example, that should be the verb "are".
        for token in doc:
            if token.dep_ == "ROOT":
                return token
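
    # As an aside: since our text is a single sentence, spaCy also exposes the
    # root directly via the sentence span, so an equivalent shortcut would be:
    # root = next(doc.sents).root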


    def get_subject(root):
        # If we know the root of the sentence, we can use it to find its subject.
        # Here, we're checking the root's children for a token with the dependency
        # label 'nsubj' (nominal subject).
        for token in root.children:
            if token.dep_ == "nsubj":
                return token
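
    # If you wanted to generalise this to passive sentences, a sketch of the
    # check above might also accept the "nsubjpass" (passive nominal subject)
    # label:
    # if token.dep_ in ("nsubj", "nsubjpass"):
    #     return token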


    def get_object(root):
        # We also need to look for the object attached to the root. In this case,
        # the dependency parser predicted the object we're looking for ("canvas")
        # as "attr" (attribute), so we're using that. There are various other
        # options, though, so if you want to generalise this script, you'd probably
        # want to check for those as well.
        for token in root.children:
            if token.dep_ == "attr":
                return token
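
    # One of those "various other options": a slightly more general sketch
    # could also accept direct objects, e.g.:
    # for token in root.children:
    #     if token.dep_ in ("attr", "dobj"):
    #         return token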


    def get_pronoun(token):
        # Based on the subject token, we need to decide which pronoun to use.
        # For example, "noodle" would require "it", whereas "noodles" needs "they".
        # You might also just want to skip singular nouns altogether and focus
        # on the plurals only, which are much simpler to deal with in English.

        # spaCy currently can't do this out-of-the-box, but there are other
        # rule-based or statistical systems that can do this pretty well. For
        # simplicity, we just mock this up here and always use "they".
        return "they"


    def get_subtree(token):
        # Here, we are getting the subtree of a token – for example, if we know
        # that "noodles" is the subject, we can resolve it to the full phrase
        # "These generously buttered noodles, sprinkled with just a quarter cup of
        # parsley for color and freshness".

        # spaCy preserves the whitespace following a token in the `text_with_ws`
        # attribute. This means you'll always be able to restore the original text.
        # For example: "Hello world!" (good) vs. "Hello world !" (bad).
        subtree = [t.text_with_ws for t in token.subtree]
        subtree = "".join(subtree)

        # Since our template will place the subject and object in the middle of a
        # sentence, we also want to make sure that the first token starts with a
        # lowercase letter – otherwise we'll end up with things like "love These".
        subtree = subtree[0].lower() + subtree[1:]
        return subtree
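
    # A roughly equivalent way to get the subtree text (minus any trailing
    # whitespace) is to slice a Span out of the document using the token's
    # left and right edges:
    # span = token.doc[token.left_edge.i : token.right_edge.i + 1]
    # subtree = span.text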


    # Let's put this all together!
    root = get_root(doc)
    print("Root:", root)

    subject = get_subject(root)
    print("Subject:", subject)

    subject_pronoun = get_pronoun(subject)
    print("Subject pronoun:", subject_pronoun)

    obj = get_object(root)
    print("Object:", obj)

    subject_subtree = get_subtree(subject)
    print("Subject subtree:", subject_subtree)

    object_subtree = get_subtree(obj)
    print("Object subtree:", object_subtree)

    print("Result:")
    print(
    print(
        template.format(
            subject=subject_subtree,
            pronoun=subject_pronoun,
            verb=root.text,
            obj=object_subtree,
        )
    )
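
    # One caveat: the subtrees can include punctuation from the original
    # sentence (such as the comma after "freshness"), so depending on the
    # parse you may want to strip trailing punctuation and whitespace before
    # filling the template, e.g.:
    # subject_subtree = subject_subtree.rstrip(", ")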