gaulinmp · July 8, 2024 07:29 · Jul 3, 2015 · Jul 2, 2015 · Jul 2, 2015
diff --git a/singularize.py b/singularize.py
@@ -64,7 +64,7 @@
     # -a
     (r'(?i)([ti])a$', '\\1um'),
     (r'(?i)(n)ews$', '\\1ews'),
-    (r'(?i)s$', ''),
+    (r'(?i)([^s])s$', '\\1'),  # don't make ss singularize to s.
 ]
 
 # For performance, compile the regular expressions only once:
@@ -80,8 +80,10 @@
     "christmas", "gallows", "mackerel", "series",
     "clippers", "georgia", "measles", "shears",
     "cod", "graffiti", "mews", "species",
-    "contretemps",              "mumps", "swine",
-    "corps",              "news", "swiss",
+    "contretemps", "mumps", "swine",
+    "corps", "news", "swiss",
+    # Custom added from MD&A corpus
+    "api", "mae", "sae", "basis", "india", "media",
 ))
 _singular_uncountable = set((
     "advice", "equipment", "happiness", "luggage", "news", "software",
@@ -101,12 +103,14 @@
     "cookie", "hippie", "meanie", "rookie", "valkyrie",
 ))
 _singular_irregular = {
+    "abuses": "abuse",
+    "ads": "ad",
     "atlantes": "atlas",
     "atlases": "atlas",
+    "analysis": "analysis",
     "axes": "axe",
     "beeves": "beef",
     "brethren": "brother",
-    "business": "business",
     "children": "child",
     "children": "child",
     "corpora": "corpus",
@@ -132,8 +136,8 @@
     "octopodes": "octopus",
     "opera": "opus",
     "opuses": "opus",
-            "our": "my",
-           "oxen": "ox",
+    "our": "my",
+    "oxen": "ox",
     "penes": "penis",
     "penises": "penis",
     "people": "person",
@@ -143,7 +147,7 @@
     "testes": "testis",
     "trilbys": "trilby",
     "turves": "turf",
-            "zoa": "zoon",
+    "zoa": "zoon",
 }
 
 _plural_prepositions = set((
@@ -154,12 +158,10 @@
     "among", "beside", "in", "out", "unto",
     "around", "besides", "into", "over", "upon",
     "at", "between", "near", "since", "with",
-    "athwart", "betwixt",
-               "beyond",
-               "but",
-               "by"))
-
+    "athwart", "betwixt", "beyond", "but", "by"
+))
 
+
 def singularize(word, custom={}):
     """Returns the singular of a given word."""
     if word in custom:
@@ -193,4 +195,4 @@ def singularize(word, custom={}):
                 if g[k] is None:
                     inflection = inflection.replace('\\' + str(k + 1), '')
             return suffix.sub(inflection, word)
-    return word
+    return word
diff --git a/singularize.py b/singularize.py
@@ -68,7 +68,7 @@
 ]
 
 # For performance, compile the regular expressions only once:
-_singular_rules = [(re.compile(r[0]), r[1]) for r in singular_rules]
+_singular_rules = [(re.compile(r[0]), r[1]) for r in _singular_rules]
 
 _singular_uninflected = set((
     "bison", "debris", "headquarters", "pincers", "trout",

diff --git a/singularize.py b/singularize.py
@@ -0,0 +1,196 @@
+#### SINGULARIZE #########################################################
+# Adapted from Bermi Ferrer's Inflector for Python:
+# http://www.bermi.org/inflector/
+
+# Copyright (c) 2006 Bermi Ferrer Martinez
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software to deal in this software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of this software, and to permit
+# persons to whom this software is furnished to do so, subject to the following
+# condition:
+#
+# THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THIS SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THIS SOFTWARE.
+
+_singular_rules = [  # (pattern, replacement); singularize() uses first match
+    (r'(?i)(.)ae$', '\\1a'),
+    (r'(?i)(.)itis$', '\\1itis'),
+    (r'(?i)(.)eaux$', '\\1eau'),
+    (r'(?i)(quiz)zes$', '\\1'),
+    (r'(?i)(matr)ices$', '\\1ix'),
+    (r'(?i)(ap|vert|ind)ices$', '\\1ex'),
+    (r'(?i)^(ox)en$', '\\1'),
+    (r'(?i)(alias|status)es$', '\\1'),
+    (r'(?i)([octop|vir])i$',  '\\1us'),  # NOTE(review): char class; also hits cacti/foci/radii
+    (r'(?i)(cris|ax|test)es$', '\\1is'),
+    (r'(?i)(shoe)s$', '\\1'),
+    (r'(?i)(o)es$', '\\1'),
+    (r'(?i)(bus)es$', '\\1'),
+    (r'(?i)([ml])ice$', '\\1ouse'),
+    (r'(?i)(x|ch|ss|sh)es$', '\\1'),
+    (r'(?i)(m)ovies$', '\\1ovie'),
+    (r'(?i)(.)ombies$', '\\1ombie'),
+    (r'(?i)(s)eries$', '\\1eries'),
+    (r'(?i)([^aeiouy]|qu)ies$', '\\1y'),
+    # -f, -fe sometimes take -ves in the plural
+    # (e.g., lives, wolves).
+    (r"([aeo]l)ves$", "\\1f"),
+    (r"([^d]ea)ves$", "\\1f"),
+    (r"arves$", "arf"),
+    (r"erves$", "erve"),
+    (r"([nlw]i)ves$", "\\1fe"),
+    (r'(?i)([lr])ves$', '\\1f'),
+    (r"([aeo])ves$", "\\1ve"),
+    (r'(?i)(sive)s$', '\\1'),
+    (r'(?i)(tive)s$', '\\1'),
+    (r'(?i)(hive)s$', '\\1'),
+    (r'(?i)([^f])ves$', '\\1fe'),
+    # -ses suffixes.
+    (r'(?i)(^analy)ses$', '\\1sis'),
+    (r'(?i)((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$',
+     '\\1sis'),  # \1 already holds the whole stem; \2 would repeat its "a"
+    (r'(?i)(.)opses$', '\\1opsis'),
+    (r'(?i)(.)yses$', '\\1ysis'),
+    (r'(?i)(h|d|r|o|n|b|cl|p)oses$', '\\1ose'),
+    (r'(?i)(fruct|gluc|galact|lact|ket|malt|rib|sacchar|cellul)ose$',
+     '\\1ose'),
+    (r'(?i)(.)oses$', '\\1osis'),
+    # -a
+    (r'(?i)([ti])a$', '\\1um'),
+    (r'(?i)(n)ews$', '\\1ews'),
+    (r'(?i)s$', ''),
+]
+
+# For performance, compile the regular expressions only once:
+_singular_rules = [(re.compile(r[0]), r[1]) for r in singular_rules]
+
+_singular_uninflected = set((
+    "bison", "debris", "headquarters", "pincers", "trout",
+    "bream", "diabetes", "herpes", "pliers", "tuna",
+    "breeches", "djinn", "high-jinks", "proceedings", "whiting",
+    "britches", "eland", "homework", "rabies", "wildebeest",  # no comma => "wildebeestcarp"
+    "carp", "elk", "innings", "salmon",
+    "chassis", "flounder", "jackanapes", "scissors",
+    "christmas", "gallows", "mackerel", "series",
+    "clippers", "georgia", "measles", "shears",
+    "cod", "graffiti", "mews", "species",
+    "contretemps",              "mumps", "swine",
+    "corps",              "news", "swiss",
+))
+_singular_uncountable = set((
+    "advice", "equipment", "happiness", "luggage", "news", "software",
+    "bread", "fruit", "information", "mathematics", "progress", "understanding",
+    "butter", "furniture", "ketchup", "mayonnaise", "research", "water",  # no comma => "watercheese"
+    "cheese", "garbage", "knowledge", "meat", "rice",
+    "electricity", "gravel", "love", "mustard", "sand",
+))
+_singular_ie = set((  # NOTE(review): "^" is literal to endswith(), so "^pie"/"^tie" look dead
+    "alergie", "cutie", "hoagie", "newbie", "softie", "veggie",
+    "auntie", "doggie", "hottie", "nightie", "sortie", "weenie",
+    "beanie", "eyrie", "indie", "oldie", "stoolie", "yuppie",
+    "birdie", "freebie", "junkie", "^pie", "sweetie", "zombie",  # no comma => "zombiebogie"
+    "bogie", "goonie", "laddie", "pixie", "techie",
+    "bombie", "groupie", "laramie", "quickie", "^tie",
+    "collie", "hankie", "lingerie", "reverie", "toughie",
+    "cookie", "hippie", "meanie", "rookie", "valkyrie",
+))
+_singular_irregular = {
+    "atlantes": "atlas",
+    "atlases": "atlas",
+    "axes": "axe",
+    "beeves": "beef",
+    "brethren": "brother",
+    "business": "business",
+    "children": "child",
+    "children": "child",
+    "corpora": "corpus",
+    "corpuses": "corpus",  # keys are matched as word endings by singularize()
+    "ephemerides": "ephemeris",
+    "feet": "foot",
+    "ganglia": "ganglion",
+    "geese": "goose",
+    "genera": "genus",
+    "genii": "genie",
+    "graffiti": "graffito",
+    "helves": "helve",
+    "kine": "cow",
+    "leaves": "leaf",
+    "loaves": "loaf",
+    "men": "man",  # ending match: "women" => "woman", but also "omen" => "oman"
+    "mongooses": "mongoose",
+    "monies": "money",
+    "moves": "move",
+    "mythoi": "mythos",
+    "numena": "numen",
+    "occipita": "occiput",
+    "octopodes": "octopus",
+    "opera": "opus",
+    "opuses": "opus",
+            "our": "my",
+           "oxen": "ox",
+    "penes": "penis",
+    "penises": "penis",
+    "people": "person",
+    "sexes": "sex",
+    "soliloquies": "soliloquy",
+    "teeth": "tooth",
+    "testes": "testis",
+    "trilbys": "trilby",
+    "turves": "turf",
+            "zoa": "zoon",
+}
+
+_plural_prepositions = set((
+    "about", "before", "during", "of", "till",  # 2nd part of a hyphenated word is looked up here
+    "above", "behind", "except", "off", "to",
+    "across", "below", "for", "on", "under",
+    "after", "beneath", "from", "onto", "until",
+    "among", "beside", "in", "out", "unto",
+    "around", "besides", "into", "over", "upon",
+    "at", "between", "near", "since", "with",
+    "athwart", "betwixt",
+               "beyond",
+               "but",
+               "by"))
+
+
+def singularize(word, custom={}):
+    """Returns the singular of a given word."""
+    if word in custom:
+        return custom[word]
+    # Compound word whose 2nd part is a preposition: inflect only the head
+    # (mothers-in-law => mother-in-law).
+    if "-" in word:
+        parts = word.split("-")
+        if parts[1] in _plural_prepositions:
+            return singularize(parts[0], custom) + "-" + "-".join(parts[1:])
+    # Possessive plural: dogs' => dog's
+    if word.endswith("'"):
+        return singularize(word[:-1], custom) + "'s"
+    w = word.lower()
+    # NOTE(review): x.endswith(w) also matches any tail of a listed word
+    # ("ears" in "shears"); "is"/"as" rely on that today -- confirm intent.
+    for x in _singular_uninflected | _singular_uncountable:
+        if x.endswith(w):
+            return word
+    for x in _singular_ie:
+        if w.endswith(x + "s"):
+            return word[:-1]  # strip only the final "s"; keeps case/prefix
+    for x in _singular_irregular:
+        if w.endswith(x):
+            return re.sub('(?i)' + x + '$', _singular_irregular[x], word)
+    for suffix, inflection in _singular_rules:
+        m = suffix.search(word)
+        if m:
+            g = m.groups()
+            for k in range(len(g)):  # drop backrefs to groups that did not match
+                if g[k] is None:
+                    inflection = inflection.replace('\\' + str(k + 1), '')
+            return suffix.sub(inflection, word)
+    return word
No results found