alanbernstein · November 11, 2021 04:20 · Nov 11, 2021 · Nov 11, 2021
diff --git a/gistfile1.txt → phylo.py b/gistfile1.txt → phylo.py
diff --git a/gistfile1.txt b/gistfile1.txt
@@ -0,0 +1,281 @@
+#!/usr/local/bin/python
+import sys
+import wikipedia
+import requests
+import re
+from pprint import pprint
+
+# TODO: cache results and use cache to build up a tree
+# TODO: handle disambiguation page (e.g., for 'orange')
+
+# alternatively, could use ITIS (http://www.itis.gov/index.html), but that
+# requires a lot more work to decide which search result to use
+#
+# https://en.wikipedia.org/wiki/Horse
+# https://en.wikipedia.org/wiki/Template:Taxobox#Classification
+# https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=kidney%20bean&rvsection=0&redirects
+
+ranks = ['regnum', 'phylum', 'classis', 'ordo', 'subordo', 'familia', 'genus',
+         'species']
+
+ranks_english = {'regnum': 'kingdom',
+                 'phylum': 'phylum',
+                 'classis': 'class',
+                 'ordo': 'order',
+                 'subordo': 'suborder',
+                 'familia': 'family',
+                 'genus': 'genus',
+                 'species': 'species'}
+
+
+def main():
+
+    if len(sys.argv) > 2:
+        searchterm1 = sys.argv[1]
+        searchterm2 = sys.argv[2]
+    else:
+        searchterm1 = 'dolphin'
+        searchterm2 = 'blue whale'
+    print(searchterm1, searchterm2)
+
+    tax1 = {'common name': searchterm1}
+    tax2 = {'common name': searchterm2}
+
+    taxobox1 = get_taxobox_from_search_term(searchterm1)
+    if taxobox1:
+        tax1 = get_taxonomy_from_taxobox(taxobox1, tax1)
+    else:
+        print('unable to retrieve info for %s' % tax1['common name'])
+
+    taxobox2 = get_taxobox_from_search_term(searchterm2)
+
+    if taxobox2:
+        tax2 = get_taxonomy_from_taxobox(taxobox2, tax2)
+    else:
+        print('unable to retrieve info for %s' % tax2['common name'])
+
+    if taxobox1 and taxobox2:
+        print_taxonomy([tax1, tax2])
+        get_lowest_common_node(tax1, tax2)
+
+
+def get_taxonomy(search_term):
+    tax = {'common name': search_term}
+    taxobox = get_taxobox_from_search_term(search_term)
+    if taxobox:
+        tax = get_taxonomy_from_taxobox(taxobox, tax)
+    else:
+        print('unable to retrieve info for %s' % tax['common name'])
+
+    # needs to return taxobox and tax - seems like rewrite
+
+
+def get_taxobox_from_search_term(term, d=0):
+    # idea: given a search term, try to get the "taxobox" infobox about the
+    # living thing described by that term. several possibilities:
+    # - page matching search term is the desired page, and has a taxobox
+    #   - done, extract taxobox contents and pass on
+    # - page matching search term has no taxobox
+    #   - search for taxonomy link, repeat (link preceded by 'species', 'genus', ...
+    #     https://en.wikipedia.org/wiki/Salmon
+    #     https://en.wikipedia.org/wiki/Hazelnut
+    # - page is a redirect
+    #     https://en.wikipedia.org/wiki/Phaseolus_lunatus (lima bean)
+    # - page contains taxobox, but taxobox is a template itself
+    #     https://en.wikipedia.org/wiki/Spider - https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae#
+    #
+    # todo: record which of these cases occur for which terms
+    query = 'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&titles=' + term + '&rvsection=0&redirects'
+
+    resp = requests.get(query)
+    content = resp.text
+    taxobox = None
+    print(content)
+
+    if '#REDIRECT' in content:
+        s = 'is redirect'
+        # handled by additional parameter in request string
+
+    elif 'automatic taxobox' in content.lower():
+        s = 'contains taxobox template'
+        # find the 'taxon' entry in the taxobox, go to it
+        # example: spiders is 'taxon = Araneae' amd uses this:
+        # https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae
+
+        # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=spider&rvsection=0&rvexpandtemplates
+
+    elif 'speciesbox' in content.lower():
+        s = 'contains speciesbox template'
+        # find the 'taxon' entry in the taxobox, go to it
+        # example: garlic is 'taxon = Allium sativum', and uses this:
+        # https://en.wikipedia.org/wiki/Template:Taxonomy/Allium
+
+        # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=spider&rvsection=0&rvexpandtemplates
+
+    elif 'taxobox' in content.lower():
+        s = 'contains usable taxobox'
+        taxobox = content
+    else:
+        s = 'contains no taxobox... '
+
+        if d == 0:
+            # check pages of all links for taxoboxes
+            # first links preceeded by the word 'species', then 'genus',
+            # then 'family', then all other links in the intro section
+
+            # get a list of links along with the preceeding words
+            p = r'[^ ]*\[\[([^]]*)\]\]'
+            mat = re.finditer(p, content)
+            links = []
+            for m in mat:
+                spind = content.rfind(' ', 0, m.start(0) - 1)
+                prevword = content[spind + 1:m.start(0) - 1]
+                links.append([prevword, m.group(1)])
+#            pprint(links)
+
+# check the species/genus/family links
+            for prev_word in ['species', 'genus', 'family']:
+                links2 = [n for n in links if prev_word in n[0]]
+
+                if not taxobox:
+                    for l in links2:
+                        tb = get_taxobox_from_search_term(l[1], d + 1)
+                        if tb:
+                            taxobox = tb
+                            s = 'retrieved info from ' + prev_word + ' link "' + l[
+                                1] + '"'
+                            break
+
+            # check all other links
+            if not taxobox:
+                for l in links:
+                    tb = get_taxobox_from_search_term(l[1], d + 1)
+                    if tb:
+                        taxobox = tb
+                        s = 'retrieved info from link "' + l[1] + '"'
+                        break
+
+        else:
+            s = 'no taxobox in any first-level links'
+
+    if d == 0:
+        print(' ' * d + term + ': ' + s)
+
+    return taxobox
+
+
+def get_taxonomy_from_expanded_taxobox(taxobox, taxonomy):
+    # for
+    pass
+
+
+def get_taxonomy_from_taxobox(taxobox, taxonomy):
+    # dolphin:
+    # | regnum = [[Animalia]]\n| phylum = [[Chordata]]\n| classis = [[Mammalia]]\n| ordo = [[Cetacea]]\n| subordo = [[Odontoceti]]\n| familia = *[[Delphinidae]]\n*[[Iniidae]]\n*\u2020[[Lipotidae]]\n*[[Platanistidae]]\n*[[Pontoporiidae]]\n|
+
+    # blue whale:
+    # regnum=[[Animal]]ia\n | phylum=[[Chordata]]\n | classis=[[Mammal]]ia\n | ordo = [[Cetartiodactyla]]{{bunch of reference text}}\n| unranked_subordo = [[Cetacea]]\n| unranked_superfamilia = [[Mysticeti]]\n | familia=[[Balaenopteridae]]\n | genus=''[[Balaenoptera]]''\n | species='''''B. musculus '''''\n |
+    #
+    #
+    # todo:
+    # - rank parsing issues:
+    #   - use link name, not link url
+    #   x remove citations
+    #   x deal with non-link ranks and links with different names
+    #   x deal with missing ranks
+    #   - deal with sub, super, infra, ultra, unranked
+    #   - deal with multiple values
+    #     x just use first
+    #     - get a list
+
+    PRINT = 0
+    if PRINT:
+        print('')
+        print(taxonomy['common name'])
+        print(taxobox)
+
+    # TODO:
+    # preprocess links:
+    # [[xyz|qwe abc]] -> [[qwe abc]]
+    # [[qwe xyz]]abc -> [[qwe xyzabc]]
+    #
+    # then:
+    # [[qwe xyz]] -> 'qwe xyz'
+
+    for r in ranks:
+
+        # extract 'row' of taxobox with this rank in it
+        p = r + r'[ ]*=[^|]*\|'
+        x = re.search(p, taxobox)
+        if x is None:
+            continue
+        row = x.group(0)
+        row2 = re.sub('{.*', '', row)  # remove {{}} entities
+
+        # extract based on link - fails if entry not a link
+
+        p = r'\[\[[^]]*\]\]'
+        rnames1 = re.findall(p, row2)
+        rnames1 = [s[2:-2] for s in rnames1]
+
+        # extract based on other stuff -
+        row3 = row2.replace('[[', '')
+        row4 = row3.replace(']]', '')
+        row5 = row4.replace('*', '')
+        row6 = row5.replace("'", '')
+        p = r'=[ ]*[a-zA-Z .]*'
+        x = re.search(p, row6)
+        rname2 = x.group(0)
+        rname2 = rname2.replace('=', '').replace('\\n', '').strip()
+
+        taxonomy[r] = rname2
+        if PRINT:
+            print(r, row, row6, rnames1, rname2)
+
+    if len(taxonomy) < 2:
+        print('no data found for %s' % taxonomy['common name'])
+    return taxonomy
+
+
+def print_taxonomy(taxa):
+
+    s1 = '%10s  ' % ''
+    s2 = ''
+    for t in taxa:
+        s2 = s2 + '%15s' % t['common name']
+    if len(s2.strip()) > 0:
+        print(s1 + s2)
+
+    for r in ranks:
+        s1 = '%10s: ' % ranks_english[r]
+        s2 = ''
+        for t in taxa:
+            if r in t.keys():
+                s2 = s2 + '%15s' % t[r]
+            else:
+                s2 = s2 + '%15s' % ' '
+
+        if len(s2.strip()) > 0:
+            print(s1 + s2)
+
+
+def get_lowest_common_node(a, b):
+
+    for r in reversed(ranks):
+        if r in a.keys() and r in b.keys() and a[r] == b[r]:
+            print('"%s" and "%s" share the same %s (%s)' %
+                  (a['common name'], b['common name'], ranks_english[r], a[r]))
+            break
+
+
+def extract_ranks(text):
+    # idea: start with a known list, including
+    #   regnum, phylum, ..., genus, species
+    # - split the text into lines
+    # - look for lines containing these entries
+    # - look at all intermediate lines
+    pass
+
+
+# run the comparison only when this file is executed as a script,
+# not when it is imported
+if __name__ == '__main__':
+    main()
No results found