#!/usr/local/bin/python
import sys
import requests
import re
from pprint import pprint

# TODO: cache results and use the cache to build up a tree
#       (see the _cached_intro_wikitext sketch below)
# TODO: handle disambiguation pages (e.g., for 'orange')

# alternatively, could use http://www.itis.gov/index.html, but that requires
# a lot more work to decide which search result to use
#
# https://en.wikipedia.org/wiki/Horse
# https://en.wikipedia.org/wiki/Template:Taxobox#Classification
# https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=kidney%20bean&rvsection=0&redirects

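# A minimal sketch for the caching TODO above, assuming a flat JSON file on
# disk is good enough; it is not wired into the functions below, and the name
# _cached_intro_wikitext and the cache filename are placeholders. Building the
# query with a params dict also lets requests URL-encode multi-word titles
# like 'kidney bean' automatically.
def _cached_intro_wikitext(title, cache_path='taxobox_cache.json'):
    import json
    import os
    cache = {}
    if os.path.exists(cache_path):
        with open(cache_path) as f:
            cache = json.load(f)
    if title not in cache:
        resp = requests.get('https://en.wikipedia.org/w/api.php', params={
            'action': 'query', 'prop': 'revisions', 'rvprop': 'content',
            'format': 'json', 'titles': title, 'rvsection': 0, 'redirects': '',
        })
        cache[title] = resp.text
        with open(cache_path, 'w') as f:
            json.dump(cache, f)
    return cache[title]
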
ranks = ['regnum', 'phylum', 'classis', 'ordo', 'subordo', 'familia', 'genus',
         'species']

ranks_english = {'regnum': 'kingdom',
                 'phylum': 'phylum',
                 'classis': 'class',
                 'ordo': 'order',
                 'subordo': 'suborder',
                 'familia': 'family',
                 'genus': 'genus',
                 'species': 'species'}


def main():

    if len(sys.argv) > 2:
        searchterm1 = sys.argv[1]
        searchterm2 = sys.argv[2]
    else:
        searchterm1 = 'dolphin'
        searchterm2 = 'blue whale'
    print(searchterm1, searchterm2)

    tax1 = {'common name': searchterm1}
    tax2 = {'common name': searchterm2}

    taxobox1 = get_taxobox_from_search_term(searchterm1)
    if taxobox1:
        tax1 = get_taxonomy_from_taxobox(taxobox1, tax1)
    else:
        print('unable to retrieve info for %s' % tax1['common name'])

    taxobox2 = get_taxobox_from_search_term(searchterm2)

    if taxobox2:
        tax2 = get_taxonomy_from_taxobox(taxobox2, tax2)
    else:
        print('unable to retrieve info for %s' % tax2['common name'])

    if taxobox1 and taxobox2:
        print_taxonomy([tax1, tax2])
        get_lowest_common_node(tax1, tax2)


def get_taxonomy(search_term):
    tax = {'common name': search_term}
    taxobox = get_taxobox_from_search_term(search_term)
    if taxobox:
        tax = get_taxonomy_from_taxobox(taxobox, tax)
    else:
        print('unable to retrieve info for %s' % tax['common name'])

    # return both so callers can tell whether the lookup succeeded
    return taxobox, tax


def get_taxobox_from_search_term(term, d=0):
    # idea: given a search term, try to get the "taxobox" infobox about the
    # living thing described by that term. several possibilities:
    # - page matching search term is the desired page, and has a taxobox
    #   - done, extract taxobox contents and pass on
    # - page matching search term has no taxobox
    #   - search for a taxonomy link, repeat (link preceded by 'species', 'genus', ...)
    #     https://en.wikipedia.org/wiki/Salmon
    #     https://en.wikipedia.org/wiki/Hazelnut
    # - page is a redirect
    #   https://en.wikipedia.org/wiki/Phaseolus_lunatus (lima bean)
    # - page contains taxobox, but taxobox is a template itself
    #   https://en.wikipedia.org/wiki/Spider - https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae#
    #
    # todo: record which of these cases occur for which terms
    query = 'https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&titles=' + term + '&rvsection=0&redirects'

    resp = requests.get(query)
    content = resp.text
    taxobox = None
    print(content)  # debug: dump the raw API response

    if '#REDIRECT' in content:
        s = 'is redirect'
        # handled by additional parameter in request string

    elif 'automatic taxobox' in content.lower():
        s = 'contains taxobox template'
        # find the 'taxon' entry in the taxobox, go to it
        # (see the follow_taxonomy_template sketch after this function)
        # example: spiders is 'taxon = Araneae' and uses this:
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae

        # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=spider&rvsection=0&rvexpandtemplates

    elif 'speciesbox' in content.lower():
        s = 'contains speciesbox template'
        # find the 'taxon' entry in the taxobox, go to it
        # example: garlic is 'taxon = Allium sativum', and uses this:
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Allium

        # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=spider&rvsection=0&rvexpandtemplates

    elif 'taxobox' in content.lower():
        s = 'contains usable taxobox'
        taxobox = content
    else:
        s = 'contains no taxobox... '

    if d == 0:
        # check pages of all links for taxoboxes:
        # first links preceded by the word 'species', then 'genus',
        # then 'family', then all other links in the intro section

        # get a list of links along with the preceding words
        p = r'[^ ]*\[\[([^]]*)\]\]'
        mat = re.finditer(p, content)
        links = []
        for m in mat:
            spind = content.rfind(' ', 0, m.start(0) - 1)
            prevword = content[spind + 1:m.start(0) - 1]
            links.append([prevword, m.group(1)])
        # pprint(links)

        # check the species/genus/family links
        for prev_word in ['species', 'genus', 'family']:
            links2 = [n for n in links if prev_word in n[0]]

            if not taxobox:
                for l in links2:
                    tb = get_taxobox_from_search_term(l[1], d + 1)
                    if tb:
                        taxobox = tb
                        s = 'retrieved info from ' + prev_word + ' link "' + l[1] + '"'
                        break

        # check all other links
        if not taxobox:
            for l in links:
                tb = get_taxobox_from_search_term(l[1], d + 1)
                if tb:
                    taxobox = tb
                    s = 'retrieved info from link "' + l[1] + '"'
                    break
            else:
                # for/else: the loop finished without finding a taxobox in any link
                s = 'no taxobox in any first-level links'

    if d == 0:
        print(' ' * d + term + ': ' + s)

    return taxobox


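# A rough sketch of the unhandled 'automatic taxobox' / 'speciesbox' cases
# described in get_taxobox_from_search_term: pull the 'taxon = ...' entry out
# of the template and fetch the corresponding Template:Taxonomy page. The
# function name is a placeholder and nothing calls it yet; Template:Taxonomy
# pages use a 'parent = ...' chain rather than a classic taxobox, so the result
# would still need its own parser.
def follow_taxonomy_template(content):
    m = re.search(r'\|\s*taxon\s*=\s*([A-Za-z. ]+)', content)
    if m is None:
        return None
    taxon = m.group(1).strip()
    # note: for speciesbox pages the template may be keyed by genus only
    # (e.g. Template:Taxonomy/Allium for 'Allium sativum')
    query = ('https://en.wikipedia.org/w/api.php?action=query&prop=revisions'
             '&rvprop=content&format=json&titles=Template:Taxonomy/' + taxon +
             '&rvsection=0&redirects')
    return requests.get(query).text
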
def get_taxonomy_from_expanded_taxobox(taxobox, taxonomy):
    # for
    pass


def get_taxonomy_from_taxobox(taxobox, taxonomy):
    # dolphin:
    # | regnum = [[Animalia]]\n| phylum = [[Chordata]]\n| classis = [[Mammalia]]\n| ordo = [[Cetacea]]\n| subordo = [[Odontoceti]]\n| familia = *[[Delphinidae]]\n*[[Iniidae]]\n*\u2020[[Lipotidae]]\n*[[Platanistidae]]\n*[[Pontoporiidae]]\n|

    # blue whale:
    # regnum=[[Animal]]ia\n | phylum=[[Chordata]]\n | classis=[[Mammal]]ia\n | ordo = [[Cetartiodactyla]]{{bunch of reference text}}\n| unranked_subordo = [[Cetacea]]\n| unranked_superfamilia = [[Mysticeti]]\n | familia=[[Balaenopteridae]]\n | genus=''[[Balaenoptera]]''\n | species='''''B. musculus '''''\n |
    #
    # todo:
    # - rank parsing issues:
    #   - use link name, not link url
    #   x remove citations
    #   x deal with non-link ranks and links with different names
    #   x deal with missing ranks
    #   - deal with sub, super, infra, ultra, unranked
    #   - deal with multiple values
    #     x just use first
    #     - get a list

    PRINT = 0
    if PRINT:
        print('')
        print(taxonomy['common name'])
        print(taxobox)

    # TODO:
    # preprocess links (see the preprocess_wikilinks sketch after this function):
    #   [[xyz|qwe abc]] -> [[qwe abc]]
    #   [[qwe xyz]]abc  -> [[qwe xyzabc]]
    #
    # then:
    #   [[qwe xyz]] -> 'qwe xyz'

    for r in ranks:

        # extract 'row' of taxobox with this rank in it
        p = r + r'[ ]*=[^|]*\|'
        x = re.search(p, taxobox)
        if x is None:
            continue
        row = x.group(0)
        row2 = re.sub('{.*', '', row)  # remove {{...}} entities

        # extract based on link - fails if the entry is not a link
        p = r'\[\[[^]]*\]\]'
        rnames1 = re.findall(p, row2)
        rnames1 = [s[2:-2] for s in rnames1]

        # extract based on other stuff -
        row3 = row2.replace('[[', '')
        row4 = row3.replace(']]', '')
        row5 = row4.replace('*', '')
        row6 = row5.replace("'", '')
        p = r'=[ ]*[a-zA-Z .]*'
        x = re.search(p, row6)
        rname2 = x.group(0)
        rname2 = rname2.replace('=', '').replace('\\n', '').strip()

        taxonomy[r] = rname2
        if PRINT:
            print(r, row, row6, rnames1, rname2)

    if len(taxonomy) < 2:
        print('no data found for %s' % taxonomy['common name'])
    return taxonomy


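# A sketch of the link preprocessing described in the TODO inside
# get_taxonomy_from_taxobox; the name preprocess_wikilinks is a placeholder and
# nothing calls it yet. It resolves piped links to their display text, merges
# trailing fragments into the link text, and finally strips the brackets.
def preprocess_wikilinks(text):
    # [[target|label]] -> [[label]]
    text = re.sub(r'\[\[[^]|]*\|([^]]*)\]\]', r'[[\1]]', text)
    # [[name]]suffix -> [[namesuffix]]   (e.g. [[Animal]]ia -> [[Animalia]])
    text = re.sub(r'\[\[([^]]*)\]\](\w+)', r'[[\1\2]]', text)
    # [[name]] -> name
    text = re.sub(r'\[\[([^]]*)\]\]', r'\1', text)
    return text
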
def print_taxonomy(taxa):

    # header row: common names
    s1 = '%10s ' % ''
    s2 = ''
    for t in taxa:
        s2 = s2 + '%15s' % t['common name']
    if len(s2.strip()) > 0:
        print(s1 + s2)

    # one row per rank, with a blank cell when a taxon is missing that rank
    for r in ranks:
        s1 = '%10s: ' % ranks_english[r]
        s2 = ''
        for t in taxa:
            if r in t.keys():
                s2 = s2 + '%15s' % t[r]
            else:
                s2 = s2 + '%15s' % ' '

        if len(s2.strip()) > 0:
            print(s1 + s2)


def get_lowest_common_node(a, b):

    # walk from the most specific rank upward; the first shared value is the
    # lowest common node
    for r in reversed(ranks):
        if r in a.keys() and r in b.keys() and a[r] == b[r]:
            print('"%s" and "%s" share the same %s (%s)' %
                  (a['common name'], b['common name'], ranks_english[r], a[r]))
            break


def extract_ranks(text):
    # idea: start with a known list, including
    #   regnum, phylum, ..., genus, species
    # - split the text into lines
    # - look for lines containing these entries
    # - look at all intermediate lines
    # (a rough sketch, extract_ranks_sketch, follows below)
    pass


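# A rough sketch of the extract_ranks idea above, assuming 'text' is taxobox
# wikitext with one '| field = value' entry per line (as in the dolphin /
# blue whale examples). It returns the field names in the order they appear;
# filtering to actual ranks (regnum ... species plus sub/super/unranked
# variants) would still be needed. Not called anywhere yet.
def extract_ranks_sketch(text):
    found = []
    for line in text.replace('\\n', '\n').splitlines():
        m = re.match(r'\s*\|?\s*([a-z_]+)\s*=', line)
        if m:
            found.append(m.group(1))
    return found
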
if __name__ == '__main__':
    main()
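
# Example usage, assuming this file is saved as taxonomy.py (the defaults
# 'dolphin' and 'blue whale' are used when no arguments are given):
#   python taxonomy.py dolphin 'blue whale'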