Last active
November 11, 2021 04:20
-
-
Save alanbernstein/c4fb09a4393d0c17ccd02632a016f2d4 to your computer and use it in GitHub Desktop.
Revisions
-
alanbernstein renamed this gist
Nov 11, 2021 . 1 changed file with 0 additions and 0 deletions.There are no files selected for viewing
File renamed without changes. -
alanbernstein created this gist
Nov 11, 2021 .There are no files selected for viewing
#!/usr/local/bin/python
"""Compare the taxonomic classification of two living things.

Each search term is looked up on English Wikipedia; the "taxobox" infobox of
the matching page (or of a page linked from its intro section) is parsed into
a rank -> name mapping, the two mappings are printed side by side, and the
lowest rank they share is reported.

Usage: pass two search terms on the command line, or none to compare
'dolphin' with 'blue whale'.
"""
import re
import sys
from pprint import pprint

import requests
import wikipedia

# TODO: cache results and use cache to build up a tree
# TODO: handle disambiguation page (e.g., for 'orange')

# alternately could use this http://www.itis.gov/index.html but requires
# a lot more work to decide which search result to use
#
# https://en.wikipedia.org/wiki/Horse
# https://en.wikipedia.org/wiki/Template:Taxobox#Classification
# https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=kidney%20bean&rvsection=0&redirects

API_URL = 'https://en.wikipedia.org/w/api.php'
REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection hangs forever
DEBUG = False  # set True to dump raw API responses and parsing details

ranks = ['regnum', 'phylum', 'classis', 'ordo',
         'subordo', 'familia', 'genus', 'species']
ranks_english = {
    'regnum': 'kingdom',
    'phylum': 'phylum',
    'classis': 'class',
    'ordo': 'order',
    'subordo': 'suborder',
    'familia': 'family',
    'genus': 'genus',
    'species': 'species',
}

# A wiki link together with any non-space text glued in front of it.
_LINK_RE = re.compile(r'[^ ]*\[\[([^]]*)\]\]')
# A piped wiki link: group 1 is the target page, group 2 the displayed label.
_PIPED_LINK_RE = re.compile(r'\[\[([^\]|]*)\|([^\]]*)\]\]')


def main():
    """Compare the two terms given on the command line (or the defaults)."""
    if len(sys.argv) > 2:
        searchterm1, searchterm2 = sys.argv[1], sys.argv[2]
    else:
        searchterm1, searchterm2 = 'dolphin', 'blue whale'

    print(searchterm1, searchterm2)

    taxobox1, tax1 = get_taxonomy(searchterm1)
    taxobox2, tax2 = get_taxonomy(searchterm2)

    # Only compare when both lookups produced something to parse.
    if taxobox1 and taxobox2:
        print_taxonomy([tax1, tax2])
        get_lowest_common_node(tax1, tax2)


def get_taxonomy(search_term):
    """Look up *search_term* and parse its taxobox.

    Returns a ``(taxobox, tax)`` tuple. ``taxobox`` is the raw text returned
    by get_taxobox_from_search_term (None when nothing was found); ``tax`` is
    a dict that always holds 'common name' plus one entry per rank that could
    be parsed. A message is printed when no taxobox could be retrieved.
    """
    tax = {'common name': search_term}
    taxobox = get_taxobox_from_search_term(search_term)
    if taxobox:
        tax = get_taxonomy_from_taxobox(taxobox, tax)
    else:
        print('unable to retrieve info for %s' % tax['common name'])
    return taxobox, tax


def _extract_links(content):
    """Return ``[preceding_word, target]`` pairs for the wiki links in *content*.

    ``preceding_word`` is the space-delimited word just before the token that
    holds the link. ``target`` is the linked page title, with any
    ``|display text`` and ``#section`` suffix removed so that it can be used
    directly as a page title.
    """
    links = []
    for m in _LINK_RE.finditer(content):
        # The match starts at the beginning of the token holding the link, so
        # the previous word ends at the separating space one character
        # earlier. Clamp to 0 so a link at the very start of the text does
        # not turn into a negative (from-the-end) index.
        word_end = max(m.start(0) - 1, 0)
        word_start = content.rfind(' ', 0, word_end) + 1
        target = m.group(1).split('|')[0].split('#')[0].strip()
        if target:
            links.append([content[word_start:word_end], target])
    return links


def _first_taxobox_from_links(links, d, tried):
    """Return ``(taxobox, target)`` for the first untried link yielding one.

    *tried* is a set of targets that were already requested; it is updated in
    place so that no page is fetched twice. Returns ``(None, None)`` when no
    link leads to a usable taxobox.
    """
    for _prev_word, target in links:
        if target in tried:
            continue
        tried.add(target)
        tb = get_taxobox_from_search_term(target, d + 1)
        if tb:
            return tb, target
    return None, None


def get_taxobox_from_search_term(term, d=0):
    """Fetch the intro section of the page titled *term* and find a taxobox.

    *d* is the recursion depth: links found on the page are followed only
    when ``d == 0``, and the one-line status message is printed only then.

    Returns the raw API response text when it contains a usable taxobox
    (either on the page itself or on a first-level linked page), else None.
    Network errors raised by ``requests`` (including timeouts) propagate to
    the caller.
    """
    # idea: given a search term, try to get the "taxobox" infobox about the
    # living thing described by that term. several possibilities:
    # - page matching search term is the desired page, and has a taxobox
    #   - done, extract taxobox contents and pass on
    # - page matching search term has no taxobox
    #   - search for taxonomy link, repeat (link preceded by 'species',
    #     'genus', ...)
    #     https://en.wikipedia.org/wiki/Salmon
    #     https://en.wikipedia.org/wiki/Hazelnut
    # - page is a redirect
    #   https://en.wikipedia.org/wiki/Phaseolus_lunatus (lima bean)
    # - page contains taxobox, but taxobox is a template itself
    #   https://en.wikipedia.org/wiki/Spider -
    #   https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae#
    #
    # todo: record which of these cases occur for which terms

    # Let requests build the query string so that terms containing spaces,
    # '&', '#', '+' etc. are encoded instead of corrupting the URL.
    params = {
        'action': 'query',
        'prop': 'revisions',
        'rvprop': 'content',
        'format': 'json',
        'titles': term,
        'rvsection': 0,
        'redirects': 1,  # boolean flag: presence alone enables it
    }
    resp = requests.get(API_URL, params=params, timeout=REQUEST_TIMEOUT)
    content = resp.text
    taxobox = None

    if DEBUG:
        print(content)

    lowered = content.lower()
    if '#REDIRECT' in content:
        s = 'is redirect'
        # handled by additional parameter in request string
    elif 'automatic taxobox' in lowered:
        s = 'contains taxobox template'
        # find the 'taxon' entry in the taxobox, go to it
        # example: spiders is 'taxon = Araneae' and uses this:
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Araneae
        # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=spider&rvsection=0&rvexpandtemplates
    elif 'speciesbox' in lowered:
        s = 'contains speciesbox template'
        # find the 'taxon' entry in the taxobox, go to it
        # example: garlic is 'taxon = Allium sativum', and uses this:
        # https://en.wikipedia.org/wiki/Template:Taxonomy/Allium
        # https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=xmlfm&titles=spider&rvsection=0&rvexpandtemplates
    elif 'taxobox' in lowered:
        s = 'contains usable taxobox'
        taxobox = content
    else:
        s = 'contains no taxobox... '
        if d == 0:
            # check pages of all links for taxoboxes
            # first links preceded by the word 'species', then 'genus',
            # then 'family', then all other links in the intro section
            links = _extract_links(content)
            # pprint(links)
            tried = set()

            # check the species/genus/family links
            for prev_word in ['species', 'genus', 'family']:
                candidates = [n for n in links if prev_word in n[0]]
                taxobox, target = _first_taxobox_from_links(
                    candidates, d, tried)
                if taxobox:
                    s = ('retrieved info from ' + prev_word +
                         ' link "' + target + '"')
                    break

            # check all other links
            if not taxobox:
                taxobox, target = _first_taxobox_from_links(links, d, tried)
                if taxobox:
                    s = 'retrieved info from link "' + target + '"'
                else:
                    s = 'no taxobox in any first-level links'

    if d == 0:
        print(term + ': ' + s)
    return taxobox


def get_taxonomy_from_expanded_taxobox(taxobox, taxonomy):
    """Placeholder for parsing a template-expanded taxobox.

    Not implemented yet; always returns None.
    """
    pass


def _rank_row_pattern(rank):
    """Compile the regex matching the ``rank = value |`` row of a taxobox.

    The rank name must not be the tail of a longer parameter name, otherwise
    'familia' would match inside 'unranked_superfamilia' and 'ordo' inside
    'subordo'. The text being searched is raw JSON, where a line break is the
    two characters backslash + 'n'; a rank directly following such an escaped
    newline is still accepted.
    """
    return re.compile(
        r'(?:(?<![A-Za-z_])|(?<=\\n))' + re.escape(rank) + r'[ ]*=[^|]*\|')


def _use_link_label(match):
    """Rewrite ``[[target|label]]`` as ``[[label]]`` (target if label empty)."""
    return '[[' + (match.group(2) or match.group(1)) + ']]'


def get_taxonomy_from_taxobox(taxobox, taxonomy):
    """Parse the rank rows of *taxobox* into *taxonomy*.

    *taxobox* is the raw text containing the infobox; *taxonomy* is a dict
    that already holds 'common name'. For every entry of ``ranks`` found in
    the text, the first plain-text value of the row is stored under that rank
    name. Ranks that are absent or have an empty value are left out. The
    dict is modified in place and also returned.
    """
    # dolphin:
    # | regnum = [[Animalia]]\n| phylum = [[Chordata]]\n| classis = [[Mammalia]]\n| ordo = [[Cetacea]]\n| subordo = [[Odontoceti]]\n| familia = *[[Delphinidae]]\n*[[Iniidae]]\n*\u2020[[Lipotidae]]\n*[[Platanistidae]]\n*[[Pontoporiidae]]\n|
    # blue whale:
    # regnum=[[Animal]]ia\n | phylum=[[Chordata]]\n | classis=[[Mammal]]ia\n | ordo = [[Cetartiodactyla]]{{bunch of reference text}}\n| unranked_subordo = [[Cetacea]]\n| unranked_superfamilia = [[Mysticeti]]\n | familia=[[Balaenopteridae]]\n | genus=''[[Balaenoptera]]''\n | species='''''B. musculus '''''\n |
    #
    # todo:
    # - rank parsing issues:
    #   x use link name, not link url
    #   x remove citations
    #   x deal with non-link ranks and links with different names
    #   x deal with missing ranks
    #   - deal with sub, super, infra, ultra, unranked
    #   - deal with multiple values
    #     x just use first
    #     - get a list
    if DEBUG:
        print('')
        print(taxonomy['common name'])
        print(taxobox)

    # Reduce piped links to their label first: rows are delimited by '|', so
    # a pipe inside a link would otherwise cut the row short and leave the
    # link target instead of the displayed name.
    text = _PIPED_LINK_RE.sub(_use_link_label, taxobox)

    for r in ranks:
        # extract 'row' of taxobox with this rank in it
        found = _rank_row_pattern(r).search(text)
        if found is None:
            continue
        row = found.group(0)

        # remove {{}} entities (citations etc.) up to the end of the row
        cleaned = re.sub(r'\{.*', '', row)
        # drop link brackets, list bullets and bold/italic quote markup
        for markup in ('[[', ']]', '*', "'"):
            cleaned = cleaned.replace(markup, '')

        # The row always contains '=', so this search cannot fail; the value
        # is the leading run of letters, spaces and dots (first value only).
        value = re.search(r'=[ ]*([a-zA-Z .]*)', cleaned).group(1).strip()

        if DEBUG:
            print(r, row, cleaned, value)

        # An empty value carries no information and would compare equal to
        # another empty value when looking for a shared rank.
        if value:
            taxonomy[r] = value

    if len(taxonomy) < 2:
        print('no data found for %s' % taxonomy['common name'])

    return taxonomy


def print_taxonomy(taxa):
    """Print the taxonomies in *taxa* as a table, one column per taxon.

    The header row shows each 'common name'; a row is printed for every rank
    that at least one taxon has a value for.
    """
    # Two trailing spaces keep the header the same width as the '%10s: '
    # prefix of the rank rows.
    header = ''.join('%15s' % t['common name'] for t in taxa)
    if header.strip():
        print('%10s  ' % '' + header)

    for r in ranks:
        cells = ''.join('%15s' % t.get(r, ' ') for t in taxa)
        if cells.strip():
            print('%10s: ' % ranks_english[r] + cells)


def get_lowest_common_node(a, b):
    """Report the most specific rank on which taxonomies *a* and *b* agree.

    Ranks are checked from 'species' up to 'regnum'. The first rank present
    in both dicts with equal values is printed and its (Latin) rank name is
    returned; None is returned when no rank is shared.
    """
    for r in reversed(ranks):
        if r in a and r in b and a[r] == b[r]:
            print('"%s" and "%s" share the same %s (%s)'
                  % (a['common name'], b['common name'],
                     ranks_english[r], a[r]))
            return r
    return None


def extract_ranks(text):
    """Placeholder for a more general rank extractor; not implemented."""
    # idea: start with a known list, including
    # regnum, phylum, ..., genus, species
    # - split the text into lines
    # - look for lines containing these entries
    # - look at all intermediate lines
    pass


if __name__ == '__main__':
    main()