#!/usr/bin/env python
"""Batch-look up virus host names from NCBI Taxonomy IDs.

For every Taxonomy ID supplied (via ``--file`` or ``--list``) the NCBI
Taxonomy Browser page is downloaded and each ``Host: ...`` entry found in the
page's form table is written to stdout as ``<tax_id>\\t<host>``. Progress and
warnings go to stderr so that stdout stays a clean tab-separated table.
"""
import argparse
import re
import sys
import time
from http.client import HTTPException
from urllib.parse import quote
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Base URL of the Taxonomy Browser "Info" page; the (escaped) ID is appended.
TAXONOMY_URL = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id='

# Captures the text between "Host: " and the next tag in a serialized <td>.
# Compiled once because it is applied to every table cell of every page.
HOST_PATTERN = re.compile(r'Host:\s(.*?)<')

MAX_ATTEMPTS = 5       # download attempts per ID before giving up on it
RETRY_DELAY = 2        # seconds; multiplied by the attempt number (linear back-off)
REQUEST_TIMEOUT = 30   # seconds before a stalled request is abandoned


def eprint(*args, **kwargs):
    """Print to stderr; accepts the same arguments as ``print``."""
    print(*args, file=sys.stderr, **kwargs)


def _parse_args(argv=None):
    """Parse command-line options (``sys.argv[1:]`` when *argv* is None)."""
    # The text is passed as ``description``; given positionally it would be
    # taken as ``prog`` and garble the usage line.
    parser = argparse.ArgumentParser(
        description='Batch get virus host name using a NCBI Taxonomy ID list')
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "-f", "--file",
        help='A file containing virus NCBI Taxonomy IDs, one ID per line')
    group.add_argument(
        "-l", "--list",
        help='(multiple) virus NCBI Taxonomy IDs, separated by comma')
    return parser.parse_args(argv)


def _load_tax_ids(args):
    """Return the raw ID strings named by the parsed *args*.

    Exits with an error message when neither ``--file`` nor ``--list`` was
    given.
    """
    if args.file:
        with open(args.file) as handle:
            return handle.readlines()
    if args.list:
        return args.list.split(',')
    sys.exit('ERROR: You must provide either a file or a ID list!')


def _fetch_page(address):
    """Download *address* and return its raw bytes, or None on failure.

    Network-level failures are retried up to ``MAX_ATTEMPTS`` times with a
    growing pause, so a persistent error can neither loop forever nor hammer
    the server.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            with urlopen(address, timeout=REQUEST_TIMEOUT) as response:
                return response.read()
        except (OSError, HTTPException) as err:
            # URLError/HTTPError and socket timeouts are OSError subclasses;
            # HTTPException covers truncated or malformed responses.
            eprint('WARNING: attempt {}/{} failed: {}'.format(
                attempt, MAX_ATTEMPTS, err))
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY * attempt)
    return None


def _extract_hosts(html):
    """Return every host string found in the form table cells of *html*.

    Returns an empty list when the page has no ``<body>`` or ``<form>``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    form = soup.body.form if soup.body is not None else None
    if form is None:
        return []
    hosts = []
    for cell in form.find_all('td'):
        hosts.extend(HOST_PATTERN.findall(str(cell)))
    return hosts


def get_host(tax_ids=None):
    """Print ``<tax_id>\\t<host>`` to stdout for every host of every ID.

    Args:
        tax_ids: Iterable of Taxonomy ID strings. When omitted, the IDs are
            taken from the command line (``--file`` / ``--list``), so a bare
            ``get_host()`` call behaves like running the script.

    IDs are stripped of surrounding whitespace and blank entries are skipped.
    An ID whose page cannot be downloaded is reported on stderr and skipped;
    processing continues with the next ID.
    """
    if tax_ids is None:
        tax_ids = _load_tax_ids(_parse_args())

    # Blank lines and stray spaces would otherwise become bogus queries.
    cleaned = [tax_id.strip() for tax_id in tax_ids]
    cleaned = [tax_id for tax_id in cleaned if tax_id]

    total_number = len(cleaned)
    for now, tax_id in enumerate(cleaned, 1):
        eprint('Processing {}/{}...'.format(now, total_number))
        # Escape the ID so unexpected characters cannot corrupt the URL.
        page = _fetch_page(TAXONOMY_URL + quote(tax_id, safe=''))
        if page is None:
            eprint('ERROR: giving up on {} after {} attempts'.format(
                tax_id, MAX_ATTEMPTS))
            continue
        # Parse only after the download has fully succeeded, so a retry can
        # never cause already-printed rows to be printed again.
        for host in _extract_hosts(page):
            print("{}\t{}".format(tax_id, host))


if __name__ == "__main__":
    get_host()