bioinformatist · April 15, 2019 08:51
diff --git a/txid2host.py b/txid2host.py
 #!/usr/bin/env python
 from bs4 import BeautifulSoup
 from urllib.request import urlopen
 import re
 import sys
 import argparse

 # Command-line handling: the IDs come either from a file (-f) or from a
 # comma-separated list (-l); the two options are mutually exclusive.
 # NOTE: the summary must be passed as ``description``. The first positional
 # parameter of ArgumentParser is ``prog``, which would put the whole sentence
 # into the "usage:" line in place of the program name.
 parser = argparse.ArgumentParser(description='Batch get virus host name using a NCBI Taxonomy ID list')
 group = parser.add_mutually_exclusive_group()
 group.add_argument("-f", "--file", help='A file containing virus NCBI Taxonomy IDs, one ID per line')
 group.add_argument("-l", "--list", help='(multiple) virus NCBI Taxonomy IDs, separated by comma')
 args = parser.parse_args()

 if args.file:
    with open(args.file) as f:
        # Strip line endings and drop blank lines so no empty ID is queried.
        tax_id_list = [line.strip() for line in f if line.strip()]
 elif args.list:
    # Tolerate any amount of whitespace around the commas and ignore empty
    # items such as those produced by a trailing comma.
    tax_id_list = [item for item in re.split(r'\s*,\s*', args.list.strip()) if item]
 else:
    sys.exit('ERROR: You must provide either a file or a ID list!')

 def eprint(*args, **kwargs):
    """Print a message to standard error.

    Positional and keyword arguments are forwarded unchanged to the built-in
    ``print``; only the destination stream is fixed to ``sys.stderr``.
    """
    stderr_stream = sys.stderr
    print(*args, file=stderr_stream, **kwargs)

 def get_host(max_attempts=5, retry_delay=1.0, timeout=30):
    """Print the host annotation for every taxonomy ID in ``tax_id_list``.

    For each ID the NCBI Taxonomy Browser info page is downloaded, every
    ``<td>`` cell inside the page's first form is scanned for the text
    ``Host: ...`` and each hit is written to standard output as
    ``<tax_id>\\t<host>``. Progress and failures go to standard error.

    Args:
        max_attempts: How many times one ID is tried before it is skipped.
        retry_delay: Seconds to wait between two attempts for the same ID.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    IDs whose page contains no ``Host:`` entry produce no output line.
    """
    # Imported locally so the block does not depend on a file-level import.
    import time

    # Compile once instead of rebuilding the pattern for every table cell.
    host_pattern = re.compile(r'Host:\s(.*?)<')

    # Blank entries (e.g. empty lines in the input file) would otherwise be
    # sent to the server as an empty ``id=`` query.
    pending = [raw.strip() for raw in tax_id_list if raw.strip()]
    total_number = len(pending)

    for now, tax_id in enumerate(pending, start=1):
        eprint('Processing {}/{}...'.format(now, total_number))
        address = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=' + tax_id
        # Bounded retry: the previous ``while True`` loop never terminated
        # when a failure was permanent (bad ID, unexpected page layout, ...).
        for attempt in range(1, max_attempts + 1):
            try:
                # The context manager closes the HTTP response after parsing.
                with urlopen(address, timeout=timeout) as response:
                    soup = BeautifulSoup(response, 'html.parser')
                # Collect first, print later, so a failure half-way through
                # cannot lead to duplicated output lines on the next attempt.
                hosts = [
                    match
                    for cell in soup.body.form.find_all('td')
                    for match in host_pattern.findall(str(cell))
                ]
            except Exception as error:
                eprint('Attempt {}/{} for {} failed: {}'.format(attempt, max_attempts, tax_id, error))
                if attempt < max_attempts:
                    time.sleep(retry_delay)
                continue
            for host in hosts:
                print("{}\t{}".format(tax_id, host))
            break
        else:
            # Every attempt failed: report it and move on to the next ID.
            eprint('Giving up on {} after {} attempts'.format(tax_id, max_attempts))

 # Start the lookup only when this file is executed as a script.
 # NOTE: the argument parsing above is module-level code and therefore runs
 # on import as well; this guard only protects the call to get_host().
 if __name__ == "__main__":
    get_host()
	#!/usr/bin/env python
	from bs4 import BeautifulSoup
	from urllib.request import urlopen
	import re
	import sys
	import argparse

	# Command-line handling: the IDs come either from a file (-f) or from a
	# comma-separated list (-l); the two options are mutually exclusive.
	# NOTE: the summary must be passed as ``description``. The first positional
	# parameter of ArgumentParser is ``prog``, which would put the whole
	# sentence into the "usage:" line in place of the program name.
	parser = argparse.ArgumentParser(description='Batch get virus host name using a NCBI Taxonomy ID list')
	group = parser.add_mutually_exclusive_group()
	group.add_argument("-f", "--file", help='A file containing virus NCBI Taxonomy IDs, one ID per line')
	group.add_argument("-l", "--list", help='(multiple) virus NCBI Taxonomy IDs, separated by comma')
	args = parser.parse_args()

	if args.file:
	    with open(args.file) as f:
	        # Strip line endings and drop blank lines so no empty ID is queried.
	        tax_id_list = [line.strip() for line in f if line.strip()]
	elif args.list:
	    # Tolerate any amount of whitespace around the commas and ignore empty
	    # items such as those produced by a trailing comma.
	    tax_id_list = [item for item in re.split(r'\s*,\s*', args.list.strip()) if item]
	else:
	    sys.exit('ERROR: You must provide either a file or a ID list!')

	def eprint(*args, **kwargs):
	    """Print a message to standard error.

	    Takes the same positional and keyword arguments as the built-in
	    ``print`` (``sep``, ``end``, ``flush``); the output stream is always
	    ``sys.stderr``.
	    """
	    # ``*args``/``**kwargs`` (not ``args, *kwargs``) so that any number of
	    # values and all of print's keyword options are forwarded correctly.
	    print(*args, file=sys.stderr, **kwargs)

	def get_host(max_attempts=5, retry_delay=1.0, timeout=30):
	    """Print the host annotation for every taxonomy ID in ``tax_id_list``.

	    For each ID the NCBI Taxonomy Browser info page is downloaded, every
	    ``<td>`` cell inside the page's first form is scanned for the text
	    ``Host: ...`` and each hit is written to standard output as
	    ``<tax_id>\\t<host>``. Progress and failures go to standard error.

	    Args:
	        max_attempts: How many times one ID is tried before it is skipped.
	        retry_delay: Seconds to wait between two attempts for the same ID.
	        timeout: Socket timeout in seconds passed to ``urlopen``.

	    IDs whose page contains no ``Host:`` entry produce no output line.
	    """
	    # Imported locally so the block does not depend on a file-level import.
	    import time

	    # Compile once instead of rebuilding the pattern for every table cell.
	    host_pattern = re.compile(r'Host:\s(.*?)<')

	    # Blank entries (e.g. empty lines in the input file) would otherwise be
	    # sent to the server as an empty ``id=`` query.
	    pending = [raw.strip() for raw in tax_id_list if raw.strip()]
	    total_number = len(pending)

	    for now, tax_id in enumerate(pending, start=1):
	        eprint('Processing {}/{}...'.format(now, total_number))
	        address = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=' + tax_id
	        # Bounded retry: an unconditional ``while True`` loop never ends
	        # when a failure is permanent (bad ID, unexpected page layout, ...).
	        for attempt in range(1, max_attempts + 1):
	            try:
	                # The context manager closes the HTTP response after parsing.
	                with urlopen(address, timeout=timeout) as response:
	                    soup = BeautifulSoup(response, 'html.parser')
	                # Collect first, print later, so a failure half-way through
	                # cannot lead to duplicated output lines on the next attempt.
	                hosts = [
	                    match
	                    for cell in soup.body.form.find_all('td')
	                    for match in host_pattern.findall(str(cell))
	                ]
	            except Exception as error:
	                eprint('Attempt {}/{} for {} failed: {}'.format(attempt, max_attempts, tax_id, error))
	                if attempt < max_attempts:
	                    time.sleep(retry_delay)
	                continue
	            for host in hosts:
	                print("{}\t{}".format(tax_id, host))
	            break
	        else:
	            # Every attempt failed: report it and move on to the next ID.
	            eprint('Giving up on {} after {} attempts'.format(tax_id, max_attempts))

	# Start the lookup only when this file is executed as a script.
	# The call must be indented under the guard; at the same level as the
	# ``if`` line the statement is a syntax error.
	if __name__ == "__main__":
	    get_host()