#!/usr/bin/env python
import argparse
import re
import sys
import time
from urllib.request import urlopen

from bs4 import BeautifulSoup

# Command-line interface: taxonomy IDs come either from a file (-f) or from an
# inline comma-separated list (-l); the two options are mutually exclusive.
# The text is passed as ``description`` because the first positional parameter
# of ArgumentParser is ``prog``, which would put it in the usage line instead.
parser = argparse.ArgumentParser(
    description='Batch get virus host name using a NCBI Taxonomy ID list')
group = parser.add_mutually_exclusive_group()
group.add_argument("-f", "--file", help='A file containing virus NCBI Taxonomy IDs, one ID per line')
group.add_argument("-l", "--list", help='(multiple) virus NCBI Taxonomy IDs, separated by comma')
args = parser.parse_args()

if args.file:
    with open(args.file) as f:
        _raw_ids = f.read().splitlines()
elif args.list:
    _raw_ids = args.list.split(',')
else:
    sys.exit('ERROR: You must provide either a file or a ID list!')

# Strip surrounding whitespace and drop empty entries so that blank lines in
# the file or stray spaces/commas in the list never become bogus IDs.
tax_id_list = [entry.strip() for entry in _raw_ids if entry.strip()]

def eprint(*args, **kwargs):
    """Print to standard error, otherwise behaving like the built-in ``print``.

    All positional and keyword arguments are forwarded to ``print``.
    Supplying ``file`` in ``kwargs`` raises ``TypeError`` because the
    destination is already fixed to ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)

# Base URL of the NCBI Taxonomy Browser info page; the taxonomy ID is appended.
_TAXONOMY_URL = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id='

# Captures the text between "Host: " and the next "<" in a table cell's HTML.
_HOST_PATTERN = re.compile(r'Host:\s(.*?)<')


def _fetch_hosts(tax_id, timeout):
    """Download the taxonomy page for ``tax_id`` and return its host strings."""
    # The context manager closes the HTTP response once it has been parsed.
    with urlopen(_TAXONOMY_URL + tax_id, timeout=timeout) as response:
        soup = BeautifulSoup(response, 'html.parser')
    form = soup.body.form if soup.body is not None else None
    if form is None:
        # Page without the expected <body><form> layout: nothing to extract.
        return []
    hosts = []
    for cell in form.find_all('td'):
        hosts.extend(_HOST_PATTERN.findall(str(cell)))
    return hosts


def get_host(tax_ids=None, max_retries=5, retry_delay=2.0, timeout=30):
    """Print ``<tax_id>\\t<host>`` to stdout for every host found per ID.

    Progress and error messages go to stderr via ``eprint``. Each page is
    fetched at most ``max_retries`` times; an ID that still fails is reported
    and skipped rather than retried forever.

    Args:
        tax_ids: Iterable of taxonomy ID strings. Defaults to the
            module-level ``tax_id_list``. Surrounding whitespace is stripped
            and blank entries are skipped.
        max_retries: Maximum number of fetch attempts per ID (at least 1).
        retry_delay: Base delay in seconds between attempts; the wait grows
            linearly with the attempt number.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        None.
    """
    if tax_ids is None:
        tax_ids = tax_id_list
    stripped = (entry.strip() for entry in tax_ids)
    ids = [entry for entry in stripped if entry]
    total_number = len(ids)
    attempts = max(1, max_retries)

    for now, tax_id in enumerate(ids, start=1):
        eprint('Processing {}/{}...'.format(now, total_number))
        for attempt in range(1, attempts + 1):
            try:
                hosts = _fetch_hosts(tax_id, timeout)
            except Exception as err:
                # Best-effort scraping: report the failure and retry a bounded
                # number of times instead of looping forever.
                eprint('  attempt {}/{} for ID {} failed: {!r}'.format(
                    attempt, attempts, tax_id, err))
                if attempt < attempts:
                    time.sleep(retry_delay * attempt)
            else:
                # Print only after the whole page parsed successfully, so a
                # retry can never emit duplicate rows.
                for host in hosts:
                    print("{}\t{}".format(tax_id, host))
                break
        else:
            eprint('WARNING: giving up on ID {} after {} attempts'.format(
                tax_id, attempts))

# Script entry point: run the host lookup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    get_host()