Skip to content

Instantly share code, notes, and snippets.

@bioinformatist
Last active April 15, 2019 08:51
Show Gist options
  • Save bioinformatist/8c82b13f76e9eb6eecea1a15c8427ed5 to your computer and use it in GitHub Desktop.
Save bioinformatist/8c82b13f76e9eb6eecea1a15c8427ed5 to your computer and use it in GitHub Desktop.
A Python script for batch retrieval of virus host names from a list of NCBI Taxonomy IDs
#!/usr/bin/env python
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
import sys
import argparse
# Command-line interface.  The taxonomy IDs come either from a file (-f) or
# from a comma-separated string (-l); the mutually exclusive group rejects
# passing both, and passing neither ends the program with an error message.
#
# The text is passed as ``description=``: the first positional parameter of
# ArgumentParser is ``prog``, so passing it positionally would replace the
# program name in the usage line instead of describing the script.
parser = argparse.ArgumentParser(
    description='Batch get virus host name using a NCBI Taxonomy ID list')
group = parser.add_mutually_exclusive_group()
group.add_argument("-f", "--file", help = 'A file containing virus NCBI Taxonomy IDs, one ID per line')
group.add_argument("-l", "--list", help = '(multiple) virus NCBI Taxonomy IDs, separated by comma')
args = parser.parse_args()

# Build the module-level ``tax_id_list`` that get_host() iterates over.
if args.file:
    # Lines keep their trailing newline here; get_host() strips each entry.
    with open(args.file) as f:
        tax_id_list = f.readlines()
elif args.list:
    # Split on commas, tolerating one optional whitespace character after each.
    tax_id_list = re.split(r',\s?', args.list)
else:
    sys.exit('ERROR: You must provide either a file or a ID list!')
def eprint(*args, **kwargs):
    """Behave like ``print`` but write to standard error.

    All positional and keyword arguments are forwarded to ``print``
    unchanged; only the destination stream is fixed to ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def get_host(max_retries=5, retry_delay=1.0):
    """Print the host name(s) of every virus listed in ``tax_id_list``.

    For each NCBI Taxonomy ID in the module-level ``tax_id_list`` the
    Taxonomy Browser info page is downloaded, every ``<td>`` cell inside the
    first form of the page body is searched for ``Host: <name>`` and each hit
    is written to stdout as a tab-separated ``tax_id``/``host`` pair.
    Progress and problems are reported on stderr via ``eprint`` so stdout
    only carries results.

    Args:
        max_retries: Maximum number of download attempts per ID.  When all
            attempts fail the ID is reported on stderr and skipped, so one
            bad ID can no longer stall the whole batch.
        retry_delay: Seconds to wait between two failed download attempts.
    """
    # Function-scope stdlib imports so the block does not depend on changes
    # to the file's import section.
    import time
    from http.client import HTTPException

    host_pattern = re.compile(r'Host:\s(.*?)<')

    # Strip surrounding whitespace/newlines and drop blank entries (such as
    # an empty line in the input file) so no request is sent with an empty ID.
    tax_ids = [entry.strip() for entry in tax_id_list]
    tax_ids = [entry for entry in tax_ids if entry]
    total_number = len(tax_ids)

    for now, tax_id in enumerate(tax_ids, start=1):
        eprint('Processing {}/{}...'.format(now, total_number))
        address = 'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=' + tax_id

        # Only the download is retried; parsing and printing happen once,
        # afterwards, so a retry can never emit the same result twice.
        page = None
        for attempt in range(1, max_retries + 1):
            try:
                with urlopen(address) as response:
                    page = response.read()
                break
            except (OSError, HTTPException) as err:
                # URLError/HTTPError and socket errors are OSError subclasses.
                eprint('WARNING: attempt {}/{} for ID {} failed: {}'.format(
                    attempt, max_retries, tax_id, err))
                if attempt < max_retries:
                    time.sleep(retry_delay)
        if page is None:
            eprint('ERROR: could not download the page for ID {}; skipping.'.format(tax_id))
            continue

        soup = BeautifulSoup(page, 'html.parser')
        form = soup.body.form if soup.body is not None else None
        if form is None:
            # Retrying cannot fix a page that lacks the expected structure.
            eprint('WARNING: unexpected page layout for ID {}; skipping.'.format(tax_id))
            continue

        for cell in form.find_all('td'):
            for match in host_pattern.findall(str(cell)):
                print("{}\t{}".format(tax_id, match))
# Start the batch lookup only when this file is executed as a script.
if __name__ == "__main__":
    get_host()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment