Last active
April 15, 2019 08:51
-
-
Save bioinformatist/8c82b13f76e9eb6eecea1a15c8427ed5 to your computer and use it in GitHub Desktop.
A Python script that batch-retrieves virus host names from a list of NCBI Taxonomy IDs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| from bs4 import BeautifulSoup | |
| from urllib.request import urlopen | |
| import re | |
| import sys | |
| import argparse | |
# Command-line interface: taxonomy IDs come either from a file (-f) or from a
# comma-separated string (-l); the two options are mutually exclusive.
parser = argparse.ArgumentParser(
    # Passed by keyword on purpose: the first positional parameter of
    # ArgumentParser is `prog`, so a positional string would be shown as the
    # program name in the usage line instead of as the description.
    description='Batch get virus host name using a NCBI Taxonomy ID list')
group = parser.add_mutually_exclusive_group()
group.add_argument("-f", "--file", help='A file containing virus NCBI Taxonomy IDs, one ID per line')
group.add_argument("-l", "--list", help='(multiple) virus NCBI Taxonomy IDs, separated by comma')
args = parser.parse_args()

if args.file:
    with open(args.file) as f:
        raw_ids = f.readlines()
elif args.list:
    raw_ids = args.list.split(',')
else:
    sys.exit('ERROR: You must provide either a file or a ID list!')

# Normalise the entries once, here: strip surrounding whitespace/newlines and
# drop blanks (empty lines in the file, trailing commas in the list) so that
# no empty or space-padded ID is ever turned into a request URL.
tax_id_list = [tax_id.strip() for tax_id in raw_ids if tax_id.strip()]
def eprint(*values, **options):
    """Print to standard error.

    Accepts the same positional values and keyword options as the built-in
    ``print`` (for example ``sep`` and ``end``); the output stream is fixed
    to ``sys.stderr``.
    """
    print(*values, file=sys.stderr, **options)
def _fetch_page(address, max_retries, retry_delay, timeout):
    """Download *address* and return the response body as bytes.

    Network-level failures are retried up to *max_retries* times (at least
    one attempt is always made), sleeping *retry_delay* seconds between
    attempts. Returns ``None`` when every attempt failed.
    """
    import time
    from http.client import HTTPException

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            # The context manager closes the HTTP response on every path.
            with urlopen(address, timeout=timeout) as response:
                return response.read()
        except (OSError, HTTPException) as err:
            # URLError/HTTPError and socket timeouts are OSError subclasses;
            # HTTPException covers protocol-level failures from http.client.
            eprint('Attempt {}/{} failed for {}: {}'.format(attempt, attempts, address, err))
            if attempt < attempts:
                time.sleep(retry_delay)
    return None


def get_host(max_retries=5, retry_delay=1.0, timeout=30):
    """Print the host(s) of every taxonomy ID in the module-level ``tax_id_list``.

    For each ID the NCBI Taxonomy Browser info page is downloaded and every
    ``<td>`` inside the page's first form is searched for ``Host: <text>``.
    Each match is written to stdout as ``<tax_id>\\t<host>``; progress and
    warnings go to stderr via ``eprint``.

    Args:
        max_retries: maximum number of download attempts per ID. The download
            is no longer retried forever, so a permanently failing ID cannot
            hang the run.
        retry_delay: seconds to wait between two attempts for the same ID.
        timeout: socket timeout, in seconds, passed to ``urlopen``.

    IDs whose page cannot be downloaded, or whose page lacks the expected
    ``<body><form>`` structure, are reported on stderr and skipped.
    """
    from urllib.parse import quote

    # Compiled once: the pattern is the same for every page and cell.
    host_pattern = re.compile(r'Host:\s(.*?)<')
    total_number = len(tax_id_list)

    for now, tax_id in enumerate(tax_id_list, start=1):
        tax_id = tax_id.strip()
        eprint('Processing {}/{}...'.format(now, total_number))
        if not tax_id:
            # Blank entry (e.g. an empty line in the input file): nothing to query.
            continue

        # Escape the ID so stray characters cannot alter the query string.
        address = ('https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id='
                   + quote(tax_id, safe=''))

        page = _fetch_page(address, max_retries, retry_delay, timeout)
        if page is None:
            eprint('WARNING: could not download the page for taxonomy ID {}, skipping'.format(tax_id))
            continue

        # Parsing happens outside the retry loop: a layout problem will not
        # fix itself on retry, and matches can never be printed twice.
        soup = BeautifulSoup(page, 'html.parser')
        form = soup.body.form if soup.body is not None else None
        if form is None:
            eprint('WARNING: unexpected page layout for taxonomy ID {}, skipping'.format(tax_id))
            continue

        for cell in form.find_all('td'):
            for match in host_pattern.findall(str(cell)):
                print("{}\t{}".format(tax_id, match))


# Run the lookup only when executed as a script, not when imported.
if __name__ == "__main__":
    get_host()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment