Skip to content

Instantly share code, notes, and snippets.

@bioinformatist
Created March 29, 2019 02:17
Show Gist options
  • Save bioinformatist/18a804056cb7108d7cce7a54c9934350 to your computer and use it in GitHub Desktop.
Save bioinformatist/18a804056cb7108d7cce7a54c9934350 to your computer and use it in GitHub Desktop.
Get latest complete genome URLs from IMG XML file
#!/usr/bin/env python
from datetime import datetime
import argparse
import xml.etree.ElementTree as ET
# Command-line interface: a single positional argument naming the IMG XML file.
# The summary is passed as ``description`` so that it shows up in ``--help``;
# given as the first positional argument it would be taken as ``prog`` and
# printed as the program name in the usage line instead.
parser = argparse.ArgumentParser(
    description='Get latest complete genome URLs from IMG XML file')
parser.add_argument("file", help='A XML file downloaded from IMG')
# Parsed at module level; the result is read by the ``__main__`` guard below.
args = parser.parse_args()
# Registry filled by latest_complete():
#   label -> {naive datetime of the file -> URL path of the file}
name_time_url = {}


def latest_complete(file):
    """Print the newest assembly download URL for every label in an XML file.

    Each ``<file>`` element whose ``filename`` attribute contains
    ``assembly.fasta`` or ``assembled.fna`` is recorded in the module-level
    ``name_time_url`` mapping under its ``label`` attribute, keyed by its
    parsed ``timestamp``.  Afterwards, for every label held in that mapping,
    the URL with the most recent timestamp is printed, prefixed with
    ``https://genome.jgi.doe.gov``.

    Elements lacking a ``filename``, ``timestamp`` or ``url`` attribute are
    skipped, because they cannot be filtered, ranked or printed.

    Args:
        file: Path or file object accepted by ``ElementTree.parse``.

    Returns:
        The list of full URLs that were printed, in print order.

    Raises:
        ValueError: If a timestamp does not look like
            ``Tue Mar 05 10:00:00 PST 2019`` (zone abbreviation optional).
        xml.etree.ElementTree.ParseError: If ``file`` is not well-formed XML.
    """
    root = ET.parse(file).getroot()
    for child in root.iter('file'):
        filename = child.get('filename')
        timestamp = child.get('timestamp')
        url = child.get('url')
        if not filename or timestamp is None or url is None:
            continue
        if 'assembly.fasta' not in filename and 'assembled.fna' not in filename:
            continue

        # Drop the time-zone abbreviation (fifth of six tokens) whatever it
        # is, not just PDT/PST; timestamps are then compared as naive times.
        # NOTE(review): assumes the zone is always the fifth token - confirm
        # against real IMG exports.
        fields = timestamp.split()
        if len(fields) == 6:
            del fields[4]
        # Explicit equivalent of "%c" in the default C locale, so parsing
        # does not depend on the locale-specific meaning of "%c".
        dt = datetime.strptime(' '.join(fields), '%a %b %d %H:%M:%S %Y')
        name_time_url.setdefault(child.get('label'), {})[dt] = url

    urls = []
    for versions in name_time_url.values():
        if not versions:
            # Nothing to rank for this label.
            continue
        latest = max(versions)
        full_url = "https://genome.jgi.doe.gov{}".format(versions[latest])
        print(full_url)
        urls.append(full_url)
    return urls
# Script entry point: process the XML file named on the command line.
# ``args`` is the module-level result of ``parser.parse_args()``.
if __name__ == "__main__":
    latest_complete(args.file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment