#! /usr/bin/env python
"""
This program reads the output from "dicomdump" and converts it to json.
The latest version of this code can be found at gist.github.com/dgobbi

Note that this code is incomplete, incorrect, and may destroy your data.
It comes with absolutely no warranties.  Use at your own risk.
"""

import argparse
import sys
import re
import json
import collections

# the python dict type we want to use is "OrderedDict"
dicttype = collections.OrderedDict

# regular expressions for parsing dicomdump output
re_dataset = re.compile("==== (.*) ====")
re_data = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\[(.*)\\] (\\{[^}]*\\}|) *\\(([^)]*)\\)")
re_multi = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(multiple values\\)")
re_instance = re.compile("( *) 0*([0-9]*) \\[(.*)\\] (\\{[^}]*\\}|) *\\(([^)]*)\\)")
re_instance_sq = re.compile("( *) 0*([0-9]*) \\(([0-9]*) item[s]?([^)]*)\\)")
re_sequence = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(([0-9]*) item[s]?([^)]*)\\)")
re_item = re.compile("( *)---- SQ Item 0*([0-9]*) at offset ([0-9]*) ----")
re_mismatch = re.compile("( *)VR mismatch! (..|) != (..) (.*)")
re_indent = re.compile("( *).*")


def build_value(vr, vl, value):
    """Convert an attribute value from a dicomdump file to DICOM's json
    format.
    """
    # if VL is zero, then no value is given
    if vl == '0 bytes':
        return dicttype([("vr", vr)])

    # if bulk data, use empty BulkDataURI (TODO: InlineBinary)
    if vr in ['OB', 'OD', 'OF', 'OL', 'OV', 'OW']:
        return dicttype([("vr", vr), ("BulkDataURI", "")])

    # these text VRs are always single-valued
    if vr in ['LT', 'ST', 'UT']:
        value_list = [value]
    # for AT, convert dicomdump syntax to DICOM json syntax
    elif vr == 'AT':
        value_list = []
        for ptr in value.split('\\'):
            value_list.append(ptr[1:5] + ptr[6:10])
    # for integers, convert to int
    elif vr in ['IS', 'SS', 'US', 'SL', 'UL', 'SV', 'UV']:
        value_list = []
        for v in value.split('\\'):
            try:
                value_list.append(int(v))
            except ValueError:
                # TODO: warn
                pass
    # for decimal, convert to float (inexact)
    elif vr in ['DS', 'FL', 'FD']:
        value_list = []
        for v in value.split('\\'):
            try:
                value_list.append(float(v))
            except ValueError:
                # TODO: warn
                pass
    # for PN, handle "Alphabetic", "Ideographic", "Phonetic" groups
    elif vr == 'PN':
        value_list = []
        for name in value.split('\\'):
            name_attrs = {}
            parts = name.split('=')
            name_attrs['Alphabetic'] = parts[0]
            if len(parts) > 1:
                name_attrs['Ideographic'] = parts[1]
            if len(parts) > 2:
                name_attrs['Phonetic'] = parts[2]
            value_list.append(name_attrs)
    # for all other VRs
    else:
        value_list = value.split('\\')

    # replace any empty values with null
    for i in range(len(value_list)):
        if value_list[i] == "":
            value_list[i] = None

    return dicttype([("vr", vr), ("Value", value_list)])


def handle_instances(sequence, instances):
    """Handle "multiple values" by recreating multiple datasets.
    """
    if instances:
        # make copies of the last dataset in the sequence
        last_dataset = sequence[-1]
        sequence.pop()
        n = 0
        for tag in instances:
            n = max(n, len(instances[tag]))
        for i in range(n):
            dataset = dicttype(last_dataset)
            for tag in instances:
                try:
                    dataset[tag] = instances[tag][i]
                except IndexError:
                    # TODO: warning
                    pass
            sequence.append(dataset)
        instances.clear()


def skip_tag(tag):
    """Returns True for tags that should be skipped.
    """
    # group length tags
    if tag[-4:] == '0000':
        return True
    # tags in group 0002, 0004, etc.
    elif tag[0:4] < '0008':
        return True
    return False


def read_dicomdump(lines):
    """Parse a dicomdump file that has been read with "readlines".

    (A sketch of the expected input format is given in the comments at
    the end of this file.)
    """
    # a sequence of datasets will be read (usually just one)
    sequence = []
    dataset = None
    # a stack is needed for handling the depth of the tree
    stack = []
    # for dicomdump's "multiple values" across a series
    instance_tag = None
    instance_vr = None
    instances = {}
    # for dealing with a bug in dicomdump for series where
    # the first dataset is missing elements
    vr_mismatch = ("", "")

    # go through the dump line-by-line
    for line in lines:
        line = line.rstrip()

        # empty line: ignore
        if len(line) == 0:
            continue

        # mismatched VR warning: ignore
        m = re_mismatch.match(line)
        if m:
            groups = m.groups()
            vr_mismatch = (groups[1], groups[2])
            continue

        # check the indentation, which indicates depth
        if re_instance.match(line):
            # always at the root, depth of zero
            depth = 0
        elif re_instance_sq.match(line):
            # always within a sequence at the root, hence depth is 1
            depth = 1
        else:
            # the depth is given by the indentation
            depth = len(re_indent.match(line).group(1)) // 2

        # check for extra indentation that isn't in a sequence
        if depth > len(stack):
            sys.stderr.write("Improper indentation:\n" + line + "\n")
            continue

        # check for decreasing indentation (marks end of a block)
        while len(stack) > depth:
            sequence, dataset = stack[-1]
            stack.pop()

        # new dataset (indicated by "====" in the file)
        m = re_dataset.match(line)
        if m:
            handle_instances(sequence, instances)
            # start a fresh dataset
            dataset = dicttype()
            sequence.append(dataset)
            continue

        # new item (indicated by "----" in the file)
        m = re_item.match(line)
        if m:
            dataset = dicttype()
            sequence.append(dataset)
            continue

        # sequence value (increase depth)
        m = re_sequence.match(line)
        if m:
            groups = m.groups()
            tag = "".join(groups[1:3]).upper()
            vr = groups[3]
            stack.append((sequence, dataset))
            sequence = []
            dataset[tag] = dicttype([("vr", vr), ("Value", sequence)])
            dataset = None
            continue

        # any other value
        m = re_data.match(line)
        if m:
            groups = m.groups()
            tag = "".join(groups[1:3]).upper()
            # skip group length tags
            if skip_tag(tag):
                continue
            keyword = groups[4]
            vr = groups[3]
            vl = groups[7]
            value = groups[5]
            dataset[tag] = build_value(vr, vl, value)
            continue

        # ----
        # special code for the dicomdump "multiple values" lines
        m = re_multi.match(line)
        if m:
            groups = m.groups()
            instance_tag = "".join(groups[1:3]).upper()
            instance_vr = groups[3]
            if instance_vr == "" and vr_mismatch[0] == "":
                instance_vr = vr_mismatch[1]
            # skip group length tags
            if skip_tag(instance_tag):
                continue
            dataset[instance_tag] = dicttype([("vr", instance_vr)])
            instances[instance_tag] = []
            if instance_vr == 'SQ':
                stack.append((sequence, dataset))
            continue

        # one instance of a "multiple value" attribute
        m = re_instance.match(line)
        if m:
            groups = m.groups()
            # skip group length tags
            if skip_tag(instance_tag):
                continue
            value = groups[2]
            vl = groups[4]
            instances[instance_tag].append(build_value(instance_vr, vl, value))
            continue

        # one instance of a "multiple value" attribute that is SQ
        m = re_instance_sq.match(line)
        if m:
            sequence = []
            dataset = None
            instances[instance_tag].append(
                dicttype([("vr", instance_vr), ("Value", sequence)]))
            continue

        # none of the regular expressions matched!
        sys.stderr.write("Unrecognized syntax:\n" + line + "\n")

    # at end of dump, pop back to root
    while len(stack) > 0:
        sequence, dataset = stack[-1]
        stack.pop()

    # change out "multiple value" data elements into a series of datasets
    handle_instances(sequence, instances)

    return sequence


def main():
    parser = argparse.ArgumentParser(description="Read dicomdump output.")
    parser.add_argument('input', help="Input file (dicomdump output).")
    parser.add_argument('-o', '--output', required=False,
                        help="Output file (json).")
    args = parser.parse_args()

    with open(args.input) as f:
        tree = read_dicomdump(f.readlines())

    json_opts = {
        "indent": 2,
        "separators": (",", " : "),
    }

    if args.output:
        with open(args.output, 'w') as f:
            json.dump(tree, f, **json_opts)
    else:
        json.dump(tree, sys.stdout, **json_opts)


if __name__ == '__main__':
    main()
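# ----------------------------------------------------------------------
# Example of the input this parser expects, as a rough sketch inferred
# from the regular expressions above (not copied from real dicomdump
# output; the tags, values, and byte counts shown here are hypothetical):
#
#   ==== file0001.dcm ====
#   (0008,0060) CS "Modality" : [MR] (2 bytes)
#   (0008,1140) SQ "ReferencedImageSequence" : (1 item)
#     ---- SQ Item 0001 at offset 1234 ----
#     (0008,1155) UI "ReferencedSOPInstanceUID" : [1.2.3.4] (8 bytes)
#
# Nesting depth is conveyed by two spaces of indentation per level, and
# a "==== ... ====" line starts a new dataset at the root.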
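# Example command-line usage (the file names, the script name, and the
# exact dicomdump invocation are hypothetical; adjust for your setup):
#
#   dicomdump image0001.dcm > dump.txt
#   python dicomdump_to_json.py dump.txt -o dump.json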