
@dgobbi
Last active August 7, 2020 17:29

Revisions

  1. dgobbi revised this gist Aug 7, 2020. 1 changed file with 3 additions and 0 deletions.
    3 changes: 3 additions & 0 deletions readdump.py
    @@ -257,6 +257,9 @@ def read_dicomdump(lines):
                 instances[instance_tag].append(dicttype([("vr", instance_vr), ("Value", sequence)]))
                 continue

    +        # none of the regular expressions matched!
    +        sys.stderr.write("Unrecognized syntax:\n" + line + "\n")
    +
         # at end of dump, pop back to root
         while len(stack) > 0:
             sequence, dataset = stack[-1]
  2. dgobbi revised this gist Aug 7, 2020. 1 changed file with 18 additions and 0 deletions.
    18 changes: 18 additions & 0 deletions readdump.py
    @@ -49,6 +49,24 @@ def build_value(vr, vl, value):
             value_list = []
             for ptr in value.split('\\'):
                 value_list.append(ptr[1:5]+ptr[6:10])
    +    # for integers, convert to int
    +    elif vr in ['IS', 'SS', 'US', 'SL', 'UL', 'SV', 'UV']:
    +        value_list = []
    +        for v in value.split('\\'):
    +            try:
    +                value_list.append(int(v))
    +            except ValueError:
    +                # TODO: warn
    +                pass
    +    # for decimal, convert to float (inexact)
    +    elif vr in ['DS', 'FL', 'FD']:
    +        value_list = []
    +        for v in value.split('\\'):
    +            try:
    +                value_list.append(float(v))
    +            except ValueError:
    +                # TODO: warn
    +                pass
         # for PN, handle "Alphabetic", "Ideographic", "Phonetic" groups
         elif vr == 'PN':
             value_list = []
  3. dgobbi revised this gist Aug 7, 2020. 1 changed file with 4 additions and 4 deletions.
    8 changes: 4 additions & 4 deletions readdump.py
    @@ -22,9 +22,9 @@

     # regular expressions for parsing dicomdump output
     re_dataset = re.compile("==== (.*) ====")
    -re_data = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\[(.*)\\] [^(]*\\(([^)]*)\\)")
    +re_data = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\[(.*)\\] (\\{[^}]*\\}|) *\\(([^)]*)\\)")
     re_multi = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(multiple values\\)")
    -re_instance = re.compile("( *) 0*([0-9]*) \\[(.*)\\] \\(([^)]*)\\)")
    +re_instance = re.compile("( *) 0*([0-9]*) \\[(.*)\\] (\\{[^}]*\\}|) *\\(([^)]*)\\)")
     re_instance_sq = re.compile("( *) 0*([0-9]*) \\(([0-9]*) item[s]?([^)]*)\\)")
     re_sequence = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(([0-9]*) item[s]?([^)]*)\\)")
     re_item = re.compile("( *)---- SQ Item 0*([0-9]*) at offset ([0-9]*) ----")
    @@ -196,7 +196,7 @@ def read_dicomdump(lines):
                     continue
                 keyword = groups[4]
                 vr = groups[3]
    -            vl = groups[6]
    +            vl = groups[7]
                 value = groups[5]
                 dataset[tag] = build_value(vr, vl, value)
                 continue
    @@ -227,7 +227,7 @@ def read_dicomdump(lines):
                 if skip_tag(instance_tag):
                     continue
                 value = groups[2]
    -            vl = groups[3]
    +            vl = groups[4]
                 instances[instance_tag].append(build_value(instance_vr, vl, value))
                 continue

  4. dgobbi revised this gist Aug 7, 2020. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions readdump.py
    @@ -22,11 +22,11 @@

     # regular expressions for parsing dicomdump output
     re_dataset = re.compile("==== (.*) ====")
    -re_data = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\[(.*)\\] \\(([0-9]*) bytes\\)")
    +re_data = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\[(.*)\\] [^(]*\\(([^)]*)\\)")
     re_multi = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(multiple values\\)")
    -re_instance = re.compile("( *) 0*([0-9]*) \\[(.*)\\] \\(([0-9]*) bytes\\)")
    -re_instance_sq = re.compile("( *) 0*([0-9]*) \\(([0-9]*) item[s]?(, delimited|)\\)")
    -re_sequence = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(([0-9]*) item[s]?(, delimited|)\\)")
    +re_instance = re.compile("( *) 0*([0-9]*) \\[(.*)\\] \\(([^)]*)\\)")
    +re_instance_sq = re.compile("( *) 0*([0-9]*) \\(([0-9]*) item[s]?([^)]*)\\)")
    +re_sequence = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(([0-9]*) item[s]?([^)]*)\\)")
     re_item = re.compile("( *)---- SQ Item 0*([0-9]*) at offset ([0-9]*) ----")
     re_mismatch = re.compile("( *)VR mismatch! (..|) != (..) (.*)")
     re_indent = re.compile("( *).*")
    @@ -36,7 +36,7 @@ def build_value(vr, vl, value):
         """Convert an attribute value from a dicomdump file to DICOM's json format.
         """
         # if VL is zero, then no value is given
    -    if vl == '0':
    +    if vl == '0 bytes':
             return dicttype([("vr", vr)])
         # if bulk data, use empty BulkDataURI (TODO: InlineBinary)
         if vr in ['OB', 'OD', 'OF', 'OL', 'OV', 'OW']:
  5. dgobbi revised this gist Aug 7, 2020. 1 changed file with 23 additions and 11 deletions.
    34 changes: 23 additions & 11 deletions readdump.py
    @@ -140,17 +140,23 @@ def read_dicomdump(lines):
                 continue

             # check the indentation, which indicates depth
    -        if not (re_instance.match(line) or re_instance_sq.match(line)):
    -            m = re_indent.match(line)
    -            indent = len(m.group(1))/2
    -            # check for extra indentation that isn't in a sequence
    -            if indent > len(stack) and not re_instance.match(line):
    -                sys.stderr.write("Improper indentation:\n" + line + "\n")
    -                continue
    -            # check for decreasing indentation (marks end of a block)
    -            while len(stack) > indent:
    -                sequence, dataset = stack[-1]
    -                stack.pop()
    +        if re_instance.match(line):
    +            # always at the root, depth of zero
    +            depth = 0
    +        elif re_instance_sq.match(line):
    +            # always within a sequence at the root, hence depth is 1
    +            depth = 1
    +        else:
    +            # the depth is given by the indentation
    +            depth = len(re_indent.match(line).group(1))/2
    +        # check for extra indentation that isn't in a sequence
    +        if depth > len(stack):
    +            sys.stderr.write("Improper indentation:\n" + line + "\n")
    +            continue
    +        # check for decreasing indentation (marks end of a block)
    +        while len(stack) > depth:
    +            sequence, dataset = stack[-1]
    +            stack.pop()

             # new dataset (indicated by "====" in the file)
             m = re_dataset.match(line)
    @@ -233,8 +239,14 @@ def read_dicomdump(lines):
                 instances[instance_tag].append(dicttype([("vr", instance_vr), ("Value", sequence)]))
                 continue

    +    # at end of dump, pop back to root
    +    while len(stack) > 0:
    +        sequence, dataset = stack[-1]
    +        stack.pop()
    +
         # change out "multiple value" data elements into a series of datasets
         handle_instances(sequence, instances)

         return sequence


  6. dgobbi revised this gist Aug 7, 2020. 1 changed file with 0 additions and 2 deletions.
    2 changes: 0 additions & 2 deletions readdump.py
    @@ -223,8 +223,6 @@ def read_dicomdump(lines):
                 value = groups[2]
                 vl = groups[3]
                 instances[instance_tag].append(build_value(instance_vr, vl, value))
    -            if len(instances[instance_tag]) == 16:
    -                print("XXX", instance_tag, line)
                 continue

             # one instance of a "multiple value" attribute that is SQ
  7. dgobbi revised this gist Aug 7, 2020. 1 changed file with 14 additions and 14 deletions.
    28 changes: 14 additions & 14 deletions readdump.py
    @@ -40,27 +40,27 @@ def build_value(vr, vl, value):
             return dicttype([("vr", vr)])
         # if bulk data, use empty BulkDataURI (TODO: InlineBinary)
         if vr in ['OB', 'OD', 'OF', 'OL', 'OV', 'OW']:
    -        return dicttype([("vr", vr), ("BulkDataURI", "")])
    +        return dicttype([("vr", vr), ("BulkDataURI", "")])
         # these text VRs are always single-valued
         if vr in ['LT', 'ST', 'UT']:
             value_list = [ value ]
         # for AT, convert dicomdump syntax to DICOM json syntax
         elif vr == 'AT':
    -        value_list = []
    -        for ptr in value.split('\\'):
    -            value_list.append(ptr[1:5]+ptr[6:10])
    +        value_list = []
    +        for ptr in value.split('\\'):
    +            value_list.append(ptr[1:5]+ptr[6:10])
         # for PN, handle "Alphabetic", "Ideographic", "Phonetic" groups
         elif vr == 'PN':
    -        value_list = []
    -        for name in value.split('\\'):
    -            name_attrs = {}
    -            parts = name.split('=')
    -            name_attrs['Alphabetic'] = parts[0]
    -            if len(parts) > 1:
    -                name_attrs['Ideographic'] = parts[1]
    -            if len(parts) > 2:
    -                name_attrs['Phonetic'] = parts[2]
    -            value_list.append(name_attrs)
    +        value_list = []
    +        for name in value.split('\\'):
    +            name_attrs = {}
    +            parts = name.split('=')
    +            name_attrs['Alphabetic'] = parts[0]
    +            if len(parts) > 1:
    +                name_attrs['Ideographic'] = parts[1]
    +            if len(parts) > 2:
    +                name_attrs['Phonetic'] = parts[2]
    +            value_list.append(name_attrs)
         # for all other VRs
         else:
             value_list = value.split('\\')
  8. dgobbi created this gist Aug 7, 2020.
    266 changes: 266 additions & 0 deletions readdump.py
    @@ -0,0 +1,266 @@
    #! /usr/bin/env python

    """
    This program reads the output from "dicomdump" and converts it to json.
    The latest version of this code can be found at gist.github.com/dgobbi
    Note that this code is incomplete, incorrect, and may destroy your data.
    It comes with absolutely no warranties. Use at your own risk.
    """

    import argparse
    import sys
    import re
    import json
    import collections


    # the python dict type we want to use is "OrderedDict"
    dicttype = collections.OrderedDict


    # regular expressions for parsing dicomdump output
    re_dataset = re.compile("==== (.*) ====")
    re_data = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\[(.*)\\] \\(([0-9]*) bytes\\)")
    re_multi = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(multiple values\\)")
    re_instance = re.compile("( *) 0*([0-9]*) \\[(.*)\\] \\(([0-9]*) bytes\\)")
    re_instance_sq = re.compile("( *) 0*([0-9]*) \\(([0-9]*) item[s]?(, delimited|)\\)")
    re_sequence = re.compile("( *)\\((....),(....)\\) (..|) \"([^\"]*)\" : \\(([0-9]*) item[s]?(, delimited|)\\)")
    re_item = re.compile("( *)---- SQ Item 0*([0-9]*) at offset ([0-9]*) ----")
    re_mismatch = re.compile("( *)VR mismatch! (..|) != (..) (.*)")
    re_indent = re.compile("( *).*")


    def build_value(vr, vl, value):
        """Convert an attribute value from a dicomdump file to DICOM's json format.
        """
        # if VL is zero, then no value is given
        if vl == '0':
            return dicttype([("vr", vr)])
        # if bulk data, use empty BulkDataURI (TODO: InlineBinary)
        if vr in ['OB', 'OD', 'OF', 'OL', 'OV', 'OW']:
            return dicttype([("vr", vr), ("BulkDataURI", "")])
        # these text VRs are always single-valued
        if vr in ['LT', 'ST', 'UT']:
            value_list = [ value ]
        # for AT, convert dicomdump syntax to DICOM json syntax
        elif vr == 'AT':
            value_list = []
            for ptr in value.split('\\'):
                value_list.append(ptr[1:5]+ptr[6:10])
        # for PN, handle "Alphabetic", "Ideographic", "Phonetic" groups
        elif vr == 'PN':
            value_list = []
            for name in value.split('\\'):
                name_attrs = {}
                parts = name.split('=')
                name_attrs['Alphabetic'] = parts[0]
                if len(parts) > 1:
                    name_attrs['Ideographic'] = parts[1]
                if len(parts) > 2:
                    name_attrs['Phonetic'] = parts[2]
                value_list.append(name_attrs)
        # for all other VRs
        else:
            value_list = value.split('\\')
        # replace any empty values with null
        for i in range(len(value_list)):
            if value_list[i] == "":
                value_list[i] = None
        return dicttype([("vr", vr), ("Value", value_list)])


    def handle_instances(sequence, instances):
        """Handle "multiple values" by recreating multiple datasets.
        """
        if instances:
            # make copies of last sequence
            last_dataset = sequence[-1]
            sequence.pop()
            n = 0
            for tag in instances:
                n = max(n, len(instances[tag]))
            for i in range(n):
                dataset = dicttype(last_dataset)
                for tag in instances:
                    try:
                        dataset[tag] = instances[tag][i]
                    except IndexError:
                        # TODO: warning
                        pass
                sequence.append(dataset)
            instances.clear()


    def skip_tag(tag):
        """Returns True for tags that should be skipped.
        """
        # group length tags
        if tag[-4:] == '0000':
            return True
        # tags in group 0002, 0004, etc
        elif tag[0:4] < '0008':
            return True
        return False


    def read_dicomdump(lines):
        """Parse a dicomdump file that has been read with "readlines".
        """
        # a sequence of datasets will be read (usually just one)
        sequence = []
        dataset = None

        # a stack is needed for handling the depth of the tree
        stack = []

        # for dicomdump's "multiple values" across a series
        instance_tag = None
        instance_vr = None
        instances = {}

        # for dealing with a bug in dicomdump for series where
        # the first dataset is missing elements
        vr_mismatch = ("", "")

        # go through the dump line-by-line
        for line in lines:
            line = line.rstrip()

            # empty line: ignore
            if len(line) == 0:
                continue

            # mismatched VR warning: ignore
            m = re_mismatch.match(line)
            if m:
                groups = m.groups()
                vr_mismatch = (groups[1], groups[2])
                continue

            # check the indentation, which indicates depth
            if not (re_instance.match(line) or re_instance_sq.match(line)):
                m = re_indent.match(line)
                indent = len(m.group(1))/2
                # check for extra indentation that isn't in a sequence
                if indent > len(stack) and not re_instance.match(line):
                    sys.stderr.write("Improper indentation:\n" + line + "\n")
                    continue
                # check for decreasing indentation (marks end of a block)
                while len(stack) > indent:
                    sequence, dataset = stack[-1]
                    stack.pop()

            # new dataset (indicated by "====" in the file)
            m = re_dataset.match(line)
            if m:
                handle_instances(sequence, instances)
                # start a fresh dataset
                dataset = dicttype()
                sequence.append(dataset)
                continue

            # new item (indicated by "----" in the file)
            m = re_item.match(line)
            if m:
                dataset = dicttype()
                sequence.append(dataset)
                continue

            # sequence value (increase depth)
            m = re_sequence.match(line)
            if m:
                groups = m.groups()
                tag = "".join(groups[1:3]).upper()
                vr = groups[3]
                stack.append((sequence, dataset))
                sequence = []
                dataset[tag] = dicttype([("vr", vr), ("Value", sequence)])
                dataset = None
                continue

            # any other value
            m = re_data.match(line)
            if m:
                groups = m.groups()
                tag = "".join(groups[1:3]).upper()
                # skip group length tags
                if skip_tag(tag):
                    continue
                keyword = groups[4]
                vr = groups[3]
                vl = groups[6]
                value = groups[5]
                dataset[tag] = build_value(vr, vl, value)
                continue

            # ----
            # special code for the dicomdump "multiple values" lines
            m = re_multi.match(line)
            if m:
                groups = m.groups()
                instance_tag = "".join(groups[1:3]).upper()
                instance_vr = groups[3]
                if instance_vr == "" and vr_mismatch[0] == "":
                    instance_vr = vr_mismatch[1]
                # skip group length tags
                if skip_tag(instance_tag):
                    continue
                dataset[instance_tag] = dicttype([("vr", instance_vr)])
                instances[instance_tag] = []
                if instance_vr == 'SQ':
                    stack.append((sequence, dataset))
                continue

            # one instance of a "multiple value" attribute
            m = re_instance.match(line)
            if m:
                groups = m.groups()
                # skip group length tags
                if skip_tag(instance_tag):
                    continue
                value = groups[2]
                vl = groups[3]
                instances[instance_tag].append(build_value(instance_vr, vl, value))
                if len(instances[instance_tag]) == 16:
                    print("XXX", instance_tag, line)
                continue

            # one instance of a "multiple value" attribute that is SQ
            m = re_instance_sq.match(line)
            if m:
                sequence = []
                dataset = None
                instances[instance_tag].append(dicttype([("vr", instance_vr), ("Value", sequence)]))
                continue

        # change out "multiple value" data elements into a series of datasets
        handle_instances(sequence, instances)
        return sequence


    def main():
        parser = argparse.ArgumentParser(description="Read dicomdump output.")
        parser.add_argument('input', help="Input file (DICOM).")
        parser.add_argument('-o', '--output', required=False,
                            help="Output file (json).")
        args = parser.parse_args()

        with open(args.input) as f:
            tree = read_dicomdump(f.readlines())

        json_opts = {
            "indent" : 2,
            "separators" : (",", " : "),
        }

        if args.output:
            with open(args.output, 'w') as f:
                json.dump(tree, f, **json_opts)
        else:
            json.dump(tree, sys.stdout, **json_opts)


    if __name__ == '__main__':
        main()
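
For quick testing, here is a minimal usage sketch that is not part of the gist. It assumes the gist file is saved as readdump.py on the import path, and the sample lines are hypothetical examples of the (gggg,eeee) VR "Keyword" : [value] (N bytes) syntax that re_data expects; real dicomdump output may differ in its details.

    # Minimal usage sketch; the sample dump lines below are hypothetical
    # and stand in for actual dicomdump output.
    import json
    from readdump import read_dicomdump  # assumes the gist is saved as readdump.py

    sample_dump = [
        '==== example.dcm ====',
        '(0008,0060) CS "Modality" : [MR] (2 bytes)',
        '(0010,0010) PN "PatientName" : [DOE^JOHN] (8 bytes)',
    ]

    # parse the dump and print the resulting tree as DICOM-style json
    tree = read_dicomdump(sample_dump)
    print(json.dumps(tree, indent=2))

With input like this, read_dicomdump should return a one-element list whose dataset contains the 00080060 and 00100010 entries, each as a dict with "vr" and "Value" keys.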