Last active
July 2, 2025 21:54
-
-
Save dlamblin/2e83a001fd921f33e92ac41614471d26 to your computer and use it in GitHub Desktop.
Limited conversion of Terraform HCL .tf files to JSON .json output.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| from argparse import ArgumentParser | |
| from fileinput import input | |
| from itertools import groupby | |
| from json import dumps | |
| from typing import Generator | |
| """ | |
| The intent of this script is to convert the many versions.tf files found | |
| with `find . -name versions.tf` (finds 124 files in my repo) into json, so | |
| that we can use `jq -r '.terraform.required_providers|to_entries[]| | |
| .value.version + " " + .value.source + " " + .key'` to get all the versions | |
| One per line like: | |
| ~> 5.90.0 hashicorp/aws aws | |
| We'll then be able to `sort -v` | |
| If the terraform repo is in your CDPATH, hcl2json.py is in your PATH, | |
| as are column, find, jq, sort & xargs, all put together, it would be: | |
| cd terraform > /dev/null; \ | |
| find . -name versions.tf -print0 |\ | |
| xargs -0 hcl2json.py |\ | |
| jq -r '.terraform.required_providers|to_entries[]| | |
| .value.version + " " + .value.source + " " + .key' |\ | |
| sort -Vrk4 -k2 | sort -uk4 | column -t; cd - > /dev/null | |
| To get the highest version number for each required provider named. | |
| E.G. the above produces output like: | |
| ~> 2.7.1 hashicorp/archive archive | |
| ~> 5.99.1 hashicorp/aws aws | |
| = 4.20.0 cloudflare/cloudflare cloudflare | |
| ~> 3.65.0 datadog/datadog datadog | |
| ~> 3.4.3 hashicorp/dns dns | |
| ~> 6.4.0 integrations/github github | |
| ~> 2.16.1 hashicorp/helm helm | |
| ~> 2.37.0 hashicorp/kubernetes kubernetes | |
| ~> 2.4.0 hashicorp/local local | |
| ~> 3.2.4 hashicorp/null null | |
| ~> 1.25.0 cyrilgdn/postgresql postgresql | |
| ~> 3.7.1 hashicorp/random random | |
| ~> 0.13.1 hashicorp/time time | |
| ~> 4.1.0 hashicorp/tls tls | |
| ~> 4.8.0 hashicorp/vault vault | |
| Known to "work" with Python 3.13.2 | |
| There's no error handling on the parsing, test on smaller inputs first. | |
| """ | |
def main() -> None:
    """
    Parse CLI options and convert the named HCL files (or stdin) to JSON.

    Only handles valid, simple HCL such as a typical versions.tf;
    there is no error handling on the parse.
    """
    parser = ArgumentParser(
        prog='hcl2json',
        description='Loosely converts some Hashicorp Terraform HCL to JSON '
                    'without error handling',
        epilog='copyright 2025 Daniel Lamblin MIT Licensed',
    )
    parser.add_argument('--version', action='version', version='%(prog)s 1.0')
    parser.add_argument('-i', '--indent', nargs='?', type=int,
                        const=4, default=None,
                        help='indent INDENT spaces. Default 4, or one-line')
    parser.add_argument('-s', '--sort-keys', action='store_true',
                        help='sort the keys in the JSON objects')
    # parse_known_args: every argument that is not a recognized option is
    # collected into file_names and handed to fileinput as an input file.
    args, file_names = parser.parse_known_args()
    # vars(args) is the documented way to view a Namespace as a dict
    # (the original used args.__dict__ and kept a commented-out call to
    # the legacy old_process_lines_by_filename, removed here).
    convert_input_with(file_names, **vars(args))
def convert_input_with(
    file_names: list[str],
    indent: int | None = None,
    sort_keys: bool = False,
) -> None:
    """
    Convert each named file (or stdin when the list is empty) from HCL to
    JSON and print one JSON document per input file.

    :param file_names: paths handed to fileinput; [] means read stdin
    :param indent: JSON indentation width, or None for one-line output
    :param sort_keys: sort object keys in the emitted JSON
    """
    line_stream = input_lines_by_filename(file_names)
    # Lines arrive tagged with their filename; group consecutive lines of
    # the same file back into one document.
    for _, tagged_lines in groupby(line_stream, key=lambda pair: pair[0]):
        document = ''.join(text for _, text in tagged_lines)
        rendered = dumps(
            obj=process(document),
            indent=indent,
            sort_keys=sort_keys,
        )
        print(rendered)
def input_lines_by_filename(
    file_names: list[str],
) -> Generator[tuple[str, str]]:
    """
    Yield (filename, line) pairs for every line across the given files.

    An empty file_names list makes fileinput fall back to stdin.
    """
    with input(file_names) as stream:
        # Tag each line with the file it came from so callers can group
        # lines back into per-file documents.
        yield from ((stream.filename(), text) for text in stream)
def process(content: str) -> dict:
    """
    Convert a string of simple HCL into a nested dict.

    E.G.
    >>> process('''
    ... terraform {
    ...   required_providers {
    ...     aws = {
    ...       source  = "hashicorp/aws"
    ...       version = "~> 5.90.0"
    ...     }
    ...   }
    ...   required_version = "~> 1.4.7"
    ... }''')
    {'terraform': {'required_providers': {'aws': {'source': 'hashicorp/aws', 'version': '~> 5.90.0'}}, 'required_version': '~> 1.4.7'}}
    """
    tokens = tokenize(content)
    # Renamed the result variable: the original bound it to a local named
    # `dict`, shadowing the builtin.
    result, _ = build_from(tokens)
    return result
def tokenize(content: str) -> list[str]:
    """
    Split HCL text into a flat list of tokens.

    The text is first split on double quotes, so every odd-numbered
    segment is the inside of a quoted string and is kept as one token
    (quotes dropped, embedded newlines preserved). Each unquoted segment
    is split on whitespace, with a '\\n' token emitted at every line
    break inside the segment but not after its final line.
    """
    tokens: list[str] = []
    for position, segment in enumerate(content.split('"')):
        if position % 2 == 1:
            # Inside quotes: the whole segment is a single token.
            tokens.append(segment)
            continue
        pieces: list[str] = []
        for line in segment.splitlines():
            pieces.extend(line.split())
            pieces.append('\n')
        # Drop this segment's trailing newline marker.
        tokens.extend(pieces[:-1])
    return tokens
def build_from(tokens: list[str], idx: int = 0) -> tuple[dict, int]:
    """
    Build a nested dict from `tokens`, starting at position `idx`.

    Returns the dict plus the index of the first unconsumed token
    (the token after the matching '}' when called recursively).
    """
    # Renamed from `dict`, which shadowed the builtin in the original.
    result: dict = {}
    key = None
    while idx < len(tokens):
        token = tokens[idx]
        idx += 1
        if token == '{':
            # Nested block: recurse; the callee consumes the matching '}'.
            result[key], idx = build_from(tokens, idx)
            key = None
        elif token == '}':
            return result, idx
        elif token == '=':
            result[key], idx = scan_for_value(tokens, idx)
            key = None
        elif token.startswith('#'):
            # Skip a comment through end-of-line.
            # TODO(lamblin): fix for a quoted token that starts with # ?
            while idx < len(tokens) and tokens[idx] != '\n':
                idx += 1
        # TODO(lamblin): HCL lists ('[', ']', ',') are not handled yet.
        elif token == '\n':
            continue
        else:
            # Multi-word labels (e.g. block type plus name) join with '__'.
            key = token if not key else key + '__' + token
    return result, idx
def scan_for_value(tokens: list[str], idx: int) -> tuple[str | dict, int]:
    """
    Scan tokens for the value after an '=' sign.

    A leading '{' delegates to build_from and yields a nested dict.
    Otherwise tokens are joined with '__' until a newline or an
    unmatched '}' at depth zero ends the value; interior braces are
    encoded as '-<depth>-' markers so nesting survives in the string.
    """
    accumulated = ''
    brace_depth = 0
    while idx < len(tokens):
        current = tokens[idx]
        idx += 1
        if current == '{' and not accumulated:
            # The value itself is an object: build it recursively.
            return build_from(tokens, idx)
        if brace_depth == 0 and current in ('}', '\n'):
            return accumulated, idx
        if current == '{':
            accumulated += f'-{brace_depth}-'
            brace_depth += 1
        elif current == '}':
            brace_depth -= 1
            accumulated += f'-{brace_depth}-'
        elif accumulated:
            accumulated += f'__{current}'
        else:
            accumulated = current
    return accumulated, idx
def old_process_lines_by_filename() -> None:
    """Superseded by groupby(input_lines_by_filename(...)); kept for reference."""
    active_name: str = ''
    buffered: str = ''
    with input() as stream:
        for text in stream:
            if active_name != stream.filename():
                # A new file has started: emit the previous file's JSON.
                if buffered:
                    print(dumps(process(buffered)))
                buffered = ''
                active_name = stream.filename()
            buffered += text
    # Emit the final (or only) file's JSON.
    if buffered:
        print(dumps(process(buffered)))
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment