#!/usr/bin/env python3

"""
SPDX-License-Identifier: MIT https://opensource.org/licenses/MIT
Copyright © 2021 pukkandan.ytdlp@gmail.com

* Input file is an info.json (with comments) that yt-dlp (https://github.com/yt-dlp/yt-dlp) wrote
* Change FIELDS according to your needs

The output file will be in the format:
[{
  'text': 'comment 1',
  ...
  'replies': [{
    'text': 'reply 1',
    ...
    'replies': [...],
  }, ...],
}, ...]

Comments whose parent is not present in the input file (or that have no
'parent' field at all) are written as top-level comments.
"""

import json
import argparse
from datetime import datetime, timezone


def get_fields(dct):
    """Yield (name, value) pairs for every FIELDS entry that applies to `dct`.

    Each extractor in FIELDS is called as fn(dct, name); entries for which it
    returns None are omitted from the output.
    """
    for name, fn in FIELDS.items():
        val = fn(dct, name)
        if val is not None:
            yield name, val


def filter_func(comments):
    """Return a new list holding only the FIELDS of each comment in `comments`."""
    return [dict(get_fields(c)) for c in comments]


def format_timestamp(dct, name):
    """Return dct[name] (a Unix timestamp) as a UTC 'YYYY/MM/DD' string.

    Returns None when the field is missing or null, so it is left out of the
    output. The check is against None rather than truthiness so that a
    timestamp of 0 is still formatted instead of leaking through as a raw 0.
    """
    timestamp = dct.get(name)
    if timestamp is None:
        return None
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y/%m/%d')


def format_replies(dct, name):
    """Return the filtered replies stored under dct[name], or None if there are none."""
    # `or []` also covers an explicit null value, not just a missing key
    return filter_func(dct.get(name) or []) or None


# Maps each output field to an extractor called as fn(comment, field_name).
# An extractor returning None drops the field from that comment.
FIELDS = {
    'text': dict.get,
    'author': dict.get,
    'timestamp': format_timestamp,
    'replies': format_replies,
}


def nest_comments(comments, progress=False):
    """Turn a flat list of comment dicts into a tree of top-level comments.

    Every comment must have an 'id'. A comment whose 'parent' is the id of
    another comment in `comments` is appended to that comment's 'replies'
    list (created on demand, so the input dicts are modified in place).
    Comments whose 'parent' is 'root', missing, or not found in `comments`
    are returned as top-level comments.

    Comments are processed in ascending 'timestamp' order (missing timestamps
    sort first), which is therefore the order within each 'replies' list.
    When several comments share an id, only the last one in that order is kept.

    If `progress` is true, a progress counter is printed to stdout.
    """
    comment_data = {c['id']: c for c in sorted(
        comments, key=lambda c: c.get('timestamp') or 0)}
    # Count the de-duplicated comments so the progress counter reaches its total
    count = len(comment_data)

    nested_comments = []
    for i, c in enumerate(comment_data.values(), 1):
        if progress:
            print(f'Processing comment {i}/{count}', end='\r')
        parent_id = c.get('parent', 'root')
        parent = None if parent_id == 'root' else comment_data.get(parent_id)
        if parent is None:
            # Top-level comment, or a reply whose parent is not in the file:
            # keep it at the top level rather than aborting the whole run
            nested_comments.append(c)
        else:
            parent.setdefault('replies', []).append(c)
    return nested_comments


def main(argv=None):
    """Read an info.json, nest its comments and write them to the output file.

    `argv` is the list of command-line arguments; None means sys.argv[1:].
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input-file', '-i', dest='inputfile', metavar='FILE', required=True,
        help='File to read info_dict from')
    parser.add_argument(
        '--output-file', '-o', dest='outputfile', metavar='FILE', required=True,
        help='File to write comments to')
    args = parser.parse_args(argv)

    print('Reading file')
    with open(args.inputfile, encoding='utf-8') as f:
        info_dict = json.load(f)

    # An info.json written without comments has no (or a null) 'comments' entry
    comments = info_dict.get('comments') or []
    del info_dict  # only the comments are needed from here on

    nested_comments = nest_comments(comments, progress=True)

    print('\nWriting file')
    with open(args.outputfile, 'w', encoding='utf-8') as f:
        json.dump(filter_func(nested_comments), f, indent=4, ensure_ascii=False)

    print('Done')


if __name__ == '__main__':
    main()