Skip to content

Instantly share code, notes, and snippets.

@Airbus5717
Created February 8, 2024 11:06
Show Gist options
  • Select an option

  • Save Airbus5717/ec899febf2c151fea2fb3ccce2f2bcc0 to your computer and use it in GitHub Desktop.

Select an option

Save Airbus5717/ec899febf2c151fea2fb3ccce2f2bcc0 to your computer and use it in GitHub Desktop.

Revisions

  1. Airbus5717 created this gist Feb 8, 2024.
    147 changes: 147 additions & 0 deletions main.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,147 @@
    # source https://gist.github.com/glasslion/b2fcad16bc8a9630dbd7a945ab5ebf5e
    import os

    def find_files(directory, extension):
        """
        Collect the paths of all files below ``directory`` whose names end with
        ``extension``.

        Args:
        - directory (str): Root directory of the recursive search.
        - extension (str): Filename suffix to match (e.g., '.txt', '.jpg', etc.).

        Returns:
        - list: Paths (root joined with filename) in ``os.walk`` order.
        """
        return [
            os.path.join(folder, name)
            for folder, _subdirs, names in os.walk(directory)
            for name in names
            if name.endswith(extension)
        ]

    # Gather every '.vtt' file below the current directory when the module is
    # loaded; main() reads the resulting module-level ``found_files`` list.
    directory_path = './'
    file_extension = '.vtt'
    found_files = find_files(directory=directory_path, extension=file_extension)
    print("Found files with extension '{}':".format(file_extension))
    # Uncomment to list each match individually:
    # for file_path in found_files:
    #     print(file_path)



    """
    Convert YouTube subtitles (vtt) to human-readable text.
    Download only subtitles from YouTube with youtube-dl:
    youtube-dl --skip-download --convert-subs vtt <video_url>
    Note that default subtitle format provided by YouTube is ass, which is hard
    to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
    is easier to process.
    To convert all vtt files inside a directory:
    find . -name "*.vtt" -exec python vtt2text.py {} \;
    """

    import sys
    import re


    def remove_tags(text):
        """
        Strip vtt inline markup from ``text`` and shorten cue timing lines.

        Steps, applied in order:
        1. delete ``</c>``, ``<c>`` / ``<c.colorXXXX>`` and inline
           ``<HH:MM:SS.mmm>`` tags;
        2. replace each ``... --> ... align:start position:0%`` timing line
           with the first two fields of its start time;
        3. empty out lines that contain only whitespace.
        """
        # The tag patterns are applied one after another (not as a single
        # alternation) so the result of one removal is visible to the next.
        for markup in (
            r'</c>',
            r'<c(\.color\w+)?>',
            r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
        ):
            text = re.compile(markup).sub('', text)

        # Keep only the leading two fields of the cue start time.
        cue_line = re.compile(
            r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%'
        )
        text = cue_line.sub(r'\g<1>', text)

        whitespace_only = re.compile(r'^\s+$', re.MULTILINE)
        return whitespace_only.sub('', text)

    def remove_header(lines):
        """
        Remove vtt file header

        Args:
        - lines (list): The file content split into lines.

        Returns:
        - list: ``lines`` without the header. Everything up to and including
          the header's last line is dropped; the blank separator line that
          follows the header is kept.

        The explicit marks ``##`` and ``Language: en`` are honoured first. When
        both are present, the position of ``Language: en`` wins because it is
        checked last. If neither mark is found and the first line starts with
        ``WEBVTT``, the header block (all lines up to the first blank line) is
        removed, so captions in other languages lose their header too.
        """
        pos = -1
        for mark in ('##', 'Language: en',):
            if mark in lines:
                pos = lines.index(mark)

        # Fallback for headers without a known mark (e.g. "Language: fr").
        # A leading byte-order mark is ignored when looking for "WEBVTT".
        if pos == -1 and lines and lines[0].lstrip('\ufeff').startswith('WEBVTT'):
            for idx, line in enumerate(lines):
                # A blank line ends the header; a cue timing line ("-->")
                # means the header is already over, so never consume it.
                if not line.strip() or '-->' in line:
                    break
                pos = idx

        return lines[pos + 1:]


    def merge_duplicates(lines):
        """
        Remove duplicated subtitles. Duplicates are always adjacent.

        Args:
        - lines (iterable of str): Cleaned subtitle lines; a line of the form
          ``NN:NN`` is treated as a timestamp, anything else as caption text.

        Yields:
        - str: Each timestamp that differs from the previously yielded
          timestamp, and each caption that differs from the previously yielded
          caption. Empty lines are dropped.
        """
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence. Compiled once instead of on every iteration.
        timestamp_re = re.compile(r'^\d{2}:\d{2}$')
        last_timestamp = ''
        last_cap = ''
        for line in lines:
            if line == "":
                continue
            if timestamp_re.match(line):
                if line != last_timestamp:
                    yield line
                    last_timestamp = line
            elif line != last_cap:
                yield line
                last_cap = line


    def merge_short_lines(lines):
        """
        Join consecutive short caption lines into longer output lines.

        Args:
        - lines (iterable of str): Subtitle lines; empty lines and ``NN:NN``
          timestamps act as separators, everything else is caption text.

        Yields:
        - str: Separators prefixed with a newline (``'\\n' + line``), and
          caption text joined with single spaces while the running length
          stays under 80 characters.

        Pending caption text is flushed before a separator is emitted, so text
        stays under the timestamp it was spoken at instead of appearing after
        a later one. Empty buffers are never yielded, and the final buffer is
        stripped like every other emitted line.
        """
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence. Compiled once instead of on every iteration.
        timestamp_re = re.compile(r'^\d{2}:\d{2}$')
        buffer = ''
        for line in lines:
            if line == "" or timestamp_re.match(line):
                # Flush text collected so far before starting a new section.
                pending = buffer.strip()
                if pending:
                    yield pending
                buffer = ''
                yield '\n' + line
                continue

            if len(line + buffer) < 80:
                buffer += ' ' + line
            else:
                pending = buffer.strip()
                if pending:
                    yield pending
                buffer = line

        pending = buffer.strip()
        if pending:
            yield pending



    def main():
        """
        Convert every file in the module-level ``found_files`` list to text.

        For each path, the file is read as UTF-8 and passed through
        ``remove_tags``, ``remove_header``, ``merge_duplicates`` and
        ``merge_short_lines``. The result is written next to the input with
        the ``.vtt`` suffix replaced by ``.txt``. Progress is printed to stdout.
        """
        print(len(found_files))
        for file in found_files:
            print("file path: "+ file)
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
            # Escaped dot: replace only a literal ".vtt" suffix.
            txt_name = re.sub(r'\.vtt$', '.txt', file)
            print("text file path: "+ txt_name)

            text = remove_tags(text)
            lines = remove_header(text.splitlines())
            # Materialise the generator pipeline before opening the output so
            # a processing error cannot leave a half-written file behind.
            lines = list(merge_short_lines(merge_duplicates(lines)))

            # Write with the same encoding the input was read with; the
            # platform default may be unable to encode the caption text.
            with open(txt_name, 'w', encoding='utf-8') as f:
                f.writelines(line + "\n" for line in lines)



    if __name__ == "__main__":
        # Run the conversion only when executed as a script.
        main()