Airbus5717 · February 8, 2024 11:06 · Feb 8, 2024
diff --git a/main.py b/main.py
@@ -0,0 +1,147 @@
+# source https://gist.github.com/glasslion/b2fcad16bc8a9630dbd7a945ab5ebf5e
+import os
+
+def find_files(directory, extension):
+    """
+    Collect every file below a directory whose name ends with a given suffix.
+
+    Args:
+    - directory (str): Root of the tree to walk; subdirectories are included.
+    - extension (str): Suffix a file name must end with (e.g., '.txt', '.jpg').
+
+    Returns:
+    - list: Paths of the matching files, each formed by joining the directory
+      that contains the file with the file name, in os.walk order.
+    """
+    return [
+        os.path.join(folder, name)
+        for folder, _subdirs, names in os.walk(directory)
+        for name in names
+        if name.endswith(extension)
+    ]
+
+# Module-level setup, executed when the file is loaded: scan the current
+# directory tree for .vtt files. main() iterates over found_files.
+directory_path = './'
+file_extension = '.vtt'
+found_files = find_files(directory_path, file_extension)
+print("Found files with extension '{}':".format(file_extension))
+# Printing of the individual paths is currently disabled:
+# for file_path in found_files:
+#     print(file_path)
+
+
+
+"""
+Convert YouTube subtitles(vtt) to human readable text.
+
+Download only subtitles from YouTube with youtube-dl:
+youtube-dl  --skip-download --convert-subs vtt <video_url>
+
+Note that default subtitle format provided by YouTube is ass, which is hard
+to process with simple regex. Luckily youtube-dl can convert ass to vtt, which
+is easier to process.
+
+To convert all vtt files inside a directory:
+find . -name "*.vtt" -exec python vtt2text.py {} \;
+"""
+
+import sys
+import re
+
+
+def remove_tags(text):
+    """
+    Strip vtt markup from the text and shorten cue timing lines.
+
+    Closing and opening <c> tags (with an optional .color class) and inline
+    <HH:MM:SS.mmm> timestamps are deleted. Cue timing lines that end with
+    "align:start position:0%" are reduced to the HH:MM part of their start
+    time, and whitespace that fills whole lines is removed.
+    """
+    markup_patterns = (
+        r'</c>',
+        r'<c(\.color\w+)?>',
+        r'<\d{2}:\d{2}:\d{2}\.\d{3}>',
+    )
+    cleaned = text
+    # Patterns are applied one after another, in this order.
+    for pattern in markup_patterns:
+        cleaned = re.sub(pattern, '', cleaned)
+
+    # Keep only the HH:MM prefix of the cue start time.
+    cue_line = r'(\d{2}:\d{2}):\d{2}\.\d{3} --> .* align:start position:0%'
+    cleaned = re.sub(cue_line, r'\g<1>', cleaned)
+
+    # Empty out lines that hold nothing but whitespace.
+    return re.sub(r'^\s+$', '', cleaned, flags=re.MULTILINE)
+
+def remove_header(lines):
+    """
+    Drop the vtt file header from a list of lines.
+
+    Everything up to and including the header marker is discarded. The
+    markers are tried in order ('##', then 'Language: en'); when both are
+    present, the position of the one tried last is used. When neither is
+    present, all lines are returned.
+    """
+    cut = 0
+    for marker in ('##', 'Language: en',):
+        try:
+            cut = lines.index(marker) + 1
+        except ValueError:
+            # Marker absent: keep the cut point found so far.
+            continue
+    return lines[cut:]
+
+
+def merge_duplicates(lines):
+    """
+    Remove duplicated subtitles. Duplicates are always adjacent.
+
+    Timestamp lines (exactly "dd:dd") and caption lines are tracked
+    separately: a line is emitted only when it differs from the most
+    recently emitted line of the same kind. Empty lines are dropped.
+
+    Args:
+    - lines (iterable of str): Subtitle lines without line terminators.
+
+    Yields:
+    - str: The lines that survive de-duplication, in input order.
+    """
+    last_timestamp = ''
+    last_cap = ''
+    for line in lines:
+        if line == "":
+            continue
+        # Raw string: '\d' in a plain literal is an invalid escape sequence
+        # and triggers a SyntaxWarning on recent Python versions.
+        if re.match(r'^\d{2}:\d{2}$', line):
+            if line != last_timestamp:
+                yield line
+                last_timestamp = line
+        elif line != last_cap:
+            yield line
+            last_cap = line
+
+
+def merge_short_lines(lines):
+    """
+    Join consecutive short caption lines into longer lines.
+
+    Caption lines are accumulated, separated by single spaces, while the
+    combined length stays below 80 characters. A blank line or a timestamp
+    line (exactly "dd:dd") is emitted on its own, prefixed with a newline
+    so it stands apart in the output.
+
+    Pending caption text is flushed before a blank/timestamp line is
+    emitted, so captions stay ahead of the timestamp that follows them.
+    No empty string is emitted for an empty buffer.
+
+    Args:
+    - lines (iterable of str): Subtitle lines without line terminators.
+
+    Yields:
+    - str: Merged caption lines and newline-prefixed separator lines.
+    """
+    buffer = ''
+    for line in lines:
+        if line == "" or re.match(r'^\d{2}:\d{2}$', line):
+            # Flush first; otherwise the buffered captions would only be
+            # emitted after this (or an even later) timestamp.
+            if buffer:
+                yield buffer.strip()
+                buffer = ''
+            yield '\n' + line
+            continue
+
+        if not buffer:
+            # Start a new buffer without a leading separator.
+            buffer = line
+        elif len(line + buffer) < 80:
+            buffer += ' ' + line
+        else:
+            yield buffer.strip()
+            buffer = line
+    if buffer:
+        yield buffer.strip()
+
+
+
+def main():
+    """
+    Convert every file listed in the module-level found_files to plain text.
+
+    For each path: read the file as UTF-8, strip markup and the header,
+    drop duplicated lines, merge short lines, and write the result next to
+    the source with the ".vtt" suffix replaced by ".txt". The number of
+    files and each input/output path are printed as progress output.
+    """
+    print(len(found_files))
+    for file in found_files:
+        print("file path: "+ file)
+        with open(file, 'r', encoding='utf-8') as f:
+            text = f.read()
+        # Escape the dot so only a literal ".vtt" suffix is replaced.
+        txt_name = re.sub(r'\.vtt$', '.txt', file)
+        print("text file path: "+ txt_name)
+
+        lines = remove_header(remove_tags(text).splitlines())
+        # Materialize before opening the output so a processing error does
+        # not leave a truncated file behind.
+        lines = list(merge_short_lines(merge_duplicates(lines)))
+
+        # Write with the same encoding used for reading; the platform
+        # default may be unable to encode non-ASCII captions.
+        with open(txt_name, 'w', encoding='utf-8') as f:
+            f.writelines(line + "\n" for line in lines)
+
+
+
+# Run the conversion only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
No results found