-
-
Save dw-liedji/c4fc132d4d675d743d5e2f62cf46e5b8 to your computer and use it in GitHub Desktop.
Simple Python Script for Extracting Text from an SRT File
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Creates a formatted, readable .docx file from a directory of Udacity SRT files (assuming a fixed directory structure). | |
| """ | |
| import re, sys, os | |
| from docx import Document | |
| from docx.shared import Inches | |
| # Hardcoded root directory (keep the trailing '/'); the .srt files are expected one level down, inside SUB-directories of this directory | |
| directory = 'Knowledge-Based AI_ Cognitive Systems Subtitles/' | |
def is_time_stamp(l):
    """Return True if *l* starts like an SRT time stamp.

    A line qualifies when its first two characters are numeric and the
    third character is a colon, e.g. ``00:00:01,000 --> 00:00:04,000``.

    The third character is read with a slice (``l[2:3]``) rather than an
    index so that strings shorter than three characters (``""``, ``"7"``,
    ``"12"``) yield False instead of raising IndexError.
    """
    return l[:2].isnumeric() and l[2:3] == ':'
def has_letters(line):
    """Tell whether *line* contains at least one ASCII letter (a-z or A-Z)."""
    return re.search('[a-zA-Z]', line) is not None
def has_no_text(line):
    """Return True when *line* carries no subtitle text worth keeping.

    A line is considered text-free when, after stripping surrounding
    whitespace, it is empty, purely numeric (a cue index), starts like a
    time stamp, or is wholly wrapped in parentheses; or when the line
    contains no ASCII letters at all.
    """
    stripped = line.strip()
    # Blank lines, cue numbers and time-stamp lines are never text.
    if not stripped or stripped.isnumeric() or is_time_stamp(stripped):
        return True
    # A line fully enclosed in parentheses is dropped as well.
    if stripped.startswith('(') and stripped.endswith(')'):
        return True
    # Whatever is left counts as text only if it has at least one letter.
    return not has_letters(line)
def is_lowercase_letter_or_comma(letter):
    """Return True if *letter* is alphabetic and unchanged by lower(), or is a comma."""
    unchanged_by_lower = letter.isalpha() and letter == letter.lower()
    return unchanged_by_lower or letter == ','
def clean_up(lines):
    """
    From: https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/
    Drop every non-text line (the first line is always skipped) and merge
    the remaining text lines into one running string.

    Returns a list that is empty when no text line was found and otherwise
    holds a single combined string.
    """
    text_lines = [raw for raw in lines[1:] if not has_no_text(raw)]
    if not text_lines:
        return []
    combined = text_lines[0]
    for following in text_lines[1:]:
        # Every later text line is glued onto the running text; the
        # lowercase/comma test was dropped upstream because it produced
        # too many incorrect newlines.
        combined = combined.strip() + ' ' + following
    return [combined]
def processLesson(sub):
    """Read every .srt file in directory *sub* and collect its cleaned text.

    Args:
        sub: Path of a lesson directory that directly contains the .srt files.

    Returns:
        dict mapping each file name without its ".srt" extension (the
        concept title) to the list returned by clean_up() for that file.
        Files are processed in sorted file-name order.

    Raises:
        NameError: if the directory contains an entry whose name does not
            end in ".srt".
    """
    lesson = {}  # A lesson is a collection of concepts keyed by concept title
    print("processLesson: sub = ", sub)
    for filename in sorted(os.listdir(sub)):
        if not filename.endswith(".srt"):
            raise NameError('Wrong file extension')
        # Decode explicitly instead of relying on the platform default
        # encoding; "utf-8-sig" also swallows a leading byte-order mark.
        with open(os.path.join(sub, filename), encoding="utf-8-sig") as f:
            lines = f.readlines()
        heading = filename[:-4]  # chop off the ".srt" extension
        lesson[heading] = clean_up(lines)
    return lesson
def writeToFile(lesson, document):
    """Append every concept of *lesson* to *document* and save it as Main.docx.

    Args:
        lesson: dict mapping a concept title to its text, either a list of
            strings (as produced by processLesson) or a single string.
        document: the python-docx document object to append to.

    Returns:
        1, always.
    """
    for conceptTitle, sentences in lesson.items():
        # add_run() is documented to take a str; hand it one instead of the
        # raw list so the result does not depend on how python-docx happens
        # to iterate over its argument.
        if isinstance(sentences, str):
            text = sentences
        else:
            text = ' '.join(part.strip() for part in sentences)
        p = document.add_paragraph()
        p.add_run(conceptTitle + '\n').bold = True
        # Adds an empty paragraph after the concept's paragraph; the text
        # run below still goes into the concept's own paragraph `p`.
        document.add_paragraph()
        p.add_run(text)
    document.save('Main.docx')
    return 1
def main():
    """Build Main.docx from every lesson sub-directory of `directory`.

    Each sub-directory becomes a heading in the document, followed by the
    concepts that processLesson() extracts from it; writeToFile() appends
    them and saves the document.
    """
    # Create a word document object
    document = Document()
    # Keep only real sub-directories. Blindly dropping the first sorted
    # entry would skip a genuine lesson whenever no stray file happens to
    # sort first, and would still crash on stray files further down.
    subDirectories = [
        name for name in sorted(os.listdir(directory))
        if os.path.isdir(os.path.join(directory, name))
    ]
    for sub in subDirectories:
        # Cycle through the concepts in a lesson
        document.add_heading(sub)
        lesson = processLesson(os.path.join(directory, sub))
        writeToFile(lesson, document)
if __name__ == '__main__':
    # No escaping of spaces in `directory` is needed: the os functions take
    # the path as-is. The former directory.replace(" ", "\\ ") call threw its
    # result away (str.replace returns a new string), so it never had any
    # effect, and applying it would have produced a non-existent path.
    main()
| """ | |
| NOTES | |
| * Takes the subtitles of the Udacity videos (concepts) inside each lecture (lesson) and consolidates them across a class to create class notes | |
| * Assumes the class directory is in the same folder as this source code file | |
| * Change main directory in global variable at the top and be sure to include / at the end | |
| * This assumes there is one level of sub-directories before the SRT files | |
| * Uses python-docx to store subtitles in Word docx format: https://python-docx.readthedocs.io/en/latest/ | |
| * Uses original code for parsing SRT files from https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/ | |
| """ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment