Skip to content

Instantly share code, notes, and snippets.

@dw-liedji
Forked from Norcim133/srt_to_txt.py
Created November 10, 2019 15:20
Show Gist options
  • Select an option

  • Save dw-liedji/c4fc132d4d675d743d5e2f62cf46e5b8 to your computer and use it in GitHub Desktop.

Select an option

Save dw-liedji/c4fc132d4d675d743d5e2f62cf46e5b8 to your computer and use it in GitHub Desktop.
Simple Python Script for Extracting Text from an SRT File
"""
Creates a formatted, readable .docx file from a directory of Udacity SRT files (assuming a fixed directory structure).
"""
import re, sys, os
from docx import Document
from docx.shared import Inches
# Hardcoded course directory, given as a relative path. main() lists the
# entries of this directory and hands them to processLesson() as lesson
# folders, so the .srt files must sit inside SUB-directories of this
# directory rather than directly in it.
directory = 'Knowledge-Based AI_ Cognitive Systems Subtitles/'
def is_time_stamp(l):
    """Return True if *l* looks like an SRT timing line.

    A line counts as a time stamp when its first two characters are numeric
    and the third character is a colon, e.g.
    ``00:01:15,200 --> 00:01:18,000``.

    Args:
        l: A line of text (callers pass it already stripped).

    Returns:
        bool: True for a time-stamp-like line, False otherwise.
    """
    # Slice (l[2:3]) rather than index (l[2]) so that short inputs such as
    # "12" or "7" yield False instead of raising IndexError.
    return l[:2].isnumeric() and l[2:3] == ':'
def has_letters(line):
    """Tell whether *line* contains at least one ASCII letter (a-z or A-Z)."""
    return re.search('[a-zA-Z]', line) is not None
def has_no_text(line):
    """Return True when *line* carries no subtitle text worth keeping.

    A line is rejected when, after stripping surrounding whitespace, it is
    empty, purely numeric (a cue index), a time stamp, or fully wrapped in
    parentheses; it is also rejected when it contains no ASCII letters.
    """
    text = line.strip()
    # Checks run in this order on purpose: the emptiness and numeric tests
    # come before is_time_stamp() and before indexing text[0] / text[-1].
    if not text or text.isnumeric() or is_time_stamp(text):
        return True
    if text[0] == '(' and text[-1] == ')':
        return True
    return not has_letters(line)
def is_lowercase_letter_or_comma(letter):
    """Return True if *letter* is a comma or an alphabetic string equal to its lowercase form."""
    if letter == ',':
        return True
    return letter.isalpha() and letter == letter.lower()
def clean_up(lines):
    """
    From: https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/
    Drop every non-text line (the very first line is always skipped) and
    merge all remaining text lines into a single entry.

    Returns a list holding at most one string: the kept lines joined by
    single spaces, or an empty list when no line carried text.
    """
    kept = [entry for entry in lines[1:] if not has_no_text(entry)]
    if not kept:
        return []
    # Every later line is merged unconditionally. An earlier variant merged
    # only when the line began with a lowercase letter or comma
    # (is_lowercase_letter_or_comma), but that created too many incorrect
    # newlines.
    combined = kept[0]
    for following in kept[1:]:
        combined = combined.strip() + ' ' + following
    return [combined]
def processLesson(sub):
    """Read every .srt file in *sub* and collect its cleaned-up text.

    Args:
        sub: Path of a directory that directly contains the .srt files.

    Returns:
        dict: A lesson, i.e. a mapping from concept title (the file name
        without its ``.srt`` extension) to the list returned by
        ``clean_up`` for that file. Files are processed in sorted order.

    Raises:
        NameError: If the directory contains an entry whose name does not
            end in ``.srt``.
    """
    lesson = {}  # A lesson is a collection of concepts keyed by concept title
    print("processLesson: sub = ", sub)
    for filename in sorted(os.listdir(sub)):
        if not filename.endswith(".srt"):
            raise NameError('Wrong file extension')
        # Decode explicitly instead of relying on the platform default
        # encoding, which mis-decodes (or fails on) UTF-8 subtitles on
        # systems whose locale is not UTF-8. "utf-8-sig" also swallows a
        # leading byte-order mark if one is present.
        with open(os.path.join(sub, filename), encoding="utf-8-sig") as f:
            lines = f.readlines()
        heading = filename[:-4]  # chop off the ".srt" extension
        lesson[heading] = clean_up(lines)
    return lesson
def writeToFile(lesson, document):
    """Append one lesson to *document* and save it as ``Main.docx``.

    For every concept in the lesson a paragraph is added that holds the
    concept title in bold followed by the concept text; an empty paragraph
    is added after it as a spacer.

    Args:
        lesson: Mapping of concept title to its text, either a list of
            strings (as produced by ``processLesson``) or a single string.
        document: The python-docx document object to append to.

    Returns:
        int: Always 1.
    """
    for conceptTitle in lesson:
        p = document.add_paragraph()
        p.add_run(conceptTitle + '\n').bold = True
        document.add_paragraph()
        # add_run() takes a single string, while the lesson stores each
        # concept as a list of lines, so join them before handing them over.
        # A value that is already a string passes through join() unchanged.
        p.add_run(''.join(lesson[conceptTitle]))
    document.save('Main.docx')
    return 1
def main():
    """Build the Word document from every lesson folder under ``directory``.

    Each sub-directory of ``directory`` becomes a heading in the document,
    followed by the concepts read from the .srt files inside it.
    """
    # Create a word document object
    document = Document()
    # Keep only real, non-hidden sub-directories. Blindly dropping the first
    # sorted entry (to get rid of a stray file such as .DS_Store) loses an
    # actual lesson whenever no such file happens to sort first.
    subDirectories = [
        name for name in sorted(os.listdir(directory))
        if not name.startswith('.')
        and os.path.isdir(os.path.join(directory, name))
    ]
    # Cycle through lessons
    for sub in subDirectories:
        # Cycle through the concepts in a lesson
        document.add_heading(sub)
        lesson = processLesson(directory + "/" + sub)
        writeToFile(lesson, document)
if __name__ == '__main__':
    # Spaces in the directory name need no escaping: the path is passed
    # straight to os.listdir()/open(), not through a shell. The former
    # directory.replace(" ", "\\ ") call threw its result away (strings are
    # immutable), so it never had any effect and is left out.
    main()
"""
NOTES
* Takes the subtitles of Udacity videos (concepts) from inside a lecture (lesson) and consolidates them across a class to create class notes
* Assumes the class directory is in the same folder as this source code file
* Change the main directory in the `directory` global variable at the top, and be sure to include a trailing / at the end
* Assumes there is exactly one level of sub-directories between the main directory and the SRT files
* Uses python-docx to store subtitles in Word docx format: https://python-docx.readthedocs.io/en/latest/
* Uses original code for parsing SRT files from https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment