dw-liedji · November 10, 2019 15:20
diff --git a/srt_to_txt.py b/srt_to_txt.py
 """
 Creates formatted, readable .docx file from directory of udacity SRT files (assuming fixed directory structure).
 """
 import re, sys, os
 from docx import Document
 from docx.shared import Inches

 # Hard-coded root directory of the course subtitles. main() lists this
 # directory and hands each entry to processLesson(), so the .srt files are
 # assumed to sit exactly one level down, inside SUB-directories of it.
 directory = 'Knowledge-Based AI_ Cognitive Systems Subtitles/'

 def is_time_stamp(l):
     """Return True if ``l`` starts like an SRT time stamp.

     A line qualifies when its first two characters are numeric and the
     third character is a colon, e.g. ``00:00:01,000 --> 00:00:04,000``.

     Args:
         l: Line of text to inspect (callers pass an already stripped line).

     Returns:
         bool: True when the line has the ``NN:`` prefix, otherwise False.
     """
     # Slice rather than index: ``l[2]`` raised IndexError for one- or
     # two-character numeric input such as "12".
     return l[:2].isnumeric() and l[2:3] == ':'

 def has_letters(line):
     """Tell whether ``line`` contains at least one ASCII letter (a-z, A-Z)."""
     first_letter = re.search('[a-zA-Z]', line)
     return first_letter is not None

 def has_no_text(line):
     """Decide whether a raw subtitle line carries no spoken text.

     A line is rejected (True) when, after stripping surrounding whitespace,
     it is empty, entirely numeric, recognised by is_time_stamp(), or wrapped
     in parentheses; it is also rejected when has_letters() finds no ASCII
     letter in it. Anything else is kept (False).
     """
     stripped = line.strip()
     # The order of the tests matters: the emptiness test protects the index
     # lookups, and the numeric test runs before is_time_stamp().
     return (
         not stripped
         or stripped.isnumeric()
         or is_time_stamp(stripped)
         or (stripped[0] == '(' and stripped[-1] == ')')
         or not has_letters(line)
     )

 def is_lowercase_letter_or_comma(letter):
     """Return True for a comma or for alphabetic text that is all lowercase."""
     if letter == ',':
         return True
     return letter.isalpha() and letter == letter.lower()

 def clean_up(lines):
     """
     From: https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/
     Get rid of all non-text lines and
     try to combine text broken into multiple lines

     The first element of ``lines`` is always skipped. Every later line that
     has_no_text() rejects is dropped. The first surviving line starts the
     output; each further surviving line is glued onto it with a single
     space, so the returned list holds at most one string.

     Args:
         lines: List of raw lines read from one .srt file.

     Returns:
         list: Empty when no text line was found, otherwise a one-element
         list holding the combined text.
     """
     new_lines = []
     for line in lines[1:]:
         if has_no_text(line):
             continue
         # The extra condition ``and is_lowercase_letter_or_comma(line[0])``
         # was removed on purpose: it created too many incorrect newlines.
         if new_lines:
             # combine with previous line
             new_lines[-1] = new_lines[-1].strip() + ' ' + line
         else:
             # first text line: start the output with it
             new_lines.append(line)
     return new_lines

 def processLesson(sub):
     """Read every .srt file of directory ``sub`` into a lesson dictionary.

     Entries are visited in sorted name order. Each file's lines go through
     clean_up() and are stored under the file name minus its last four
     characters (the ".srt" extension).

     Args:
         sub: Path of the directory that holds the .srt files.

     Returns:
         dict: Concept title (file name without extension) -> cleaned lines.

     Raises:
         NameError: As soon as an entry not ending in ".srt" is met.
     """
     print("processLesson:  sub = ", sub)
     concepts = {}
     for entry in sorted(os.listdir(sub)):
         # Guard clause: anything that is not a subtitle file aborts the run.
         if not entry.endswith(".srt"):
             raise NameError('Wrong file extension')
         with open(sub + "/" + entry) as srt_file:
             raw_lines = srt_file.readlines()
         title = entry[:-4]  # drop the ".srt" suffix
         concepts[title] = clean_up(raw_lines)
     return concepts

 def writeToFile(lesson, document):
     """Append every concept of ``lesson`` to ``document`` and save Main.docx.

     For each concept a paragraph is created that holds the title in bold
     followed by the concept text; an empty paragraph is added after it.
     The document is saved once all concepts have been written.

     Args:
         lesson: Dict mapping concept title -> text, given either as a
             string or as a list of strings (the shape clean_up() returns).
         document: Object offering ``add_paragraph()`` and ``save()``;
             main() passes a python-docx Document.

     Returns:
         int: Always 1.
     """
     for conceptTitle, content in lesson.items():
         # add_run() takes a string, while clean_up() hands back a list of
         # strings, so flatten the list before writing it.
         if isinstance(content, str):
             text = content
         else:
             text = ''.join(content)
         p = document.add_paragraph()
         p.add_run(conceptTitle + '\n').bold = True
         # Blank paragraph that separates this concept from the next one.
         document.add_paragraph()
         p.add_run(text)
     document.save('Main.docx')
     return 1

 def main():
     """Build the Word document from every lesson folder under ``directory``.

     Each non-hidden sub-directory of ``directory`` becomes a heading; its
     .srt files are read by processLesson() and written by writeToFile().
     """
     # Create a word document object
     document = Document()

     # Keep only real, non-hidden directories. The previous code dropped
     # index 0 of the sorted listing to dodge a stray entry, which silently
     # skipped the first lesson whenever no such entry was present.
     subDirectories = [
         name for name in sorted(os.listdir(directory))
         if not name.startswith('.')
         and os.path.isdir(os.path.join(directory, name))
     ]

     for sub in subDirectories:
         # Cycle through the concepts in a lesson
         document.add_heading(sub)
         lesson = processLesson(os.path.join(directory, sub))
         writeToFile(lesson, document)

 if __name__ == '__main__':
     # Spaces in the directory name need no escaping: os.listdir() and open()
     # take the path verbatim. The former ``directory.replace(" ", "\\ ")``
     # call discarded its result (str.replace returns a new string), so it
     # never had any effect and has been removed.
     main()

 """
 NOTES
 * Takes subtitles from udacity videos (concepts) from inside a lecture (lesson) and consolidates them across a class to create class notes
 * Assumes the class directory is in the same folder as this source code file
 * Change main directory in global variable at the top and be sure to include / at the end
 * This assumes there is one level of sub-directories before the SRT files
 * Uses python-docx to store subtitles in Word docx format: https://python-docx.readthedocs.io/en/latest/
 * Uses original code for parsing SRT files from https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/
 """
	"""
	Creates formatted, readable .docx file from directory of udacity SRT files (assuming fixed directory structure).
	"""
	import re, sys, os
	from docx import Document
	from docx.shared import Inches

	# Hard-coded root directory of the course subtitles. main() lists this
	# directory and hands each entry to processLesson(), so the .srt files are
	# assumed to sit exactly one level down, inside SUB-directories of it.
	directory = 'Knowledge-Based AI_ Cognitive Systems Subtitles/'

	def is_time_stamp(l):
	    """Return True if ``l`` starts like an SRT time stamp.

	    A line qualifies when its first two characters are numeric and the
	    third character is a colon, e.g. ``00:00:01,000 --> 00:00:04,000``.

	    Args:
	        l: Line of text to inspect (callers pass an already stripped line).

	    Returns:
	        bool: True when the line has the ``NN:`` prefix, otherwise False.
	    """
	    # Slice rather than index: ``l[2]`` raised IndexError for one- or
	    # two-character numeric input such as "12".
	    return l[:2].isnumeric() and l[2:3] == ':'

	def has_letters(line):
	    """Tell whether ``line`` contains at least one ASCII letter (a-z, A-Z).

	    Args:
	        line: Text to search.

	    Returns:
	        bool: True when a letter is found, otherwise False.
	    """
	    # Body indentation restored; the flattened original could not compile.
	    return re.search('[a-zA-Z]', line) is not None

	def has_no_text(line):
	    """Decide whether a raw subtitle line carries no spoken text.

	    A line is rejected (True) when, after stripping surrounding whitespace,
	    it is empty, entirely numeric, recognised by is_time_stamp(), or wrapped
	    in parentheses; it is also rejected when has_letters() finds no ASCII
	    letter in it. Anything else is kept (False).

	    Args:
	        line: One raw line of an .srt file.

	    Returns:
	        bool: True when the line should be discarded.
	    """
	    # Body indentation restored; the flattened original could not compile.
	    l = line.strip()
	    if not l:
	        return True
	    if l.isnumeric():
	        return True
	    if is_time_stamp(l):
	        return True
	    # Safe to index: the empty case has already returned above.
	    if l[0] == '(' and l[-1] == ')':
	        return True
	    if not has_letters(line):
	        return True
	    return False

	def is_lowercase_letter_or_comma(letter):
	    """Return True for a comma or for alphabetic text that is all lowercase.

	    Args:
	        letter: The character (string) to classify.

	    Returns:
	        bool: True for ``','`` or a lowercase alphabetic string.
	    """
	    # Body indentation restored; the flattened original could not compile.
	    if letter.isalpha() and letter.lower() == letter:
	        return True
	    return letter == ','

	def clean_up(lines):
	    """
	    From: https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/
	    Get rid of all non-text lines and
	    try to combine text broken into multiple lines

	    The first element of ``lines`` is always skipped. Every later line that
	    has_no_text() rejects is dropped. The first surviving line starts the
	    output; each further surviving line is glued onto it with a single
	    space, so the returned list holds at most one string.

	    Args:
	        lines: List of raw lines read from one .srt file.

	    Returns:
	        list: Empty when no text line was found, otherwise a one-element
	        list holding the combined text.
	    """
	    # Block structure restored; the flattened original could not compile.
	    new_lines = []
	    for line in lines[1:]:
	        if has_no_text(line):
	            continue
	        # The extra condition ``and is_lowercase_letter_or_comma(line[0])``
	        # was removed on purpose: it created too many incorrect newlines.
	        if new_lines:
	            # combine with previous line
	            new_lines[-1] = new_lines[-1].strip() + ' ' + line
	        else:
	            # first text line: start the output with it
	            new_lines.append(line)
	    return new_lines

	def processLesson(sub):
	    """Read every .srt file of directory ``sub`` into a lesson dictionary.

	    Entries are visited in sorted name order. Each file's lines go through
	    clean_up() and are stored under the file name minus its last four
	    characters (the ".srt" extension).

	    Args:
	        sub: Path of the directory that holds the .srt files.

	    Returns:
	        dict: Concept title (file name without extension) -> cleaned lines.

	    Raises:
	        NameError: As soon as an entry not ending in ".srt" is met.
	    """
	    # Block structure restored; the flattened original could not compile.
	    lesson = {}  # A lesson is a collection of concepts keyed by concept title
	    print("processLesson: sub = ", sub)
	    for filename in sorted(os.listdir(sub)):
	        if filename.endswith(".srt"):
	            with open(sub + "/" + filename) as f:
	                lines = f.readlines()
	                new_lines = clean_up(lines)

	                # Use the file name, minus its extension, as the heading.
	                heading = filename[:-4]
	                lesson[heading] = new_lines
	        else:
	            raise NameError('Wrong file extension')
	    return lesson

	def writeToFile(lesson, document):
	    """Append every concept of ``lesson`` to ``document`` and save Main.docx.

	    For each concept a paragraph is created that holds the title in bold
	    followed by the concept text; an empty paragraph is added after it.
	    The document is saved once all concepts have been written.

	    Args:
	        lesson: Dict mapping concept title -> text, given either as a
	            string or as a list of strings (the shape clean_up() returns).
	        document: Object offering ``add_paragraph()`` and ``save()``;
	            main() passes a python-docx Document.

	    Returns:
	        int: Always 1.
	    """
	    # Block structure restored; the flattened original could not compile.
	    for conceptTitle, content in lesson.items():
	        # add_run() takes a string, while clean_up() hands back a list of
	        # strings, so flatten the list before writing it.
	        if isinstance(content, str):
	            text = content
	        else:
	            text = ''.join(content)
	        p = document.add_paragraph()
	        p.add_run(conceptTitle + '\n').bold = True
	        # Blank paragraph that separates this concept from the next one.
	        document.add_paragraph()
	        p.add_run(text)
	    document.save('Main.docx')
	    return 1

	def main():
	    """Build the Word document from every lesson folder under ``directory``.

	    Each non-hidden sub-directory of ``directory`` becomes a heading; its
	    .srt files are read by processLesson() and written by writeToFile().
	    """
	    # Create a word document object
	    document = Document()

	    # Keep only real, non-hidden directories. The previous code dropped
	    # index 0 of the sorted listing to dodge a stray entry, which silently
	    # skipped the first lesson whenever no such entry was present.
	    subDirectories = [
	        name for name in sorted(os.listdir(directory))
	        if not name.startswith('.')
	        and os.path.isdir(os.path.join(directory, name))
	    ]

	    for sub in subDirectories:
	        # Cycle through the concepts in a lesson
	        document.add_heading(sub)
	        lesson = processLesson(os.path.join(directory, sub))
	        writeToFile(lesson, document)

	if __name__ == '__main__':
	    # Spaces in the directory name need no escaping: os.listdir() and open()
	    # take the path verbatim. The former ``directory.replace(" ", "\\ ")``
	    # call discarded its result (str.replace returns a new string), so it
	    # never had any effect and has been removed.
	    main()

	"""
	NOTES
	* Takes subtitles from udacity videos (concepts) from inside a lecture (lesson) and consolidates them across a class to create class notes
	* Assumes the class directory is in the same folder as this source code file
	* Change main directory in global variable at the top and be sure to include / at the end
	* This assumes there is one level of sub-directories before the SRT files
	* Uses python-docx to store subtitles in Word docx format: https://python-docx.readthedocs.io/en/latest/
	* Uses original code for parsing SRT files from https://www.webucator.com/blog/2017/04/simple-python-script-extracting-text-srt-file/
	"""
No results found