Last active
April 13, 2024 15:33
-
-
Save b-adams/ee9fd90f3d85bb2a2da1 to your computer and use it in GitHub Desktop.
Revisions
-
Prof. Bryant E Adams revised this gist
Sep 2, 2015 . 2 changed files with 4 additions and 3 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,5 @@ Entries,Timecodes,Subtitles 1,"00:00:00,104 --> 00:00:02,669","Hi, I'm shell-scripting." 2,"00:00:02,982 --> 00:00:04,965","I'm not sure if it would work," ,,but I'll try it! 3,"00:00:05,085 --> 00:00:07,321",There must be a way to do it! This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,2 +1,2 @@ Done converting Wildlife.srt to Subtitle.xlsx. 3 subtitle entries found. 5 rows written [Finished in 0.6s] -
Prof. Bryant E Adams revised this gist
Sep 2, 2015 . 1 changed file with 30 additions and 14 deletions. There are no files selected for viewing
import logging
import re


def parse_subtitles(lines):
    """Yield one record per SubRip (.srt) entry found in *lines*.

    *lines* is any iterable of text lines (an open text file works).  Each
    yielded record is a dict with the keys ``'index'`` (the entry number, as
    the string read from the file), ``'timestamp'`` (the full
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` line) and ``'subtitles'`` (a list with
    one string per subtitle text line).

    Lines that do not fit the expected index / timestamp / text / blank-line
    sequence are reported through ``logging.error`` and skipped.
    """
    # Raw strings: '\d' and '\s' are invalid escapes in ordinary literals.
    # '\d+' (not '\d*') so an empty line is never mistaken for an entry index.
    line_index = re.compile(r'^\d+$')
    line_timestamp = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
    line_seperator = re.compile(r'^\s*$')

    current_record = {'index': None, 'timestamp': None, 'subtitles': []}

    state = 'seeking to next entry'
    for line in lines:
        # Drop the line terminator, including the '\r' of CRLF input.
        line = line.rstrip('\r\n')
        if state == 'seeking to next entry':
            # A UTF-8 byte-order mark may precede the very first index.
            candidate = line.lstrip('\ufeff')
            if line_index.match(candidate):
                logging.debug('Found index: %s', candidate)
                current_record['index'] = candidate
                state = 'looking for timestamp'
            elif line_seperator.match(candidate):
                # Extra blank lines between entries (or at end of file).
                logging.debug('Skipping extra blank line between entries')
            else:
                logging.error('HUH: Expected to find an index, but instead found: [%s]', line)
        elif state == 'looking for timestamp':
            if line_timestamp.match(line):
                logging.debug('Found timestamp: %s', line)
                current_record['timestamp'] = line
                state = 'reading subtitles'
            else:
                logging.error('HUH: Expected to find a timestamp, but instead found: [%s]', line)
        elif state == 'reading subtitles':
            if line_seperator.match(line):
                logging.info('Blank line reached, yielding record: %s', current_record)
                yield current_record
                state = 'seeking to next entry'
                current_record = {'index': None, 'timestamp': None, 'subtitles': []}
            else:
                logging.debug('Appending to subtitle: %s', line)
                current_record['subtitles'].append(line)
        else:
            logging.error('HUH: Fell into an unknown state: `%s`', state)
    if state == 'reading subtitles':
        # We must have finished the file without encountering a blank line.
        # Dump the last record.
        yield current_record


def write_dict_to_worksheet(columns_for_keys, keyed_data, worksheet, row):
    """Write a subtitle-record to a worksheet.

    *columns_for_keys* maps each record key to a column number and must
    contain ``'subtitles'``.  Every other key of *columns_for_keys* is written
    once, on *row*; the strings in ``keyed_data['subtitles']`` are written one
    per row, starting at *row*, in the ``'subtitles'`` column.

    Return the row number after those that were written (since this may write
    multiple rows).  The result is always at least ``row + 1``.
    """
    # First, horizontally write the entry and timecode
    for colname, colindex in columns_for_keys.items():
        if colname != 'subtitles':
            worksheet.write(row, colindex, keyed_data[colname])

    # Next, vertically write the subtitle data
    subtitle_column = columns_for_keys['subtitles']
    current_row = row
    for subtitle_line in keyed_data['subtitles']:
        worksheet.write(current_row, subtitle_column, subtitle_line)
        current_row += 1

    # A record with no subtitle text still occupies the row holding its
    # index/timecode; without this the next record would overwrite it.
    return max(current_row, row + 1)


def convert(input_filename, output_filename):
    """Convert the .srt file *input_filename* into the workbook *output_filename*.

    Writes a heading row followed by the rows of every parsed entry to a
    worksheet named ``subtitles``, then prints a one-line summary.
    """
    # Imported here so the parsing/writing helpers can be imported without
    # the third-party xlsxwriter package being installed.
    import xlsxwriter

    columns = {'index': 0, 'timestamp': 1, 'subtitles': 2}
    headings = {'index': "Entries", 'timestamp': "Timecodes", 'subtitles': ["Subtitles"]}
    records_processed = 0

    # Open the input first so a missing input file does not leave a workbook
    # behind; the context managers close both even if parsing raises.
    with open(input_filename) as textfile, xlsxwriter.Workbook(output_filename) as workbook:
        worksheet = workbook.add_worksheet('subtitles')
        next_available_row = write_dict_to_worksheet(columns, headings, worksheet, 0)
        for record in parse_subtitles(textfile):
            next_available_row = write_dict_to_worksheet(columns, record, worksheet, next_available_row)
            records_processed += 1

    print('Done converting {inp} to {outp}. {n} subtitle entries found. {m} rows written'.format(
        inp=input_filename, outp=output_filename, n=records_processed, m=next_available_row))


if __name__ == '__main__':
    convert(input_filename='Wildlife.srt', output_filename='Subtitle.xlsx')
Prof. Bryant E Adams created this gist
Sep 2, 2015 . There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,4 @@ Entries,Timecodes,Subtitles 1,"00:00:00,104 --> 00:00:02,669","Hi, I'm shell-scripting." 2,"00:00:02,982 --> 00:00:04,965","I'm not sure if it would work,but I'll try it!" 3,"00:00:05,085 --> 00:00:07,321",There must be a way to do it! This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,12 @@ 1 00:00:00,104 --> 00:00:02,669 Hi, I'm shell-scripting. 2 00:00:02,982 --> 00:00:04,965 I'm not sure if it would work, but I'll try it! 3 00:00:05,085 --> 00:00:07,321 There must be a way to do it! This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,2 @@ Done converting Wildlife.srt to Subtitle.xlsx. 3 subtitle entries found [Finished in 2.0s] This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. 
import logging
import re


def parse_subtitles(lines):
    """Yield one record per SubRip (.srt) entry found in *lines*.

    *lines* is any iterable of text lines (an open text file works).  Each
    yielded record is a dict with the keys ``'index'`` (the entry number, as
    the string read from the file), ``'timestamp'`` (the full
    ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` line) and ``'subtitle'`` (the entry's
    text; the lines of a multi-line subtitle are joined with single spaces).

    Lines that do not fit the expected index / timestamp / text / blank-line
    sequence are reported through ``logging.error`` and skipped.
    """
    # Raw strings: '\d' and '\s' are invalid escapes in ordinary literals.
    # '\d+' (not '\d*') so an empty line is never mistaken for an entry index.
    line_index = re.compile(r'^\d+$')
    line_timestamp = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
    line_seperator = re.compile(r'^\s*$')

    current_record = {'index': None, 'timestamp': None, 'subtitle': ''}
    # Text lines of the entry being read; joined when the entry is yielded so
    # that words on adjacent lines are not fused together.
    subtitle_lines = []

    state = 'seeking to next entry'
    for line in lines:
        # Drop the line terminator, including the '\r' of CRLF input.
        line = line.rstrip('\r\n')
        if state == 'seeking to next entry':
            # A UTF-8 byte-order mark may precede the very first index.
            candidate = line.lstrip('\ufeff')
            if line_index.match(candidate):
                logging.debug('Found index: %s', candidate)
                current_record['index'] = candidate
                state = 'looking for timestamp'
            elif line_seperator.match(candidate):
                # Extra blank lines between entries (or at end of file).
                logging.debug('Skipping extra blank line between entries')
            else:
                logging.error('HUH: Expected to find an index, but instead found: [%s]', line)
        elif state == 'looking for timestamp':
            if line_timestamp.match(line):
                logging.debug('Found timestamp: %s', line)
                current_record['timestamp'] = line
                state = 'reading subtitles'
            else:
                logging.error('HUH: Expected to find a timestamp, but instead found: [%s]', line)
        elif state == 'reading subtitles':
            if line_seperator.match(line):
                current_record['subtitle'] = ' '.join(subtitle_lines)
                logging.info('Blank line reached, yielding record: %s', current_record)
                yield current_record
                state = 'seeking to next entry'
                current_record = {'index': None, 'timestamp': None, 'subtitle': ''}
                subtitle_lines = []
            else:
                logging.debug('Appending to subtitle: %s', line)
                subtitle_lines.append(line)
        else:
            logging.error('HUH: Fell into an unknown state: `%s`', state)
    if state == 'reading subtitles':
        # We must have finished the file without encountering a blank line.
        # Dump the last record.
        current_record['subtitle'] = ' '.join(subtitle_lines)
        yield current_record


def write_dict_to_worksheet(colnums_for_keys, keyed_data, worksheet, row):
    """Write one record across a single worksheet row.

    For every ``key -> column number`` pair in *colnums_for_keys*, the value
    ``keyed_data[key]`` is written to *worksheet* at (*row*, column number).
    """
    for colname, colindex in colnums_for_keys.items():
        worksheet.write(row, colindex, keyed_data[colname])


def convert(input_filename, output_filename):
    """Convert the .srt file *input_filename* into the workbook *output_filename*.

    Writes a heading row followed by one row per parsed entry to a worksheet
    named ``subtitle``, then prints a one-line summary.
    """
    # Imported here so the parsing/writing helpers can be imported without
    # the third-party xlsxwriter package being installed.
    import xlsxwriter

    columns = {'index': 0, 'timestamp': 1, 'subtitle': 2}
    headings = {'index': "Entries", 'timestamp': "Timecodes", 'subtitle': "Subtitles"}

    # Open the input first so a missing input file does not leave a workbook
    # behind; the context managers close both even if parsing raises.
    with open(input_filename) as textfile, xlsxwriter.Workbook(output_filename) as workbook:
        worksheet = workbook.add_worksheet('subtitle')
        row = 0
        write_dict_to_worksheet(columns, headings, worksheet, row)
        for record in parse_subtitles(textfile):
            row += 1
            write_dict_to_worksheet(columns, record, worksheet, row)

    print('Done converting {inp} to {outp}. {n} subtitle entries found'.format(
        inp=input_filename, outp=output_filename, n=row))


if __name__ == '__main__':
    convert(input_filename='Wildlife.srt', output_filename='Subtitle.xlsx')