Skip to content

Instantly share code, notes, and snippets.

@b-adams
Last active April 13, 2024 15:33
Show Gist options
  • Save b-adams/ee9fd90f3d85bb2a2da1 to your computer and use it in GitHub Desktop.
Save b-adams/ee9fd90f3d85bb2a2da1 to your computer and use it in GitHub Desktop.

Revisions

  1. Prof. Bryant E Adams revised this gist Sep 2, 2015. 2 changed files with 4 additions and 3 deletions.
    3 changes: 2 additions & 1 deletion Converted_output.csv
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,5 @@
    Entries,Timecodes,Subtitles
    1,"00:00:00,104 --> 00:00:02,669","Hi, I'm shell-scripting."
    2,"00:00:02,982 --> 00:00:04,965","I'm not sure if it would work,but I'll try it!"
    2,"00:00:02,982 --> 00:00:04,965","I'm not sure if it would work,"
    ,,but I'll try it!
    3,"00:00:05,085 --> 00:00:07,321",There must be a way to do it!
    4 changes: 2 additions & 2 deletions console_output.log
    Original file line number Diff line number Diff line change
    @@ -1,2 +1,2 @@
    Done converting Wildlife.srt to Subtitle.xlsx. 3 subtitle entries found
    [Finished in 2.0s]
    Done converting Wildlife.srt to Subtitle.xlsx. 3 subtitle entries found. 5 rows written
    [Finished in 0.6s]
  2. Prof. Bryant E Adams revised this gist Sep 2, 2015. 1 changed file with 30 additions and 14 deletions.
    44 changes: 30 additions & 14 deletions convert.py
    Original file line number Diff line number Diff line change
    @@ -7,7 +7,7 @@ def parse_subtitles(lines):
    line_timestamp = re.compile('^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
    line_seperator = re.compile('^\s*$')

    current_record = {'index':None, 'timestamp':None, 'subtitle':''}
    current_record = {'index':None, 'timestamp':None, 'subtitles':[]}
    state = 'seeking to next entry'

    for line in lines:
    @@ -33,36 +33,52 @@ def parse_subtitles(lines):
    logging.info('Blank line reached, yielding record: {r}'.format(r=current_record))
    yield current_record
    state = 'seeking to next entry'
    current_record = {'index':None, 'timestamp':None, 'subtitle':''}
    current_record = {'index':None, 'timestamp':None, 'subtitles':[]}
    else:
    logging.debug('Appending to subtitle: {s}'.format(s=line))
    current_record['subtitle'] += line
    current_record['subtitles'].append(line)

    else:
    logging.error('HUH: Fell into an unknown state: `{s}`'.format(s=state))
    if state == 'reading subtitles':
    # We must have finished the file without encountering a blank line. Dump the last record
    yield current_record

    def write_dict_to_worksheet(colnums_for_keys, keyed_data, worksheet, row):
    for (colname, colindex) in colnums_for_keys.items():
    worksheet.write(row, colindex, keyed_data[colname])
    def write_dict_to_worksheet(columns_for_keys, keyed_data, worksheet, row):
        """
        Write a subtitle-record to a worksheet.

        The non-subtitle fields are written horizontally on `row`; each item of
        keyed_data['subtitles'] is written on its own row in the subtitle column,
        starting at `row`.

        columns_for_keys: dict mapping field name -> column number (must include 'subtitles')
        keyed_data: dict mapping the same field names -> values ('subtitles' is a list)
        worksheet: object providing write(row, column, value)
        row: first row to write to

        Return the row number after those that were written (since this may write multiple rows).
        At least one row is always consumed, so a record with no subtitle lines
        is not overwritten by the next record.
        """
        # First, horizontally write the entry and timecode
        for (colname, colindex) in columns_for_keys.items():
            if colname != 'subtitles':
                worksheet.write(row, colindex, keyed_data[colname])

        # Next, vertically write the subtitle data
        subtitle_column = columns_for_keys['subtitles']
        current_row = row
        for subtitle_line in keyed_data['subtitles']:
            worksheet.write(current_row, subtitle_column, subtitle_line)
            current_row += 1

        # The entry/timecode cells occupy `row` even when there were no subtitle lines
        return max(current_row, row + 1)

    def convert(input_filename, output_filename):
    workbook = xlsxwriter.Workbook(output_filename)
    worksheet = workbook.add_worksheet('subtitle')
    columns = {'index':0, 'timestamp':1, 'subtitle':2}
    worksheet = workbook.add_worksheet('subtitles')
    columns = {'index':0, 'timestamp':1, 'subtitles':2}

    row = 0
    headings = {'index':"Entries", 'timestamp':"Timecodes", 'subtitle':"Subtitles"}
    write_dict_to_worksheet(columns, headings, worksheet, row)
    next_available_row = 0
    records_processed = 0
    headings = {'index':"Entries", 'timestamp':"Timecodes", 'subtitles':["Subtitles"]}
    next_available_row=write_dict_to_worksheet(columns, headings, worksheet, next_available_row)

    with open(input_filename) as textfile:
    for record in parse_subtitles(textfile):
    row += 1
    write_dict_to_worksheet(columns, record, worksheet, row)
    next_available_row = write_dict_to_worksheet(columns, record, worksheet, next_available_row)
    records_processed += 1

    print('Done converting {inp} to {outp}. {n} subtitle entries found'.format(inp=input_filename, outp=output_filename, n=row))
    print('Done converting {inp} to {outp}. {n} subtitle entries found. {m} rows written'.format(inp=input_filename, outp=output_filename, n=records_processed, m=next_available_row))
    workbook.close()

    convert(input_filename='Wildlife.srt', output_filename='Subtitle.xlsx')
  3. Prof. Bryant E Adams created this gist Sep 2, 2015.
    4 changes: 4 additions & 0 deletions Converted_output.csv
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,4 @@
    Entries,Timecodes,Subtitles
    1,"00:00:00,104 --> 00:00:02,669","Hi, I'm shell-scripting."
    2,"00:00:02,982 --> 00:00:04,965","I'm not sure if it would work,but I'll try it!"
    3,"00:00:05,085 --> 00:00:07,321",There must be a way to do it!
    12 changes: 12 additions & 0 deletions Wildlife.srt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,12 @@
    1
    00:00:00,104 --> 00:00:02,669
    Hi, I'm shell-scripting.

    2
    00:00:02,982 --> 00:00:04,965
    I'm not sure if it would work,
    but I'll try it!

    3
    00:00:05,085 --> 00:00:07,321
    There must be a way to do it!
    2 changes: 2 additions & 0 deletions console_output.log
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,2 @@
    Done converting Wildlife.srt to Subtitle.xlsx. 3 subtitle entries found
    [Finished in 2.0s]
    68 changes: 68 additions & 0 deletions convert.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,68 @@
    import xlsxwriter
    import re
    import logging

    def parse_subtitles(lines):
        """
        Parse SRT-style subtitle text into records.

        A small state machine walks the input: it expects an all-digit index
        line, then a timestamp line, then one or more subtitle text lines ended
        by a blank line (or by the end of the input).

        lines: iterable of strings, e.g. an open text file. Trailing line
               endings (LF or CRLF) are removed before matching.

        Yields one dict per entry with the keys 'index', 'timestamp' and
        'subtitle'. Multi-line subtitle text is concatenated with no separator.
        Unexpected lines are reported through `logging.error` and skipped.
        """
        # `\d+` (not `\d*`): an empty line must never be accepted as an index.
        line_index = re.compile(r'^\d+$')
        line_timestamp = re.compile(r'^\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}$')
        line_separator = re.compile(r'^\s*$')

        current_record = {'index': None, 'timestamp': None, 'subtitle': ''}
        state = 'seeking to next entry'

        for line in lines:
            line = line.rstrip('\r\n')
            if state == 'seeking to next entry':
                if line_index.match(line):
                    logging.debug('Found index: %s', line)
                    current_record['index'] = line
                    state = 'looking for timestamp'
                elif line_separator.match(line):
                    # Extra blank lines between entries are harmless; keep seeking.
                    logging.debug('Skipping blank line between entries')
                else:
                    logging.error('HUH: Expected to find an index, but instead found: [%s]', line)

            elif state == 'looking for timestamp':
                if line_timestamp.match(line):
                    logging.debug('Found timestamp: %s', line)
                    current_record['timestamp'] = line
                    state = 'reading subtitles'
                else:
                    logging.error('HUH: Expected to find a timestamp, but instead found: [%s]', line)

            elif state == 'reading subtitles':
                if line_separator.match(line):
                    logging.info('Blank line reached, yielding record: %s', current_record)
                    yield current_record
                    state = 'seeking to next entry'
                    # Start a fresh dict so the record already yielded is not mutated.
                    current_record = {'index': None, 'timestamp': None, 'subtitle': ''}
                else:
                    logging.debug('Appending to subtitle: %s', line)
                    current_record['subtitle'] += line

            else:
                logging.error('HUH: Fell into an unknown state: `%s`', state)

        if state == 'reading subtitles':
            # We must have finished the file without encountering a blank line. Dump the last record
            yield current_record

    def write_dict_to_worksheet(colnums_for_keys, keyed_data, worksheet, row):
        """
        Write one record across a single worksheet row.

        Every key of `colnums_for_keys` names a field; its value is the column
        that receives `keyed_data[key]` on row `row`.
        """
        for field in colnums_for_keys:
            target_column = colnums_for_keys[field]
            worksheet.write(row, target_column, keyed_data[field])

    def convert(input_filename, output_filename):
        """
        Convert a subtitle text file into an .xlsx workbook.

        Writes a heading row followed by one row per record produced by
        `parse_subtitles` to a worksheet named 'subtitle', then prints a
        one-line summary with the number of entries found.

        input_filename: path of the subtitle file to read
        output_filename: path of the workbook to create
        """
        columns = {'index': 0, 'timestamp': 1, 'subtitle': 2}
        headings = {'index': "Entries", 'timestamp': "Timecodes", 'subtitle': "Subtitles"}
        row = 0

        # Open the input first so a missing or unreadable file fails before a
        # workbook object is created.
        with open(input_filename) as textfile:
            workbook = xlsxwriter.Workbook(output_filename)
            try:
                worksheet = workbook.add_worksheet('subtitle')
                write_dict_to_worksheet(columns, headings, worksheet, row)

                for record in parse_subtitles(textfile):
                    row += 1
                    write_dict_to_worksheet(columns, record, worksheet, row)
            finally:
                # Always release the workbook, even if parsing or writing raised.
                workbook.close()

        # Report success only after the workbook has actually been closed.
        print('Done converting {inp} to {outp}. {n} subtitle entries found'.format(inp=input_filename, outp=output_filename, n=row))

    # Module-level call: runs the conversion whenever this file is executed or
    # imported, reading Wildlife.srt and writing Subtitle.xlsx (relative paths).
    convert(input_filename='Wildlife.srt', output_filename='Subtitle.xlsx')