Skip to content

Instantly share code, notes, and snippets.

@serif
Last active May 2, 2025 04:16
Show Gist options
  • Select an option

  • Save serif/a1281c676cf5a1f77af6ff1a25255a85 to your computer and use it in GitHub Desktop.

Select an option

Save serif/a1281c676cf5a1f77af6ff1a25255a85 to your computer and use it in GitHub Desktop.

Revisions

  1. serif revised this gist Nov 27, 2023. 1 changed file with 5 additions and 0 deletions.
    5 changes: 5 additions & 0 deletions bwclean2.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,9 @@
    #!/usr/bin/env python3
    # updated 2023-11-27
    # updated 2023-10-12
    # updated 2021
    # updated 2020
    # created 2018
    import sys
    import hashlib
    from urllib.parse import urlparse
  2. serif revised this gist Nov 27, 2023. 1 changed file with 70 additions and 64 deletions.
    134 changes: 70 additions & 64 deletions bwclean2.py
    Original file line number Diff line number Diff line change
    @@ -1,80 +1,86 @@
    #!/usr/bin/env python3

    # bwclean2.py
    # Removes duplicates from Bitwarden export .csv
    # 2019-02-09
    # 2023-10-12

    import sys
    import hashlib
    from urllib.parse import urlparse

    # Field ordinals in Bitwarden CSV
    FOLDER = 0
    FAVORITE = 1
    TYPE = 2
    NAME = 3
    NOTES = 4
    FIELDS = 5
    REPROMPT = 6
    URI = 7
    USERNAME = 8
    PASSWORD = 9
    TOTP = 10



    def main(argv):


    # Fields in Bitwarden CSV
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')

    if len(argv) < 1:
    print('Missing input file path')
    sys.exit(1)

    in_file_path = argv[0]
    out_file_path = in_file_path[0:(len(in_file_path)-4)]+'_out.csv'
    rem_file_path = in_file_path[0:(len(in_file_path)-4)]+'_rem.csv'
    sys.exit('Supply input file path as command argument')

    in_path = argv[0]
    csv = '.csv'
    csv_out = '_out' + csv
    csv_rem = '_rem' + csv
    out_path = in_path.replace(csv, csv_out)
    rem_path = in_path.replace(csv, csv_rem)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''

    out_file = open(out_file_path, 'w', encoding = 'utf8')
    rem_file = open(rem_file_path, 'w', encoding = 'utf8')
    for line in open(in_file_path, 'r', encoding = 'utf8'):
    line_number += 1
    fields = line.split(',')
    if len(fields) < 10:
    # Add previous line if short
    line = cache.strip('\n') + line
    cache = line

    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
    open(rem_path, 'w', encoding='utf8') as rem_file, \
    open(in_path, 'r', encoding='utf8') as in_file:
    for line in in_file:
    line_number += 1

    # Validate .csv format
    if line_number == 0 and not line.strip() == ','.join(f):
    print('\nBitwarden CSV format has changed.')
    print('Contact author for update.')
    exit(1)

    # Skip empty lines
    if not line.strip():
    continue
    fields = line.split(',')
    if len(fields) > 9:
    print(f'Recovered with line {line_number}:\n{line}')

    # If the line has fewer fields than expected,
    # try to combine with the previous line
    if len(fields) < len(f):
    # Add previous line if short
    line = cache.strip('\n') + line
    cache = line
    fields = line.split(',')
    if len(fields) == len(f):
    print(f'Recovered with line {line_number}:\n{line}')
    cache = ''
    else:
    print(f'Missing fields in line {line_number}:\n{line}')
    rem_file.write(line)
    continue
    else:
    cache = ''

    # Generate an MD5 hash based on login URI, username, and password
    if line_number != 0:
    domain = urlparse(fields[f.index('login_uri')]).netloc
    if len(domain) > 0:
    fields[f.index('login_uri')] = domain
    token = fields[f.index('login_uri')]
    token += fields[f.index('login_username')]
    token += fields[f.index('login_password')]
    hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()

    # Write entry
    if hashValue not in completed_lines_hash:
    out_file.write(line)
    completed_lines_hash.add(hashValue)
    write_count += 1
    else:
    print(f'Missing fields in line {line_number}:\n{line}')
    rem_file.write(line)
    continue
    else:
    cache = ''
    if line_number != 0:
    domain = urlparse(fields[URI]).netloc
    if len(domain) > 0:
    fields[URI] = domain
    token = fields[URI] + fields[USERNAME] + fields[PASSWORD]
    hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
    if hashValue not in completed_lines_hash:
    out_file.write(line)
    completed_lines_hash.add(hashValue)
    write_count += 1
    else:
    rem_file.write(line)
    # Uncomment for verbose mode
    # print(f'Skipping duplicate on line {line_number}:\n{line}')
    out_file.close()
    rem_file.close()

    # print(f'Duplicate on line {line_number}:\n{line}')

    # Report
    dup_count = line_number - write_count
    print(f'\nOutput file: {out_file_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_file_path}')
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')

    if __name__ == "__main__":
    main(sys.argv[1:])
    main(sys.argv[1:])
  3. serif created this gist Oct 13, 2023.
    80 changes: 80 additions & 0 deletions bwclean2.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,80 @@
    #!/usr/bin/env python3

    # bwclean2.py
    # Removes duplicates from Bitwarden export .csv
    # 2019-02-09
    # 2023-10-12

    import sys
    import hashlib
    from urllib.parse import urlparse

    # Field ordinals in the Bitwarden CSV export (0-based column indices).
    FOLDER = 0
    FAVORITE = 1
    TYPE = 2
    NAME = 3
    NOTES = 4
    FIELDS = 5
    REPROMPT = 6
    URI = 7
    USERNAME = 8
    PASSWORD = 9
    TOTP = 10

    def main(argv):
        """Remove duplicate login entries from a Bitwarden CSV export.

        argv[0] is the path to the export (expected to end in ".csv").
        Writes two files next to it: <name>_out.csv containing the header
        plus unique entries, and <name>_rem.csv containing removed
        (duplicate or malformed) lines.

        Two entries are duplicates when the MD5 of
        (URI netloc + username + password) has been seen before.
        MD5 is used only as a dedup key here, not for security.

        NOTE(review): parsing uses a naive str.split(',') and will
        mis-split quoted fields that contain commas; the csv module would
        be the robust fix, but it would change which lines get classified
        as malformed, so it is left as-is and only flagged.
        """
        if len(argv) < 1:
            print('Missing input file path')
            sys.exit(1)

        in_file_path = argv[0]
        base = in_file_path[:len(in_file_path) - 4]  # strip trailing ".csv"
        out_file_path = base + '_out.csv'
        rem_file_path = base + '_rem.csv'
        completed_lines_hash = set()  # MD5 digests of entries already written
        line_number = -1              # 0-based index of the current line
        write_count = 0               # unique entries written to out_file
        cache = ''                    # pending fragment of a multi-line entry

        # Bug fix: context managers guarantee all three files are closed
        # even if an exception is raised mid-run (the original opened them
        # bare and leaked the handles on any error path).
        with open(in_file_path, 'r', encoding='utf8') as in_file, \
             open(out_file_path, 'w', encoding='utf8') as out_file, \
             open(rem_file_path, 'w', encoding='utf8') as rem_file:
            for line in in_file:
                line_number += 1
                fields = line.split(',')
                if len(fields) < 10:
                    # Entry apparently spans multiple physical lines
                    # (embedded newline inside a field): glue it onto the
                    # previous fragment and re-check the field count.
                    line = cache.strip('\n') + line
                    cache = line
                    fields = line.split(',')
                    if len(fields) > 9:
                        print(f'Recovered with line {line_number}:\n{line}')
                        cache = ''
                    else:
                        print(f'Missing fields in line {line_number}:\n{line}')
                        rem_file.write(line)
                        continue
                else:
                    cache = ''

                if line_number == 0:
                    # Bug fix: the original never wrote the header row, so
                    # the output file was not a re-importable Bitwarden CSV.
                    out_file.write(line)
                    continue

                # Normalize the URI to its domain so the same site with
                # different paths dedupes to a single entry.
                domain = urlparse(fields[URI]).netloc
                if domain:
                    fields[URI] = domain
                token = fields[URI] + fields[USERNAME] + fields[PASSWORD]
                hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
                if hashValue not in completed_lines_hash:
                    out_file.write(line)
                    completed_lines_hash.add(hashValue)
                    write_count += 1
                else:
                    rem_file.write(line)
                    # Uncomment for verbose mode
                    # print(f'Skipping duplicate on line {line_number}:\n{line}')

        # line_number is the 0-based index of the last line read, so it equals
        # the count of data (non-header) lines; everything not written to the
        # output is reported here (duplicates plus malformed lines).
        dup_count = line_number - write_count
        print(f'\nOutput file: {out_file_path}\n{write_count} unique entries saved')
        print(f'\n{dup_count} duplicates saved to {rem_file_path}')

    # Script entry point: forward the CLI arguments (minus the program name)
    # to main().
    # NOTE(review): indentation was lost in this paste — in the real file the
    # main(...) call must be indented under the if statement.
    if __name__ == "__main__":
    main(sys.argv[1:])