Skip to content

Instantly share code, notes, and snippets.

@serif
Last active May 2, 2025 04:16
Show Gist options
  • Select an option

  • Save serif/a1281c676cf5a1f77af6ff1a25255a85 to your computer and use it in GitHub Desktop.

Select an option

Save serif/a1281c676cf5a1f77af6ff1a25255a85 to your computer and use it in GitHub Desktop.

Revisions

  1. serif revised this gist Nov 27, 2023. 1 changed file with 5 additions and 0 deletions.
    5 changes: 5 additions & 0 deletions bwclean2.py
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,9 @@
    #!/usr/bin/env python3
    # updated 2023-11-27
    # updated 2023-10-12
    # updated 2021
    # updated 2020
    # created 2018
    import sys
    import hashlib
    from urllib.parse import urlparse
  2. serif revised this gist Nov 27, 2023. 1 changed file with 70 additions and 64 deletions.
    134 changes: 70 additions & 64 deletions bwclean2.py
    Original file line number Diff line number Diff line change
    @@ -1,80 +1,86 @@
    #!/usr/bin/env python3

    # bwclean2.py
    # Removes duplicates from Bitwarden export .csv
    # 2019-02-09
    # 2023-10-12

    import sys
    import hashlib
    from urllib.parse import urlparse

    # Field ordinals in Bitwarden CSV
    FOLDER = 0
    FAVORITE = 1
    TYPE = 2
    NAME = 3
    NOTES = 4
    FIELDS = 5
    REPROMPT = 6
    URI = 7
    USERNAME = 8
    PASSWORD = 9
    TOTP = 10



    def main(argv):


    # Fields in Bitwarden CSV
    f = 'folder,favorite,type,name,notes,fields,reprompt,login_uri,login_username,login_password,login_totp'.split(',')

    if len(argv) < 1:
    print('Missing input file path')
    sys.exit(1)

    in_file_path = argv[0]
    out_file_path = in_file_path[0:(len(in_file_path)-4)]+'_out.csv'
    rem_file_path = in_file_path[0:(len(in_file_path)-4)]+'_rem.csv'
    sys.exit('Supply input file path as command argument')

    in_path = argv[0]
    csv = '.csv'
    csv_out = '_out' + csv
    csv_rem = '_rem' + csv
    out_path = in_path.replace(csv, csv_out)
    rem_path = in_path.replace(csv, csv_rem)
    completed_lines_hash = set()
    line_number = -1
    write_count = 0
    cache = ''

    out_file = open(out_file_path, 'w', encoding = 'utf8')
    rem_file = open(rem_file_path, 'w', encoding = 'utf8')
    for line in open(in_file_path, 'r', encoding = 'utf8'):
    line_number += 1
    fields = line.split(',')
    if len(fields) < 10:
    # Add previous line if short
    line = cache.strip('\n') + line
    cache = line

    # Process file
    with open(out_path, 'w', encoding='utf8') as out_file, \
    open(rem_path, 'w', encoding='utf8') as rem_file, \
    open(in_path, 'r', encoding='utf8') as in_file:
    for line in in_file:
    line_number += 1

    # Validate .csv format
    if line_number == 0 and not line.strip() == ','.join(f):
    print('\nBitwarden CSV format has changed.')
    print('Contact author for update.')
    exit(1)

    # Skip empty lines
    if not line.strip():
    continue
    fields = line.split(',')
    if len(fields) > 9:
    print(f'Recovered with line {line_number}:\n{line}')

    # If the line has fewer fields than expected,
    # try to combine with the previous line
    if len(fields) < len(f):
    # Add previous line if short
    line = cache.strip('\n') + line
    cache = line
    fields = line.split(',')
    if len(fields) == len(f):
    print(f'Recovered with line {line_number}:\n{line}')
    cache = ''
    else:
    print(f'Missing fields in line {line_number}:\n{line}')
    rem_file.write(line)
    continue
    else:
    cache = ''

    # Generate an MD5 hash based on login URI, username, and password
    if line_number != 0:
    domain = urlparse(fields[f.index('login_uri')]).netloc
    if len(domain) > 0:
    fields[f.index('login_uri')] = domain
    token = fields[f.index('login_uri')]
    token += fields[f.index('login_username')]
    token += fields[f.index('login_password')]
    hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()

    # Write entry
    if hashValue not in completed_lines_hash:
    out_file.write(line)
    completed_lines_hash.add(hashValue)
    write_count += 1
    else:
    print(f'Missing fields in line {line_number}:\n{line}')
    rem_file.write(line)
    continue
    else:
    cache = ''
    if line_number != 0:
    domain = urlparse(fields[URI]).netloc
    if len(domain) > 0:
    fields[URI] = domain
    token = fields[URI] + fields[USERNAME] + fields[PASSWORD]
    hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
    if hashValue not in completed_lines_hash:
    out_file.write(line)
    completed_lines_hash.add(hashValue)
    write_count += 1
    else:
    rem_file.write(line)
    # Uncomment for verbose mode
    # print(f'Skipping duplicate on line {line_number}:\n{line}')
    out_file.close()
    rem_file.close()

    # print(f'Duplicate on line {line_number}:\n{line}')

    # Report
    dup_count = line_number - write_count
    print(f'\nOutput file: {out_file_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_file_path}')
    print(f'\nOutput file: {out_path}\n{write_count} unique entries saved')
    print(f'\n{dup_count} duplicates saved to {rem_path}')

    if __name__ == "__main__":
    main(sys.argv[1:])
    main(sys.argv[1:])
  3. serif created this gist Oct 13, 2023.
    80 changes: 80 additions & 0 deletions bwclean2.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,80 @@
    #!/usr/bin/env python3

    # bwclean2.py
    # Removes duplicates from Bitwarden export .csv
    # 2019-02-09
    # 2023-10-12

    import sys
    import hashlib
    from urllib.parse import urlparse

    # Field ordinals in the Bitwarden CSV export (0-based column indices).
    FOLDER = 0
    FAVORITE = 1
    TYPE = 2
    NAME = 3
    NOTES = 4
    FIELDS = 5
    REPROMPT = 6
    URI = 7
    USERNAME = 8
    PASSWORD = 9
    TOTP = 10

    def main(argv):
        """Remove duplicate login entries from a Bitwarden CSV export.

        argv[0] is the path to the export (expected to end in ".csv").
        Writes two files next to it: <name>_out.csv containing the header
        plus unique entries, and <name>_rem.csv containing removed
        (duplicate or malformed) lines.

        Two entries are duplicates when the MD5 of
        (URI netloc + username + password) has been seen before.
        MD5 is used only as a dedup key here, not for security.

        NOTE(review): parsing uses a naive str.split(',') and will
        mis-split quoted fields that contain commas; the csv module would
        be the robust fix, but it would change which lines get classified
        as malformed, so it is left as-is and only flagged.
        """
        if len(argv) < 1:
            print('Missing input file path')
            sys.exit(1)

        in_file_path = argv[0]
        base = in_file_path[:len(in_file_path) - 4]  # strip trailing ".csv"
        out_file_path = base + '_out.csv'
        rem_file_path = base + '_rem.csv'
        completed_lines_hash = set()  # MD5 digests of entries already written
        line_number = -1              # 0-based index of the current line
        write_count = 0               # unique entries written to out_file
        cache = ''                    # pending fragment of a multi-line entry

        # Bug fix: context managers guarantee all three files are closed
        # even if an exception is raised mid-run (the original opened them
        # bare and leaked the handles on any error path).
        with open(in_file_path, 'r', encoding='utf8') as in_file, \
             open(out_file_path, 'w', encoding='utf8') as out_file, \
             open(rem_file_path, 'w', encoding='utf8') as rem_file:
            for line in in_file:
                line_number += 1
                fields = line.split(',')
                if len(fields) < 10:
                    # Entry apparently spans multiple physical lines
                    # (embedded newline inside a field): glue it onto the
                    # previous fragment and re-check the field count.
                    line = cache.strip('\n') + line
                    cache = line
                    fields = line.split(',')
                    if len(fields) > 9:
                        print(f'Recovered with line {line_number}:\n{line}')
                        cache = ''
                    else:
                        print(f'Missing fields in line {line_number}:\n{line}')
                        rem_file.write(line)
                        continue
                else:
                    cache = ''

                if line_number == 0:
                    # Bug fix: the original never wrote the header row, so
                    # the output file was not a re-importable Bitwarden CSV.
                    out_file.write(line)
                    continue

                # Normalize the URI to its domain so the same site with
                # different paths dedupes to a single entry.
                domain = urlparse(fields[URI]).netloc
                if domain:
                    fields[URI] = domain
                token = fields[URI] + fields[USERNAME] + fields[PASSWORD]
                hashValue = hashlib.md5(token.rstrip().encode('utf-8')).hexdigest()
                if hashValue not in completed_lines_hash:
                    out_file.write(line)
                    completed_lines_hash.add(hashValue)
                    write_count += 1
                else:
                    rem_file.write(line)
                    # Uncomment for verbose mode
                    # print(f'Skipping duplicate on line {line_number}:\n{line}')

        # line_number is the 0-based index of the last line read, so it equals
        # the count of data (non-header) lines; everything not written to the
        # output is reported here (duplicates plus malformed lines).
        dup_count = line_number - write_count
        print(f'\nOutput file: {out_file_path}\n{write_count} unique entries saved')
        print(f'\n{dup_count} duplicates saved to {rem_file_path}')

    # Script entry point: forward the CLI arguments (minus the program name)
    # to main().
    # NOTE(review): indentation was lost in this paste — in the real file the
    # main(...) call must be indented under the if statement.
    if __name__ == "__main__":
    main(sys.argv[1:])