mtreviso · June 27, 2024 11:13 · Jun 27, 2024 · Jun 27, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/removed_unused_bibs.py b/removed_unused_bibs.py
@@ -1,3 +1,20 @@
+"""
+>>> python remove_unused_bibs.py -h
+
+usage: remove_unused_bibs.py [-h] [-b BIB_FILES] [-p PAPER_FILES] [-o OUT_FILE]
+
+Remove unused bibtex entries
+
+options:
+  -h, --help            show this help message and exit
+  -b BIB_FILES, --bib_files BIB_FILES
+      Comma-separated list of bib files (default: paper.bib)
+  -p PAPER_FILES, --paper_files PAPER_FILES
+      Comma-separated list of paper files (default: paper.tex)
+  -o OUT_FILE, --out_file OUT_FILE
+      Output path for the minimal bib file (default: min.bib)
+"""
+
 import re
 import argparse
 

diff --git a/removed_unused_bibs.py b/removed_unused_bibs.py
@@ -64,9 +64,16 @@ def isolate_bibtex_entry(s, start):
 
 def create_min_bib_file(bib_string, keys, keys_used, out_file):
     min_bib_string = ""
+
+    # keep @strings (useful for ACL Anthology)
+    for line in bib_string.split('\n'):
+        x = line.strip()
+        if x.startswith('@string{'):
+            min_bib_string += x + '\n'
+
     for k, used in zip(keys, keys_used):
         if used:
-            key_regex = r"@\w*{" + re.escape(k) + "(?=\,)"
+            key_regex = r"@\w*{" + re.escape(k) + r"(?=\,)"
             match = re.search(key_regex, bib_string)
             min_bib_string += isolate_bibtex_entry(bib_string, match.start()) + "\n\n"
 

diff --git a/removed_unused_bibs.py b/removed_unused_bibs.py
@@ -0,0 +1,99 @@
+import re
+import argparse
+
+def collect_all_bibs(bib_files):
+    # regex for anything that matches "@[some word]{[something],"
+    bibtex_key_regex = r"@\w*{.*(?=\,)"
+
+    # convert strings to lists in case only one file
+    if isinstance(bib_files, str):
+        bib_files = [bib_files]
+
+    # track the combined bib file as a string
+    all_bib_string = ""
+    all_keys = []
+
+    # read in each bib file
+    for file in bib_files:
+        with open(file, "r") as f:
+            bib_string = f.read()
+            all_bib_string += bib_string
+
+            # find all of the keys that match the regex
+            matches = re.findall(pattern=bibtex_key_regex, string=bib_string)
+
+            # trim off the preceding @whatever
+            all_keys += [m.split("{")[1] for m in matches]
+
+    # return unique set of keys and entire bib string
+    return sorted(list(set(all_keys))), all_bib_string
+
+
+def check_bibs_used(paper_files, keys):
+    tex_regex = r"\\cite.*{.*tom.*}"
+    if isinstance(paper_files, str):
+        paper_files = [paper_files]
+
+    # track whether each key is ever used
+    ever_used = [False for _ in range(len(keys))]
+    for file in paper_files:
+        with open(file, "r") as f:
+            paper_string = f.read()
+
+        # check each key in each file
+        for i, key in enumerate(keys):
+            # for markdown (JOSS paper) the check is simple
+            if file.endswith(".md") and f"@{key}" in paper_string:
+                ever_used[i] = True
+            # for latex we need to be a little smarter about checking for \cite commands
+            elif file.endswith(".tex") and re.search(tex_regex.replace("tom", re.escape(key)), paper_string) is not None:
+                ever_used[i] = True
+    return ever_used
+
+def isolate_bibtex_entry(s, start):
+    # isolate a bibtex entry based on closing curly braces
+    braces, cursor, not_opened = 0, start, True
+    while braces > 0 or not_opened:
+        if s[cursor] == "{":
+            braces += 1
+            not_opened = False
+        elif s[cursor] == "}":
+            braces -= 1
+        cursor += 1
+    return s[start: cursor]
+
+def create_min_bib_file(bib_string, keys, keys_used, out_file):
+    min_bib_string = ""
+    for k, used in zip(keys, keys_used):
+        if used:
+            key_regex = r"@\w*{" + re.escape(k) + "(?=\,)"
+            match = re.search(key_regex, bib_string)
+            min_bib_string += isolate_bibtex_entry(bib_string, match.start()) + "\n\n"
+
+    with open(out_file, "w") as f:
+        f.write(min_bib_string)
+
+
+def remove_unused_bibs(bib_files, paper_files, out_file):
+    keys, bib_string = collect_all_bibs(bib_files=bib_files)
+    keys_used = check_bibs_used(paper_files=paper_files, keys=keys)
+    create_min_bib_file(bib_string=bib_string, keys=keys, keys_used=keys_used, out_file=out_file)
+
+def main():
+    parser = argparse.ArgumentParser(description='Remove unused bibtex entries',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter) 
+    parser.add_argument('-b', '--bib_files', default="paper.bib", type=str,
+                        help="Comma-separated list of bib files")
+    parser.add_argument('-p', '--paper_files', default="paper.tex", type=str,
+                        help="Comma-separated list of paper files")
+    parser.add_argument('-o', '--out_file', default="min.bib", type=str,
+                        help="Output path for the minimal bib file")
+    args = parser.parse_args()
+
+    bib_files = args.bib_files.split(",")
+    paper_files = args.paper_files.split(",")
+
+    remove_unused_bibs(bib_files=bib_files, paper_files=paper_files, out_file=args.out_file)
+
+if __name__ == "__main__":
+    main()
No results found