@JotaRata
Created July 28, 2025 04:16

paper-cleanup.yaml

    # organize configuration file
    # https://organize.readthedocs.io
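#
# How it works: the filecontent filter extracts each PDF's text and keeps only
# files containing both an "abstract" and an "introduction" heading (plain or
# letter-spaced), capturing the full text into the named group `text`; the
# python filter then searches that text for an arXiv ID or DOI, queries the
# arXiv / Crossref APIs for title, authors, and year, and the move action uses
# the returned values to build the new file name.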

rules:
  - name: Move and organize Scientific papers from the Downloads directory
    locations: ~/Downloads
    subfolders: false
    filters:
      - extension: pdf
      - filecontent: (?si)^(?=.*(?:abstract|a\s*b\s*s\s*t\s*r\s*a\s*c\s*t))(?=.*(?:introduction|i\s*n\s*t\s*r\s*o\s*d\s*u\s*c\s*t\s*i\s*o\s*n))(?P<text>.*)

      - python: |
          import re, requests, unicodedata
          import xml.etree.ElementTree as ET

          def parse_names(authors):
              # Build "Lastname_F" from the first author, dropping apostrophes and accents
              names = [f'{a.split()[-1]}_{a.split()[0][0]}' for a in authors][0].replace("'", '')
              names = ''.join(c for c in unicodedata.normalize('NFKD', names) if not unicodedata.combining(c))
              return names

          def parse_title(title):
              # Strip characters that are unsafe in file names; '/' is removed first,
              # which also turns '</scp>' into '<scp>', so both Crossref markup tags vanish
              return title.replace('\n', '').replace(':', '').replace(' ', '_').replace('/', '').replace('<scp>', '')

          text = filecontent.get("text")
          # Ignore everything from the references/bibliography onwards, so identifiers
          # of cited works are not mistaken for the paper's own
          cutoff = re.search(r'\b(references|r\s*e\s*f\s*e\s*r\s*e\s*n\s*c\s*e\s*s|bibliography)\b', text, re.IGNORECASE)
          if cutoff:
              text = text[:cutoff.start()]

          doi_match = re.search(r'10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+', text)
          # New-style arXiv IDs (e.g. 2107.01234) first, then old-style astro-ph IDs
          arxiv_match = re.search(r'arxiv[:\s]?\d{4}\.\d{4,5}(v\d+)?', text, re.I)
          if not arxiv_match:
              arxiv_match = re.search(r'arxiv[:\s]?astro-ph/\d{7}(v\d+)?', text, re.I)

          if arxiv_match:
              # Strip the 'arxiv' prefix; the separator may be ':', a space, or absent
              arxiv = re.sub(r'^arxiv[:\s]?', '', arxiv_match.group(), flags=re.I)
              print('Found ArXiv', arxiv)
              resp = requests.get(f'http://export.arxiv.org/api/query?id_list={arxiv}', timeout=5)
              if resp.status_code == 200:
                  # Parse the Atom feed returned by the arXiv API
                  prefix = '{http://www.w3.org/2005/Atom}'
                  root = ET.fromstring(resp.text)
                  entry = root.find(prefix + 'entry')
                  title = entry.find(prefix + 'title').text.strip()
                  authors = [a.find(prefix + 'name').text for a in entry.findall(prefix + 'author')]
                  year = entry.find(prefix + 'published').text[:4]
                  if len(authors) > 2:
                      authors = authors[:2] + ['et al.']
                  return {'title': parse_title(title), 'authors': parse_names(authors), 'year': year, 'code': arxiv}
          elif doi_match:
              doi = doi_match.group().rstrip('.,;\'"')
              print('Found DOI', doi)
              resp = requests.get(f'https://api.crossref.org/works/{doi}', timeout=5)
              if resp.status_code == 200:
                  data = resp.json()['message']
                  title = data.get('title', [' '])[0]
                  authors = [f"{a.get('given', '')} {a.get('family', '')}".strip() for a in data.get('author', [])]
                  year = data.get('issued', {}).get('date-parts', [[None]])[0][0]
                  return {'title': parse_title(title), 'authors': parse_names(authors), 'year': year, 'code': doi}
          # No usable identifier: fail the filter so the file is left untouched
          raise Exception('No valid arXiv or DOI found in main content')
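    # The keys of the dict returned by the python filter are exposed to the
    # actions below as {python.<key>} placeholders.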
    actions:
      - move: "~/Downloads/Papers/{python.authors}_{python.year}__{python.title}.pdf"
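
Usage (a sketch, assuming the organize CLI is installed and this file is used
as the active organize config):

    pip install organize-tool   # install the organize CLI
    organize sim                # dry run: preview which PDFs match and where they would go
    organize run                # actually move and rename the PDFs

The resulting name follows {python.authors}_{python.year}__{python.title}.pdf,
e.g. Lastname_F_2021__Some_Title.pdf (an illustrative example, not output from
a real paper).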