Created
September 30, 2025 14:26
-
-
Save robertknight/6cfac2f72993367f4c871d42f37b68fc to your computer and use it in GitHub Desktop.
Hypothesis annotations export script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from datetime import datetime, timezone | |
| from typing import List, Dict, Any, Optional | |
| import requests | |
| from requests import Response | |
# Base URL of the Hypothesis API; endpoint paths such as "/profile" and
# "/search" are appended to it by the functions below.
API_BASE = "https://api.hypothes.is/api"
def get_session(token: str) -> requests.Session:
    """Build a ``requests`` session pre-configured for the Hypothesis API.

    Args:
        token: API token, sent as a Bearer credential with every request.

    Returns:
        A session carrying the Authorization, Accept and User-Agent headers
        and an empty set of default query parameters.
    """
    default_headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
        "User-Agent": "hypothesis-export/1.0 (+https://hypothes.is/)",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    session.params = {}  # no default query parameters
    return session
def safe_request(session: requests.Session, method: str, url: str, **kwargs) -> Response:
    """Send a request through *session*, retrying 429/5xx responses with backoff.

    Args:
        session: Object whose ``request(method, url, **kwargs)`` performs the call.
        method: HTTP method name, e.g. ``"GET"``.
        url: Target URL.
        **kwargs: Extra keyword arguments forwarded to ``session.request``.
            ``timeout`` defaults to 60 seconds but may be overridden by the caller.

    Returns:
        The first response whose status code is below 400.

    Raises:
        Whatever ``resp.raise_for_status()`` raises: immediately for a
        non-retryable error status, or once all attempts have been used up
        on retryable ones.
    """
    retryable_statuses = (429, 500, 502, 503, 504)
    max_attempts = 7

    # Use setdefault rather than a hard-coded keyword so a caller-supplied
    # timeout does not collide with ours ("multiple values for keyword").
    kwargs.setdefault("timeout", 60)

    delay = 1.0
    for attempt in range(1, max_attempts + 1):
        resp = session.request(method, url, **kwargs)
        if resp.status_code < 400:
            return resp

        # Give up on non-retryable statuses, and on the final attempt skip
        # the pointless sleep before reporting the failure.
        if resp.status_code not in retryable_statuses or attempt == max_attempts:
            break

        # Honour a numeric Retry-After header; otherwise back off
        # exponentially, capped at 32 seconds.
        wait_for = resp.headers.get("Retry-After")
        if wait_for and wait_for.isdigit():
            delay = float(wait_for)
        else:
            delay = min(delay * 2, 32.0)
        print(f"[warn] {resp.status_code} {resp.reason}; retrying in {delay:.1f}s...", file=sys.stderr)
        time.sleep(delay)

    resp.raise_for_status()
    return resp  # only reached if raise_for_status() did not raise
def get_userid(session: requests.Session) -> str:
    """Look up the userid of the account that *session* authenticates as.

    Args:
        session: Session used to call the ``/profile`` endpoint.

    Returns:
        The ``userid`` value from the profile response.

    Raises:
        RuntimeError: If the profile response has no (or an empty) ``userid``.
    """
    profile = safe_request(session, "GET", f"{API_BASE}/profile").json()
    userid = profile.get("userid")
    if userid:
        return userid
    raise RuntimeError("Could not determine userid from /api/profile response.")
def fetch_all_annotations(session: requests.Session, userid: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Download all annotations of *userid* by paging through ``/search``.

    Pages are requested in ascending ``updated`` order. Every page after the
    first passes the ``updated`` value of the previous page's last row as
    ``search_after``. Paging ends at the first page that contains no rows.
    A progress line is printed to stdout after each non-empty page.

    Args:
        session: Session used for the API requests.
        userid: Value sent as the ``user`` search parameter.
        limit: Number of rows requested per page.

    Returns:
        The rows of all pages, concatenated in the order received.

    Raises:
        RuntimeError: If a page's last row has no string ``updated`` field,
            or if the paging cursor fails to advance between pages.
    """
    annotations: List[Dict[str, Any]] = []
    search_after: Optional[str] = None
    total_reported: Optional[int] = None

    # We page in ascending updated order to make search_after monotonic
    params = {
        "user": userid,
        "limit": limit,
        "sort": "updated",
        "order": "asc",
    }

    page = 0
    while True:
        pg_params = dict(params)
        if search_after:
            pg_params["search_after"] = search_after  # ISO 8601 timestamp (for chronological sorts)

        resp = safe_request(session, "GET", f"{API_BASE}/search", params=pg_params)
        payload = resp.json()
        rows = payload.get("rows", [])
        if total_reported is None:
            total_reported = payload.get("total")  # may be approximate

        # Stop on the first empty page.
        if not rows:
            break

        annotations.extend(rows)
        page += 1

        # Next cursor = last row's "updated" timestamp
        last_updated = rows[-1].get("updated")
        if not isinstance(last_updated, str):
            raise RuntimeError("Annotation missing 'updated' timestamp needed for search_after pagination.")
        if last_updated == search_after:
            # The next request would be identical to this one, so the loop
            # would never terminate; fail loudly instead.
            raise RuntimeError("search_after cursor did not advance; aborting to avoid an endless loop.")
        search_after = last_updated

        # Progress line; the reported total is appended only when known.
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
        progress = f"[{now}] page {page} — fetched {len(rows)} (batch size {limit}); total so far {len(annotations)}"
        if total_reported is not None:
            progress += f"/{total_reported}"
        print(progress)

    return annotations
def main():
    """Command-line entry point: export the token owner's annotations to JSON.

    Parses the command-line options, validates them, resolves the userid for
    the token, downloads all annotations and writes them to the output file.

    Exits with status 2 on invalid arguments and with status 1 when the
    export fails (HTTP error or any other exception).
    """
    parser = argparse.ArgumentParser(description="Export all Hypothesis annotations for the authenticated user.")
    parser.add_argument("-t", "--token", default=os.getenv("HYPOTHESIS_API_TOKEN") or os.getenv("H_TOKEN"),
                        help="Hypothesis API token. Defaults to $HYPOTHESIS_API_TOKEN or $H_TOKEN.")
    parser.add_argument("-o", "--output", default="annotations.json",
                        help="Output JSON file (default: annotations.json)")
    parser.add_argument("-n", "--batch-size", type=int, default=100,
                        help="Batch size (limit) per request (default: 100, max 200).")
    args = parser.parse_args()

    if not args.token:
        print("Error: provide an API token via --token or HYPOTHESIS_API_TOKEN / H_TOKEN env var.", file=sys.stderr)
        sys.exit(2)
    if args.batch_size < 1 or args.batch_size > 200:
        print("Error: batch size must be between 1 and 200.", file=sys.stderr)
        sys.exit(2)

    session = get_session(args.token)
    try:
        userid = get_userid(session)
        print(f"[info] Exporting annotations for {userid}")
        anns = fetch_all_annotations(session, userid, limit=args.batch_size)
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(anns, f, ensure_ascii=False, indent=2)
        print(f"[done] Wrote {len(anns)} annotations to {args.output}")
    except requests.HTTPError as e:
        # A requests.Response is falsy for 4xx/5xx statuses, so it must be
        # compared against None explicitly; a truthiness test would print
        # the Response object instead of its body.
        response = getattr(e, "response", None)
        body = response.text if response is not None else None
        print(f"[error] HTTP error: {e} — response: {body}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report any other failure and exit non-zero.
        print(f"[error] {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a script generated by ChatGPT for exporting your own annotations from your Hypothesis account in JSON format, with some minor corrections from me. See the API docs at https://h.readthedocs.io/en/latest/api-reference/v1/#tag/annotations/paths/~1search/get. For authentication you will need an API token which you can get from https://hypothes.is/account/developer.