Skip to content

Instantly share code, notes, and snippets.

@robertknight
Created September 30, 2025 14:26
Show Gist options
  • Select an option

  • Save robertknight/6cfac2f72993367f4c871d42f37b68fc to your computer and use it in GitHub Desktop.

Select an option

Save robertknight/6cfac2f72993367f4c871d42f37b68fc to your computer and use it in GitHub Desktop.
Hypothesis annotations export script
#!/usr/bin/env python3
import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
import requests
from requests import Response
API_BASE = "https://api.hypothes.is/api"
def get_session(token: str) -> requests.Session:
    """Build a requests Session pre-configured for the Hypothesis API.

    Sets the bearer-token auth header, asks for JSON responses, and
    identifies this script via a User-Agent string.
    """
    session = requests.Session()
    session.headers.update({
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
        "User-Agent": "hypothesis-export/1.0 (+https://hypothes.is/)",
    })
    session.params = {}  # default no params
    return session
def safe_request(session: requests.Session, method: str, url: str, **kwargs) -> Response:
    """Issue an HTTP request, retrying transient failures with backoff.

    Statuses 429/500/502/503/504 are retried up to 7 attempts, waiting the
    numeric ``Retry-After`` header value when present, otherwise doubling the
    delay (capped at 32s). Any other error status raises immediately via
    ``raise_for_status``; so does the final failure once retries run out.
    """
    transient = (429, 500, 502, 503, 504)
    delay = 1.0
    for _attempt in range(7):
        resp = session.request(method, url, timeout=60, **kwargs)
        if resp.status_code < 400:
            return resp
        if resp.status_code not in transient:
            # Non-transient client error: fail fast.
            resp.raise_for_status()
        retry_after = resp.headers.get("Retry-After")
        if retry_after and retry_after.isdigit():
            delay = float(retry_after)
        else:
            delay = min(delay * 2, 32.0)
        print(f"[warn] {resp.status_code} {resp.reason}; retrying in {delay:.1f}s...", file=sys.stderr)
        time.sleep(delay)
    # Retry budget exhausted: surface the last error.
    resp.raise_for_status()
    return resp  # not reached
def get_userid(session: requests.Session) -> str:
    """Return the authenticated user's id, as reported by /api/profile.

    Raises RuntimeError if the profile response carries no userid.
    """
    profile = safe_request(session, "GET", f"{API_BASE}/profile").json()
    userid = profile.get("userid")
    if userid:
        return userid
    raise RuntimeError("Could not determine userid from /api/profile response.")
def fetch_all_annotations(session: requests.Session, userid: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Fetch every annotation belonging to *userid* from the /search endpoint.

    Pages through results sorted by "updated" ascending, passing the last
    row's "updated" timestamp as the ``search_after`` cursor so each request
    resumes exactly where the previous one stopped.

    Args:
        session: Authenticated session from :func:`get_session`.
        userid: The Hypothesis userid to filter annotations by.
        limit: Batch size (``limit`` query param) per request.

    Returns:
        All annotation rows, oldest-updated first.

    Raises:
        RuntimeError: If a row lacks the string "updated" field required
            for ``search_after`` pagination.
    """
    annotations: List[Dict[str, Any]] = []
    search_after: Optional[str] = None
    total_reported: Optional[int] = None
    # We page in ascending updated order to make search_after monotonic.
    params = {
        "user": userid,
        "limit": limit,
        "sort": "updated",
        "order": "asc",
    }
    page = 0
    while True:
        pg_params = dict(params)
        if search_after:
            pg_params["search_after"] = search_after  # ISO 8601 timestamp (for chronological sorts)
        resp = safe_request(session, "GET", f"{API_BASE}/search", params=pg_params)
        payload = resp.json()
        rows = payload.get("rows", [])
        if total_reported is None:
            total_reported = payload.get("total")  # may be approximate
        if not rows:
            break  # no more pages
        annotations.extend(rows)
        page += 1
        # Next cursor = last row's "updated" timestamp.
        last_updated = rows[-1].get("updated")
        if not isinstance(last_updated, str):
            raise RuntimeError("Annotation missing 'updated' timestamp needed for search_after pagination.")
        search_after = last_updated
        # Progress line.
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
        if total_reported is not None:
            print(f"[{now}] page {page} — fetched {len(rows)} (batch size {limit}); total so far {len(annotations)}/{total_reported}")
        else:
            print(f"[{now}] page {page} — fetched {len(rows)} (batch size {limit}); total so far {len(annotations)}")
        # NOTE: the original trailing `if not len(rows): break` was unreachable
        # (empty pages already break at the top of the loop) and was removed.
    return annotations
def main():
    """CLI entry point: authenticate, fetch all annotations, write them as JSON."""
    parser = argparse.ArgumentParser(description="Export all Hypothesis annotations for the authenticated user.")
    parser.add_argument("-t", "--token",
                        default=os.getenv("HYPOTHESIS_API_TOKEN") or os.getenv("H_TOKEN"),
                        help="Hypothesis API token. Defaults to $HYPOTHESIS_API_TOKEN or $H_TOKEN.")
    parser.add_argument("-o", "--output", default="annotations.json",
                        help="Output JSON file (default: annotations.json)")
    parser.add_argument("-n", "--batch-size", type=int, default=100,
                        help="Batch size (limit) per request (default: 100, max 200).")
    args = parser.parse_args()

    # Validate inputs before touching the network.
    if not args.token:
        print("Error: provide an API token via --token or HYPOTHESIS_API_TOKEN / H_TOKEN env var.", file=sys.stderr)
        sys.exit(2)
    if not 1 <= args.batch_size <= 200:
        print("Error: batch size must be between 1 and 200.", file=sys.stderr)
        sys.exit(2)

    session = get_session(args.token)
    try:
        userid = get_userid(session)
        print(f"[info] Exporting annotations for {userid}")
        anns = fetch_all_annotations(session, userid, limit=args.batch_size)
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(anns, f, ensure_ascii=False, indent=2)
        print(f"[done] Wrote {len(anns)} annotations to {args.output}")
    except requests.HTTPError as e:
        body = getattr(e, "response", None) and e.response.text
        print(f"[error] HTTP error: {e} — response: {body}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"[error] {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@robertknight
Copy link
Author

This is a script generated by ChatGPT for exporting your own annotations from your Hypothesis account in JSON format, with some minor corrections from me. See the API docs at https://h.readthedocs.io/en/latest/api-reference/v1/#tag/annotations/paths/~1search/get. For authentication you will need an API token which you can get from https://hypothes.is/account/developer.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment