robertknight · September 30, 2025 14:26 · robertknight · Sep 30, 2025
diff --git a/export.py b/export.py
 #!/usr/bin/env python3

 import argparse
 import json
 import os
 import sys
 import time
 from datetime import datetime, timezone
 from typing import List, Dict, Any, Optional

 import requests
 from requests import Response

 # Base URL of the Hypothesis API; endpoint paths such as "/profile" and
 # "/search" are appended to it by the functions below.
 API_BASE = "https://api.hypothes.is/api"

 def get_session(token: str) -> requests.Session:
    """Build a ``requests.Session`` preconfigured for the Hypothesis API.

    Args:
        token: API token sent as a ``Bearer`` credential on every request.

    Returns:
        A session whose default headers carry the authorization, JSON
        ``Accept`` and ``User-Agent`` values, and whose default query
        parameters are an empty dict.
    """
    default_headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json",
        "User-Agent": "hypothesis-export/1.0 (+https://hypothes.is/)",
    }
    session = requests.Session()
    session.headers.update(default_headers)
    # No default query parameters; callers pass them per request.
    session.params = {}
    return session

 def safe_request(session: requests.Session, method: str, url: str, **kwargs) -> Response:
    """Send a request, retrying on rate limiting and transient server errors.

    Up to seven attempts are made. Responses with status 429, 500, 502, 503
    or 504 are retried after a pause: the ``Retry-After`` header is honoured
    when it is a whole number of seconds, otherwise an exponential backoff
    (2s, 4s, ... capped at 32s) is used. A warning is printed to stderr
    before each pause.

    Args:
        session: Session used to send the request.
        method: HTTP method name, e.g. ``"GET"``.
        url: URL to request.
        **kwargs: Extra keyword arguments forwarded to ``session.request``.
            A ``timeout`` given here overrides the 60 second default.

    Returns:
        The first response whose status code is below 400.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` for a
            non-retryable error status, or when every attempt ended in a
            retryable error status.
    """
    retryable = (429, 500, 502, 503, 504)
    max_attempts = 7
    # Apply the default timeout without clashing with a caller-supplied one
    # (passing both would raise TypeError for a duplicated keyword).
    kwargs.setdefault("timeout", 60)

    # Backoff state is kept separate from the actual delay so that a
    # server-provided Retry-After (possibly 0) cannot reset or inflate it.
    backoff = 1.0
    for attempt in range(1, max_attempts + 1):
        resp = session.request(method, url, **kwargs)
        if resp.status_code < 400:
            return resp
        # Stop on non-retryable errors, and do not sleep after the last try.
        if resp.status_code not in retryable or attempt == max_attempts:
            break
        wait_for = resp.headers.get("Retry-After")
        if wait_for and wait_for.isdigit():
            delay = float(wait_for)
        else:
            backoff = min(backoff * 2, 32.0)
            delay = backoff
        print(f"[warn] {resp.status_code} {resp.reason}; retrying in {delay:.1f}s...", file=sys.stderr)
        time.sleep(delay)
    # Surface the last error response as an exception.
    resp.raise_for_status()
    return resp  # only reached if raise_for_status() did not raise

 def get_userid(session: requests.Session) -> str:
    """Return the ``userid`` reported by the API's ``/profile`` endpoint.

    Args:
        session: Session used to perform the request.

    Returns:
        The value of the ``userid`` field of the profile response.

    Raises:
        RuntimeError: If the profile response has no truthy ``userid``.
    """
    profile = safe_request(session, "GET", f"{API_BASE}/profile").json()
    userid = profile.get("userid")
    if userid:
        return userid
    raise RuntimeError("Could not determine userid from /api/profile response.")

 def fetch_all_annotations(session: requests.Session, userid: str, limit: int = 100) -> List[Dict[str, Any]]:
    """Download all annotations of ``userid`` by paging through ``/search``.

    Pages are requested in ascending ``updated`` order. Every request after
    the first passes the ``updated`` value of the previous page's last row
    as ``search_after``. Paging stops at the first page without rows. One
    progress line per page is printed to stdout.

    NOTE(review): if ``search_after`` is strictly "greater than", rows that
    share the exact ``updated`` value of a page's last row could be skipped;
    confirm against the API documentation.

    Args:
        session: Session used to perform the requests.
        userid: Value sent as the ``user`` search parameter.
        limit: Value sent as the ``limit`` search parameter (page size).

    Returns:
        The rows of all pages, in the order they were received.

    Raises:
        RuntimeError: If the last row of a page has no string ``updated``
            value, or if the cursor fails to advance between pages.
    """
    annotations: List[Dict[str, Any]] = []
    search_after: Optional[str] = None
    total_reported: Optional[int] = None

    # Ascending "updated" order keeps the search_after cursor monotonic.
    params = {
        "user": userid,
        "limit": limit,
        "sort": "updated",
        "order": "asc",
    }

    page = 0
    while True:
        pg_params = dict(params)
        if search_after:
            pg_params["search_after"] = search_after
        resp = safe_request(session, "GET", f"{API_BASE}/search", params=pg_params)
        payload = resp.json()
        rows = payload.get("rows", [])
        if total_reported is None:
            total_reported = payload.get("total")  # may be approximate

        # An empty page is the only end-of-data signal used here.
        if not rows:
            break

        annotations.extend(rows)
        page += 1

        # Next cursor = last row's "updated" timestamp.
        last_updated = rows[-1].get("updated")
        if not isinstance(last_updated, str):
            raise RuntimeError("Annotation missing 'updated' timestamp needed for search_after pagination.")
        if last_updated == search_after:
            # The same cursor would fetch the same page again, forever.
            raise RuntimeError("search_after cursor did not advance; aborting to avoid an endless loop.")
        search_after = last_updated

        # Progress line; the reported total is appended when available.
        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
        progress = f"[{now}] page {page} — fetched {len(rows)} (batch size {limit}); total so far {len(annotations)}"
        if total_reported is not None:
            progress += f"/{total_reported}"
        print(progress)

    return annotations

 def main():
    """Command-line entry point: export the user's annotations to a JSON file.

    Parses ``--token``, ``--output`` and ``--batch-size``, resolves the
    userid for the token, downloads all annotations and writes them to the
    output file as indented UTF-8 JSON.

    Exits with status 2 when no token is available or the batch size is
    outside 1..200, and with status 1 on an HTTP error or any other
    exception raised during the export.
    """
    parser = argparse.ArgumentParser(description="Export all Hypothesis annotations for the authenticated user.")
    parser.add_argument(
        "-t", "--token",
        default=os.getenv("HYPOTHESIS_API_TOKEN") or os.getenv("H_TOKEN"),
        help="Hypothesis API token. Defaults to $HYPOTHESIS_API_TOKEN or $H_TOKEN.",
    )
    parser.add_argument(
        "-o", "--output",
        default="annotations.json",
        help="Output JSON file (default: annotations.json)",
    )
    parser.add_argument(
        "-n", "--batch-size",
        type=int,
        default=100,
        help="Batch size (limit) per request (default: 100, max 200).",
    )
    args = parser.parse_args()

    if not args.token:
        print("Error: provide an API token via --token or HYPOTHESIS_API_TOKEN / H_TOKEN env var.", file=sys.stderr)
        sys.exit(2)

    if args.batch_size < 1 or args.batch_size > 200:
        print("Error: batch size must be between 1 and 200.", file=sys.stderr)
        sys.exit(2)

    session = get_session(args.token)

    try:
        userid = get_userid(session)
        print(f"[info] Exporting annotations for {userid}")
        anns = fetch_all_annotations(session, userid, limit=args.batch_size)
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(anns, f, ensure_ascii=False, indent=2)
        print(f"[done] Wrote {len(anns)} annotations to {args.output}")
    except requests.HTTPError as e:
        # A requests Response is falsy for 4xx/5xx statuses, so an
        # ``x and x.text`` test would print the Response object instead of
        # its body; compare against None explicitly.
        response = getattr(e, "response", None)
        body = response.text if response is not None else None
        print(f"[error] HTTP error: {e} — response: {body}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report any other failure and exit non-zero.
        print(f"[error] {e}", file=sys.stderr)
        sys.exit(1)

 # Run the exporter only when this file is executed as a script.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	import argparse
	import json
	import os
	import sys
	import time
	from datetime import datetime, timezone
	from typing import List, Dict, Any, Optional

	import requests
	from requests import Response

	# Base URL of the Hypothesis API; endpoint paths such as "/profile" and
	# "/search" are appended to it by the functions below.
	API_BASE = "https://api.hypothes.is/api"

	def get_session(token: str) -> requests.Session:
	    """Build a ``requests.Session`` preconfigured for the Hypothesis API.

	    Args:
	        token: API token sent as a ``Bearer`` credential on every request.

	    Returns:
	        A session whose default headers carry the authorization, JSON
	        ``Accept`` and ``User-Agent`` values, and whose default query
	        parameters are an empty dict.
	    """
	    s = requests.Session()
	    s.headers.update({
	        "Authorization": f"Bearer {token}",
	        "Accept": "application/json",
	        "User-Agent": "hypothesis-export/1.0 (+https://hypothes.is/)",
	    })
	    s.params = {}  # default no params
	    return s

	def safe_request(session: requests.Session, method: str, url: str, **kwargs) -> Response:
	    """Send a request, retrying on rate limiting and transient server errors.

	    Up to seven attempts are made. Responses with status 429, 500, 502, 503
	    or 504 are retried after a pause: the ``Retry-After`` header is honoured
	    when it is a whole number of seconds, otherwise an exponential backoff
	    (2s, 4s, ... capped at 32s) is used. A warning is printed to stderr
	    before each pause.

	    Args:
	        session: Session used to send the request.
	        method: HTTP method name, e.g. ``"GET"``.
	        url: URL to request.
	        **kwargs: Extra keyword arguments forwarded to ``session.request``.
	            A ``timeout`` given here overrides the 60 second default.

	    Returns:
	        The first response whose status code is below 400.

	    Raises:
	        requests.HTTPError: Raised by ``raise_for_status`` for a
	            non-retryable error status, or when every attempt ended in a
	            retryable error status.
	    """
	    retryable = (429, 500, 502, 503, 504)
	    max_attempts = 7
	    # Apply the default timeout without clashing with a caller-supplied one
	    # (passing both would raise TypeError for a duplicated keyword).
	    kwargs.setdefault("timeout", 60)

	    # Backoff state is kept separate from the actual delay so that a
	    # server-provided Retry-After (possibly 0) cannot reset or inflate it.
	    backoff = 1.0
	    for attempt in range(1, max_attempts + 1):
	        resp = session.request(method, url, **kwargs)
	        if resp.status_code < 400:
	            return resp
	        # Stop on non-retryable errors, and do not sleep after the last try.
	        if resp.status_code not in retryable or attempt == max_attempts:
	            break
	        wait_for = resp.headers.get("Retry-After")
	        if wait_for and wait_for.isdigit():
	            delay = float(wait_for)
	        else:
	            backoff = min(backoff * 2, 32.0)
	            delay = backoff
	        print(f"[warn] {resp.status_code} {resp.reason}; retrying in {delay:.1f}s...", file=sys.stderr)
	        time.sleep(delay)
	    # Surface the last error response as an exception.
	    resp.raise_for_status()
	    return resp  # only reached if raise_for_status() did not raise

	def get_userid(session: requests.Session) -> str:
	    """Return the ``userid`` reported by the API's ``/profile`` endpoint.

	    Args:
	        session: Session used to perform the request.

	    Returns:
	        The value of the ``userid`` field of the profile response.

	    Raises:
	        RuntimeError: If the profile response has no truthy ``userid``.
	    """
	    resp = safe_request(session, "GET", f"{API_BASE}/profile")
	    data = resp.json()
	    userid = data.get("userid")
	    if not userid:
	        raise RuntimeError("Could not determine userid from /api/profile response.")
	    return userid

	def fetch_all_annotations(session: requests.Session, userid: str, limit: int = 100) -> List[Dict[str, Any]]:
	    """Download all annotations of ``userid`` by paging through ``/search``.

	    Pages are requested in ascending ``updated`` order. Every request after
	    the first passes the ``updated`` value of the previous page's last row
	    as ``search_after``. Paging stops at the first page without rows. One
	    progress line per page is printed to stdout.

	    NOTE(review): if ``search_after`` is strictly "greater than", rows that
	    share the exact ``updated`` value of a page's last row could be skipped;
	    confirm against the API documentation.

	    Args:
	        session: Session used to perform the requests.
	        userid: Value sent as the ``user`` search parameter.
	        limit: Value sent as the ``limit`` search parameter (page size).

	    Returns:
	        The rows of all pages, in the order they were received.

	    Raises:
	        RuntimeError: If the last row of a page has no string ``updated``
	            value, or if the cursor fails to advance between pages.
	    """
	    annotations: List[Dict[str, Any]] = []
	    search_after: Optional[str] = None
	    total_reported: Optional[int] = None

	    # Ascending "updated" order keeps the search_after cursor monotonic.
	    params = {
	        "user": userid,
	        "limit": limit,
	        "sort": "updated",
	        "order": "asc",
	    }

	    page = 0
	    while True:
	        pg_params = dict(params)
	        if search_after:
	            pg_params["search_after"] = search_after
	        resp = safe_request(session, "GET", f"{API_BASE}/search", params=pg_params)
	        payload = resp.json()
	        rows = payload.get("rows", [])
	        if total_reported is None:
	            total_reported = payload.get("total")  # may be approximate

	        # An empty page is the only end-of-data signal used here.
	        if not rows:
	            break

	        annotations.extend(rows)
	        page += 1

	        # Next cursor = last row's "updated" timestamp.
	        last_updated = rows[-1].get("updated")
	        if not isinstance(last_updated, str):
	            raise RuntimeError("Annotation missing 'updated' timestamp needed for search_after pagination.")
	        if last_updated == search_after:
	            # The same cursor would fetch the same page again, forever.
	            raise RuntimeError("search_after cursor did not advance; aborting to avoid an endless loop.")
	        search_after = last_updated

	        # Progress line; the reported total is appended when available.
	        now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S %Z")
	        progress = f"[{now}] page {page} — fetched {len(rows)} (batch size {limit}); total so far {len(annotations)}"
	        if total_reported is not None:
	            progress += f"/{total_reported}"
	        print(progress)

	    return annotations

	def main():
	    """Command-line entry point: export the user's annotations to a JSON file.

	    Parses ``--token``, ``--output`` and ``--batch-size``, resolves the
	    userid for the token, downloads all annotations and writes them to the
	    output file as indented UTF-8 JSON.

	    Exits with status 2 when no token is available or the batch size is
	    outside 1..200, and with status 1 on an HTTP error or any other
	    exception raised during the export.
	    """
	    parser = argparse.ArgumentParser(description="Export all Hypothesis annotations for the authenticated user.")
	    parser.add_argument(
	        "-t", "--token",
	        default=os.getenv("HYPOTHESIS_API_TOKEN") or os.getenv("H_TOKEN"),
	        help="Hypothesis API token. Defaults to $HYPOTHESIS_API_TOKEN or $H_TOKEN.",
	    )
	    parser.add_argument(
	        "-o", "--output",
	        default="annotations.json",
	        help="Output JSON file (default: annotations.json)",
	    )
	    parser.add_argument(
	        "-n", "--batch-size",
	        type=int,
	        default=100,
	        help="Batch size (limit) per request (default: 100, max 200).",
	    )
	    args = parser.parse_args()

	    if not args.token:
	        print("Error: provide an API token via --token or HYPOTHESIS_API_TOKEN / H_TOKEN env var.", file=sys.stderr)
	        sys.exit(2)

	    if args.batch_size < 1 or args.batch_size > 200:
	        print("Error: batch size must be between 1 and 200.", file=sys.stderr)
	        sys.exit(2)

	    session = get_session(args.token)

	    try:
	        userid = get_userid(session)
	        print(f"[info] Exporting annotations for {userid}")
	        anns = fetch_all_annotations(session, userid, limit=args.batch_size)
	        with open(args.output, "w", encoding="utf-8") as f:
	            json.dump(anns, f, ensure_ascii=False, indent=2)
	        print(f"[done] Wrote {len(anns)} annotations to {args.output}")
	    except requests.HTTPError as e:
	        # A requests Response is falsy for 4xx/5xx statuses, so an
	        # ``x and x.text`` test would print the Response object instead of
	        # its body; compare against None explicitly.
	        response = getattr(e, "response", None)
	        body = response.text if response is not None else None
	        print(f"[error] HTTP error: {e} — response: {body}", file=sys.stderr)
	        sys.exit(1)
	    except Exception as e:
	        # Top-level boundary: report any other failure and exit non-zero.
	        print(f"[error] {e}", file=sys.stderr)
	        sys.exit(1)

	# Run the exporter only when this file is executed as a script.
	if __name__ == "__main__":
	    main()
No results found