Skip to content

Instantly share code, notes, and snippets.

@gubatron
Created October 7, 2025 14:31
Show Gist options
  • Save gubatron/fc48992eec564c0a30cbab896ec678f3 to your computer and use it in GitHub Desktop.
Save gubatron/fc48992eec564c0a30cbab896ec678f3 to your computer and use it in GitHub Desktop.

Revisions

  1. gubatron created this gist Oct 7, 2025.
    79 changes: 79 additions & 0 deletions dbtc_newsletter_stories_getter
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,79 @@
    #!/usr/bin/env bash
    #
    # dbtc_newsletter_stories_getter
    #
    # Downloads a newsletter page, extracts the link that opens each <h2>
    # heading (at most MAX_STORIES distinct links, in page order) and prints
    # every story through `uninews`, separated by a "=============" divider.
    #
    # Usage:    dbtc_newsletter_stories_getter <newsletter_url>
    # Requires: curl, python3 (or python), uninews
    # Exit:     1 on bad usage, a missing tool, an extractor failure, or when
    #           no story link is found. A failing curl or uninews call aborts
    #           the run with that command's own status (set -e).
    set -euo pipefail

    # Upper bound on the number of stories fetched from one newsletter.
    readonly MAX_STORIES=5

    # Divider printed between two consecutive stories.
    readonly SEPARATOR=$'\n\n=============\n\n'

    # Python program that prints the story URLs, one per line.
    #   argv[1] path of the downloaded HTML
    #   argv[2] newsletter URL, used as the base for relative links
    #   argv[3] maximum number of URLs to print
    # The program is a single-quoted shell string, so it must not contain any
    # single-quote character.
    readonly EXTRACT_STORY_URLS_PY='
    import io
    import re
    import sys

    try:
        from html import unescape
        from urllib.parse import urljoin
    except ImportError:  # Python 2 module layout
        from HTMLParser import HTMLParser
        from urlparse import urljoin
        unescape = HTMLParser().unescape

    html_path, base_url, limit = sys.argv[1], sys.argv[2], int(sys.argv[3])

    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # and replace undecodable bytes rather than aborting on them.
    with io.open(html_path, encoding="utf-8", errors="replace") as page:
        html = page.read()

    # First anchor inside each <h2>; the href may use either quote style.
    # \x27 is the single quote, spelled as an escape because of the shell
    # quoting described above.
    pattern = re.compile(
        r"""<h2[^>]*>\s*<a[^>]+href\s*=\s*(?:"([^"]+)"|\x27([^\x27]+)\x27)""",
        re.IGNORECASE,
    )

    seen = set()
    for match in pattern.finditer(html):
        # Attribute values are HTML text: decode entities such as &amp;.
        href = unescape(match.group(1) or match.group(2)).strip()
        if not href:
            continue
        # Relative links are resolved against the newsletter URL; absolute
        # links pass through unchanged.
        url = urljoin(base_url, href)
        if url in seen:
            continue
        seen.add(url)
        print(url)
        if len(seen) == limit:
            break
    '

    # Print an error message to stderr and exit with status 1.
    die() {
      printf 'Error: %s\n' "$*" >&2
      exit 1
    }

    if [[ "$#" -ne 1 ]]; then
      printf 'Usage: %s <newsletter_url>\n' "${0##*/}" >&2
      exit 1
    fi

    readonly NEWSLETTER_URL="$1"

    command -v curl >/dev/null 2>&1 \
      || die "curl is required but not installed."

    if command -v python3 >/dev/null 2>&1; then
      PYTHON="python3"
    elif command -v python >/dev/null 2>&1; then
      PYTHON="python"
    else
      die "python3 or python is required but not installed."
    fi

    command -v uninews >/dev/null 2>&1 \
      || die "uninews command is required but not available in PATH."

    tmpfile=$(mktemp)
    trap 'rm -f -- "$tmpfile"' EXIT

    curl -fsSL "$NEWSLETTER_URL" -o "$tmpfile"

    # Capture the output with $(...) instead of reading from <(...): the exit
    # status of a process substitution is discarded, so a crashing extractor
    # would otherwise be reported as "no story URLs found".
    if ! extracted=$("$PYTHON" -c "$EXTRACT_STORY_URLS_PY" \
      "$tmpfile" "$NEWSLETTER_URL" "$MAX_STORIES"); then
      die "Failed to extract story URLs from the downloaded newsletter."
    fi

    story_urls=()
    while IFS= read -r line; do
      # A here-string of empty output still yields one empty line; skip it.
      if [[ -n "$line" ]]; then
        story_urls+=("$line")
      fi
    done <<<"$extracted"

    if [[ "${#story_urls[@]}" -eq 0 ]]; then
      die "Unable to find any story URLs in the provided newsletter."
    fi

    if [[ "${#story_urls[@]}" -lt "$MAX_STORIES" ]]; then
      printf 'Warning: only found %d story URLs in the newsletter.\n' \
        "${#story_urls[@]}" >&2
    fi

    for idx in "${!story_urls[@]}"; do
      # The divider goes between stories only, never before the first one.
      if [[ "$idx" -ne 0 ]]; then
        printf '%s' "$SEPARATOR"
      fi
      uninews "${story_urls[$idx]}"
    done