Created
February 8, 2022 08:14
-
-
Save mnot/7f8de5920f84ef77eea13f81016e3a5f to your computer and use it in GitHub Desktop.
Revisions
-
mnot created this gist
Feb 8, 2022. There are no files selected for viewing.
#!/usr/bin/env python3
"""Use the W3C API to understand what document licenses are in use.

Walks every specification known to the W3C API, fetches the latest
version of each one, and prints a tab-separated line with the spec's
URL, title, responsible group, status, and the license link found in
its copyright boilerplate.
"""

import re
import sys
import time
from urllib.parse import urljoin, urlparse, urlunparse

import requests
import requests_cache
from bs4 import BeautifulSoup, SoupStrainer

API_KEY = "REPLACE_ME"  # personal W3C API key -- must be replaced before running
RETRY_MAX = 3  # give up after this many connection retries
RETRY_WAIT = 2  # seconds to sleep between retries

# HTTP session with a persistent on-disk cache, so reruns are cheap.
s = requests_cache.CachedSession("W3C_cache")
# Collapses runs of non-word characters so link text compares loosely.
ws = re.compile(r"\W+")
# Restrict parsing to the element(s) carrying the copyright boilerplate.
COPYRIGHT = SoupStrainer(class_="copyright")


def apifetch(url):
    """Fetch a W3C API resource as JSON, following pagination.

    *url* may be absolute or relative to https://api.w3.org/. The API
    key is appended to the query string. When the response carries a
    "next" link, later pages are fetched recursively and their
    "_embedded" members concatenated into the returned dict.
    """
    url = urljoin("https://api.w3.org/", url)
    parts = urlparse(url)
    # Append the key without producing a stray leading "&" when the URL
    # had no query string at all (the original emitted "?&apikey=...").
    if parts.query:
        query = f"{parts.query}&apikey={API_KEY}"
    else:
        query = f"apikey={API_KEY}"
    fetch_url = urlunparse(parts._replace(query=query))
    response = fetch(fetch_url, headers={"accept": "application/json"})
    results = response.json()
    if "next" in results.get("_links", {}):
        next_results = apifetch(results["_links"]["next"]["href"])
        results = combine_members(results, next_results)
    return results


def htmlfetch(spec_details, retries=0):
    """Report one specification's license as a tab-separated stdout line.

    *spec_details* is a specification resource from the W3C API.
    *retries* is accepted for backward compatibility and is unused.
    Writes a diagnostic to stderr and returns None when the spec's HTML
    cannot be fetched.
    """
    url = spec_details["shortlink"]
    title = spec_details["title"]
    status = spec_details["_links"]["latest-version"]["title"]
    group_url = spec_details["_links"]["latest-version"]["href"] + "/deliverers"
    deliverers = apifetch(group_url)["_links"].get("deliverers", [{"title": "Unknown"}])
    group = deliverers[0]["title"]
    try:
        html = fetch(url)
    except AssertionError as why:  # fetch() signals a non-2xx status this way
        sys.stderr.write(f"* {str(why)}\n")
        return
    soup = BeautifulSoup(html.text, "html.parser", parse_only=COPYRIGHT)
    license_url = findlicense(soup)  # renamed local: `license` shadows a builtin
    print(f"{url}\t{title}\t{group}\t{status}\t{license_url}")


def fetch(url, headers=None, retries=0):
    """GET *url* through the cached session, retrying connection errors.

    Raises AssertionError on a non-200/206 status (callers catch exactly
    that type), and exits the process after RETRY_MAX failed connection
    attempts.
    """
    try:
        response = s.get(url, headers=headers)
    except requests.exceptions.ConnectionError:
        if retries > RETRY_MAX:
            sys.stderr.write(f"Max retries for {url}; aborting.\n")
            sys.exit(1)
        time.sleep(RETRY_WAIT)
        sys.stderr.write(f"Retrying {url}\n")
        return fetch(url, headers=headers, retries=retries + 1)
    # Explicit raise instead of `assert`, which `python -O` would strip;
    # AssertionError is kept because htmlfetch() catches that type.
    if response.status_code not in (200, 206):
        raise AssertionError(f"{response.status_code} on {url}")
    return response


def findlicense(copyright):
    """Return the href of the document-license link in *copyright* markup.

    Each link's text is lowercased with punctuation runs collapsed, then
    compared against the known names of W3C document-license links.
    Returns "Unknown" when no such link is present.
    """
    known_names = (
        "document use",
        "document license",
        "document licensing",
        "permissive document license",
    )
    for tag in copyright.find_all("a"):
        text = ws.sub(" ", tag.get_text(strip=True).strip().lower())
        if text in known_names:
            return tag["href"]
    return "Unknown"


def combine_members(a, b):
    """Merge the "_embedded" members of two paginated API responses.

    Concatenates each embedded list in *a* with its counterpart in *b*.
    Members that appear only in *b* are included as well (previously
    they were silently dropped).
    """
    combined = {"_embedded": {}}
    for member in set(a["_embedded"]) | set(b["_embedded"]):
        combined["_embedded"][member] = (
            a["_embedded"].get(member, []) + b["_embedded"].get(member, [])
        )
    return combined


def main():
    """Enumerate every specification and report each one's license."""
    results = apifetch("/specifications?embed=true")
    n = 0
    for n, spec_details in enumerate(results["_embedded"]["specifications"], start=1):
        htmlfetch(spec_details)
        sys.stderr.write(f"{n}\n")  # progress counter on stderr


if __name__ == "__main__":
    main()