Skip to content

Instantly share code, notes, and snippets.

@mnot
Created February 8, 2022 08:14
Show Gist options
  • Select an option

  • Save mnot/7f8de5920f84ef77eea13f81016e3a5f to your computer and use it in GitHub Desktop.

Select an option

Save mnot/7f8de5920f84ef77eea13f81016e3a5f to your computer and use it in GitHub Desktop.

Revisions

  1. mnot created this gist Feb 8, 2022.
    99 changes: 99 additions & 0 deletions w3c-licenses.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,99 @@
    #!/usr/bin/env python3

    """ Use the W3C API to understand what document licenses are in use."""

    import re
    import sys
    import time
    from urllib.parse import urlparse, urlunparse, urljoin

    from bs4 import BeautifulSoup, SoupStrainer
    import requests
    import requests_cache

    # Personal W3C API key appended to every API request (https://api.w3.org/).
    API_KEY = "REPLACE_ME"
    # Maximum number of retries after a connection error before aborting.
    RETRY_MAX = 3
    # Seconds to sleep between retries.
    RETRY_WAIT = 2

    # HTTP session backed by an on-disk cache ("W3C_cache") so reruns avoid re-fetching.
    s = requests_cache.CachedSession("W3C_cache")
    # Collapses runs of non-word characters; used to normalise link text in findlicense().
    ws = re.compile(r"\W+")
    # Restrict BeautifulSoup parsing to elements with class "copyright".
    COPYRIGHT = SoupStrainer(class_="copyright")


    def apifetch(url):
        """Fetch a W3C API resource as JSON, following pagination.

        ``url`` may be absolute or relative to https://api.w3.org/. The API key
        is appended as a query parameter. When the response carries a "next"
        link in ``_links``, subsequent pages are fetched recursively and their
        embedded members merged into a single result via combine_members().
        """
        url = urljoin("https://api.w3.org/", url)
        parts = urlparse(url)
        # Build the query conditionally: the original unconditionally appended
        # "&apikey=...", producing a malformed "?&apikey=..." when the URL had
        # no query string.
        if parts.query:
            query = f"{parts.query}&apikey={API_KEY}"
        else:
            query = f"apikey={API_KEY}"
        fetch_url = urlunparse(parts._replace(query=query))
        headers = {"accept": "application/json"}
        response = fetch(fetch_url, headers=headers)
        results = response.json()
        if "next" in results.get("_links", {}):
            next_uri = results["_links"]["next"]["href"]
            next_results = apifetch(next_uri)
            results = combine_members(results, next_results)
        return results


    def htmlfetch(spec_details, retries=0):
        """Fetch a specification's HTML and print a TSV line to stdout.

        Emits: shortlink, title, delivering group, latest-version status, and
        the license link found in the document's copyright block.

        ``spec_details`` is one member of the API's specification list.
        ``retries`` is unused here (retrying happens inside fetch()) but is
        kept for backward compatibility with existing callers.
        """
        url = spec_details["shortlink"]
        title = spec_details["title"]
        status = spec_details["_links"]["latest-version"]["title"]
        group_url = spec_details["_links"]["latest-version"]["href"] + "/deliverers"
        group = apifetch(group_url)["_links"].get("deliverers", [{"title": "Unknown"}])[0]["title"]
        try:
            html = fetch(url)
        except AssertionError as why:
            # fetch() signals non-200/206 responses with AssertionError;
            # log and skip this specification rather than aborting the run.
            sys.stderr.write(f"* {str(why)}\n")
            return
        # Parse only the copyright section (see COPYRIGHT strainer) to keep
        # parsing cheap on large specs.
        soup = BeautifulSoup(html.text, "html.parser", parse_only=COPYRIGHT)
        license_url = findlicense(soup)  # renamed: "license" shadows a builtin
        print(f"{url}\t{title}\t{group}\t{status}\t{license_url}")


    def fetch(url, headers=None, retries=0):
        """GET ``url`` through the cached session, retrying on connection errors.

        Raises AssertionError when the response status is not 200/206 (callers
        such as htmlfetch() catch this). After RETRY_MAX failed retries the
        process exits with status 1.
        """
        try:
            response = s.get(url, headers=headers)
        except requests.exceptions.ConnectionError:
            # ">=" so at most RETRY_MAX retries happen; the original ">"
            # allowed RETRY_MAX + 1 retries (off-by-one).
            if retries >= RETRY_MAX:
                sys.stderr.write(f"Max retries for {url}; aborting.\n")
                sys.exit(1)
            time.sleep(RETRY_WAIT)
            sys.stderr.write(f"Retrying {url}\n")
            return fetch(url, headers=headers, retries=retries + 1)
        # Raise explicitly instead of using "assert", which is stripped under
        # "python -O"; AssertionError is kept so existing handlers still work.
        if response.status_code not in (200, 206):
            raise AssertionError(f"{response.status_code} on {url}")
        return response


    def findlicense(copyright):
        """Return the href of the document-license link in a copyright block.

        Each <a> element's text is normalised (stripped, lowercased, runs of
        non-word characters collapsed to single spaces) and compared against
        the known license link labels. The first match's href is returned;
        "Unknown" if no link matches.
        """
        license_labels = (
            "document use",
            "document license",
            "document licensing",
            "permissive document license",
        )
        for anchor in copyright.find_all("a"):
            label = re.sub(r"\W+", " ", anchor.get_text(strip=True).strip().lower())
            if label in license_labels:
                return anchor["href"]
        return "Unknown"


    def combine_members(a, b):
        """Merge the "_embedded" member lists of two API result pages.

        Returns a new dict whose "_embedded" maps each member name found in
        EITHER page to a's list followed by b's. The original implementation
        took member names only from ``a``, silently dropping members present
        only in ``b``. Other top-level keys (e.g. "_links") are intentionally
        discarded; callers only consume "_embedded".
        """
        a_members = a.get("_embedded", {})
        b_members = b.get("_embedded", {})
        merged = {"_embedded": {}}
        # Union of member names so nothing from either page is lost.
        for member in a_members.keys() | b_members.keys():
            merged["_embedded"][member] = a_members.get(member, []) + b_members.get(member, [])
        return merged


    def main():
        """Print one TSV line per W3C specification; spec count goes to stderr."""
        results = apifetch("/specifications?embed=true")
        count = 0
        for count, spec in enumerate(results["_embedded"]["specifications"], start=1):
            htmlfetch(spec)
        # Report how many specifications were processed.
        sys.stderr.write(f"{count}\n")


    # Run only when executed as a script, not when imported as a module.
    if __name__ == "__main__":
        main()