Skip to content

Instantly share code, notes, and snippets.

@nilslindemann
Last active October 31, 2023 08:27
Show Gist options
  • Select an option

  • Save nilslindemann/4a996ac153c7ada29b88c8b4a7bab8c2 to your computer and use it in GitHub Desktop.

Select an option

Save nilslindemann/4a996ac153c7ada29b88c8b4a7bab8c2 to your computer and use it in GitHub Desktop.

Revisions

  1. Nils Lindemann revised this gist Oct 31, 2023. 1 changed file with 8 additions and 5 deletions.
    13 changes: 8 additions & 5 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -77,7 +77,7 @@ def iter_lines_and_context(

    def iter_file_search_results(
    root: Path,
    ignore: list[Callable[[Url], IfFilterMatches]] = [],
    ignore: list[Callable[[Url], IfFilterMatches]],
    linkpattern: re.Pattern[str] = re.compile(
    r"""
    \[ [^\]]* \]
    @@ -108,7 +108,7 @@ def iter_file_search_results(
    results.links[lnum] = line_results
    line_results.append((link, True)) # type: ignore
    for filter in ignore:
    for lnum, links in results.links.items():
    for links in results.links.values():
    for index, link in enumerate(links):
    if link[1] and filter(link[0]):
    links[index] = (link[0], False)
    @@ -174,7 +174,7 @@ def __call__(self, url: Url):

    @dataclass
    class IsWellFormed:
    desc: str = "a local relative link to a .md / .png / .py"
    desc: str = "a well-formed local relative link to a .md / .png / .py"

    word: PatternSnippet = r"""
    [a-z][a-z0-9]*
    @@ -192,7 +192,7 @@ class IsWellFormed:
    {word}[.](?: md|png|py )
    """

    hash: PatternSnippet = rf"""
    hash: PatternSnippet = r"""
    (?: [#][^#]+ )
    """

    @@ -227,7 +227,10 @@ def __call__(self, url: Url):
    if searchresult:
    nasty_urls.append(searchresult)
    if nasty_urls:
    print("\nThese links are not well-formed:\n")
    print(f"\nThese links under {DOCS_ROOT} are not:\n")
    for filter in filters:
    print(f"* {filter.desc}")
    print()
    print("\n".join(nasty_urls))
    else:
    print(f"\nEach link under {DOCS_ROOT} is:\n")
  2. Nils Lindemann revised this gist Oct 30, 2023. 1 changed file with 24 additions and 23 deletions.
    47 changes: 24 additions & 23 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -9,7 +9,8 @@

    Line = str
    LineNumber = int
    Link = str
    Url = str
    SearchResult = str
    PatternSnippet = str
    IfNotFiltered = bool
    IfFilterMatches = bool
    @@ -24,13 +25,13 @@
    CODE_EXAMPLES_REFERENCE_POINT = DOCS_ROOT / "en/docs"

    NONEXISTING_TARGETS: set[Path] = set()
    EXTERNAL_URLS: set[Link] = set()
    EXTERNAL_URLS: set[Url] = set()


    @dataclass
    class FileSearchResults:
    filepath: Path
    links: dict[LineNumber, list[tuple[Link, IfNotFiltered]]] = field(
    links: dict[LineNumber, list[tuple[Url, IfNotFiltered]]] = field(
    default_factory=dict
    )

    @@ -76,7 +77,7 @@ def iter_lines_and_context(

    def iter_file_search_results(
    root: Path,
    ignore: list[Callable[[Link], IfFilterMatches]] = [],
    ignore: list[Callable[[Url], IfFilterMatches]] = [],
    linkpattern: re.Pattern[str] = re.compile(
    r"""
    \[ [^\]]* \]
    @@ -117,7 +118,7 @@ def iter_file_search_results(

    def check_target_exists(
    file: Path,
    link: Link,
    url: Url,
    in_code: bool,
    without_hash_pattern: re.Pattern[str] = re.compile(
    r"""
    @@ -137,19 +138,19 @@ def check_target_exists(
    ):
    if in_code:
    reference_point = CODE_EXAMPLES_REFERENCE_POINT
    cleanlink = link
    cleanurl = url
    else:
    if link.startswith("https://"):
    EXTERNAL_URLS.add(link)
    if url.startswith("https://"):
    EXTERNAL_URLS.add(url)
    return
    reference_point = file.parent
    match = without_hash_pattern.match(link)
    match = without_hash_pattern.match(url)
    if not match:
    raise Exception("could not match link, this should not happen")
    cleanlink = match.group(1)
    if not cleanlink:
    cleanurl = match.group(1)
    if not cleanurl:
    return
    joined = reference_point / cleanlink
    joined = reference_point / cleanurl
    try:
    joined.resolve(strict=True)
    except FileNotFoundError:
    @@ -165,8 +166,8 @@ def check_target_exists(
    class IsExternal:
    desc: str = "a https link to an external url"

    def __call__(self, link: Link):
    if link.startswith("https://"):
    def __call__(self, url: Url):
    if url.startswith("https://"):
    return True
    return False

    @@ -212,22 +213,22 @@ class IsWellFormed:
    re.VERBOSE,
    )

    def __call__(self, link: Link):
    if self.wellformed_pat.match(link):
    def __call__(self, url: Url):
    if self.wellformed_pat.match(url):
    return True
    return False


    if __name__ == "__main__":
    filters = [IsExternal(), IsWellFormed()]
    nasty_links: list[str] = []
    for result in iter_file_search_results(DOCS_ROOT, ignore=filters):
    result = str(result)
    if result:
    nasty_links.append(result)
    if nasty_links:
    nasty_urls: list[SearchResult] = []
    for searchresult in iter_file_search_results(DOCS_ROOT, ignore=filters):
    searchresult = str(searchresult)
    if searchresult:
    nasty_urls.append(searchresult)
    if nasty_urls:
    print("\nThese links are not well-formed:\n")
    print("\n".join(nasty_links))
    print("\n".join(nasty_urls))
    else:
    print(f"\nEach link under {DOCS_ROOT} is:\n")
    for filter in filters:
  3. Nils Lindemann revised this gist Oct 30, 2023. 1 changed file with 15 additions and 9 deletions.
    24 changes: 15 additions & 9 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -7,9 +7,13 @@
    from pathlib import Path
    from typing import Any, Callable, Generator

    Link = str
    LineNumber = int
    Line = str
    LineNumber = int
    Link = str
    PatternSnippet = str
    IfNotFiltered = bool
    IfFilterMatches = bool
    IfInCode = bool

    # config

    @@ -26,7 +30,9 @@
    @dataclass
    class FileSearchResults:
    filepath: Path
    links: dict[LineNumber, list[tuple[Link, bool]]] = field(default_factory=dict)
    links: dict[LineNumber, list[tuple[Link, IfNotFiltered]]] = field(
    default_factory=dict
    )

    def __str__(self):
    result = [" ", str(self.filepath), "\n"]
    @@ -58,7 +64,7 @@ def iter_markdowns(path: Path) -> Generator[Path, Any, None]:

    def iter_lines_and_context(
    filepath: Path,
    ) -> Generator[tuple[LineNumber, Line, bool], Any, None]:
    ) -> Generator[tuple[LineNumber, Line, IfInCode], Any, None]:
    with filepath.open("r", encoding="utf-8") as f:
    in_code = False
    for lnum, line in enumerate(f, start=1):
    @@ -70,7 +76,7 @@ def iter_lines_and_context(

    def iter_file_search_results(
    root: Path,
    ignore: list[Callable[[Link], bool]] = [],
    ignore: list[Callable[[Link], IfFilterMatches]] = [],
    linkpattern: re.Pattern[str] = re.compile(
    r"""
    \[ [^\]]* \]
    @@ -169,23 +175,23 @@ def __call__(self, link: Link):
    class IsWellFormed:
    desc: str = "a local relative link to a .md / .png / .py"

    word: str = r"""
    word: PatternSnippet = r"""
    [a-z][a-z0-9]*
    (?:
    [-_]
    [a-z0-9]+
    )*
    """

    path: str = rf"""
    path: PatternSnippet = rf"""
    (?: [.][.]/ )* (?: {word}/ )*
    """

    filename: str = rf"""
    filename: PatternSnippet = rf"""
    {word}[.](?: md|png|py )
    """

    hash: str = rf"""
    hash: PatternSnippet = rf"""
    (?: [#][^#]+ )
    """

  4. Nils Lindemann revised this gist Oct 29, 2023. 1 changed file with 19 additions and 19 deletions.
    38 changes: 19 additions & 19 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -24,7 +24,7 @@


    @dataclass
    class FileLinks:
    class FileSearchResults:
    filepath: Path
    links: dict[LineNumber, list[tuple[Link, bool]]] = field(default_factory=dict)

    @@ -68,7 +68,7 @@ def iter_lines_and_context(
    yield (lnum, line, in_code)


    def iter_file_results(
    def iter_file_search_results(
    root: Path,
    ignore: list[Callable[[Link], bool]] = [],
    linkpattern: re.Pattern[str] = re.compile(
    @@ -88,9 +88,9 @@ def iter_file_results(
    """,
    re.VERBOSE,
    ),
    ) -> Generator[FileLinks, Any, None]:
    ) -> Generator[FileSearchResults, Any, None]:
    for filepath in iter_markdowns(root):
    results = FileLinks(filepath=filepath.resolve(strict=True))
    results = FileSearchResults(filepath=filepath.resolve(strict=True))
    for lnum, line, in_code in iter_lines_and_context(filepath):
    pattern = in_code_linkpattern if in_code else linkpattern
    for match in pattern.finditer(line):
    @@ -102,9 +102,9 @@ def iter_file_results(
    line_results.append((link, True)) # type: ignore
    for filter in ignore:
    for lnum, links in results.links.items():
    for index, match in enumerate(links):
    if match[1] and filter(match[0]):
    links[index] = (match[0], False)
    for index, link in enumerate(links):
    if link[1] and filter(link[0]):
    links[index] = (link[0], False)
    if results.links:
    yield results

    @@ -116,8 +116,8 @@ def check_target_exists(
    without_hash_pattern: re.Pattern[str] = re.compile(
    r"""
    ^
    ( [^#]* ) # path part before the hash
    (?: [#].* )? # hash
    ( [^#]* ) # part before the hash
    (?: [#].* )? # the hash
    $
    """,
    re.VERBOSE,
    @@ -140,7 +140,7 @@ def check_target_exists(
    match = without_hash_pattern.match(link)
    if not match:
    raise Exception("could not match link, this should not happen")
    cleanlink = match.group(1).strip()
    cleanlink = match.group(1)
    if not cleanlink:
    return
    joined = reference_point / cleanlink
    @@ -166,7 +166,7 @@ def __call__(self, link: Link):


    @dataclass
    class WellFormed:
    class IsWellFormed:
    desc: str = "a local relative link to a .md / .png / .py"

    word: str = r"""
    @@ -213,15 +213,15 @@ def __call__(self, link: Link):


    if __name__ == "__main__":
    filters = [IsExternal(), WellFormed()]
    all_results: list[str] = []
    for file_result in iter_file_results(DOCS_ROOT, ignore=filters):
    file_result = str(file_result)
    if file_result:
    all_results.append(file_result)
    if all_results:
    filters = [IsExternal(), IsWellFormed()]
    nasty_links: list[str] = []
    for result in iter_file_search_results(DOCS_ROOT, ignore=filters):
    result = str(result)
    if result:
    nasty_links.append(result)
    if nasty_links:
    print("\nThese links are not well-formed:\n")
    print("\n".join(all_results))
    print("\n".join(nasty_links))
    else:
    print(f"\nEach link under {DOCS_ROOT} is:\n")
    for filter in filters:
  5. Nils Lindemann revised this gist Oct 29, 2023. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -167,7 +167,7 @@ def __call__(self, link: Link):

    @dataclass
    class WellFormed:
    desc: str = "a local relative link to a .md or a .png"
    desc: str = "a local relative link to a .md / .png / .py"

    word: str = r"""
    [a-z][a-z0-9]*
  6. Nils Lindemann revised this gist Oct 29, 2023. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -8,7 +8,6 @@
    from typing import Any, Callable, Generator

    Link = str
    RawPath = str
    LineNumber = int
    Line = str

  7. Nils Lindemann revised this gist Oct 29, 2023. 1 changed file with 103 additions and 95 deletions.
    198 changes: 103 additions & 95 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -1,72 +1,78 @@
    # Link checker script for https://github.com/tiangolo/fastapi
    # Place it under fastapi/scripts or configure DOCS_ROOT below

    import os
    import re
    from dataclasses import dataclass, field
    from pathlib import Path
    from typing import Any, Callable, Generator

    link = str
    raw_path = str
    line_number = int
    line = str
    Link = str
    RawPath = str
    LineNumber = int
    Line = str

    # config

    SEARCH_HERE = "../docs"
    DOCS_ROOT = Path("../docs").resolve(strict=True)

    # end config

    CODE_EXAMPLES_REFERENCE_POINT = DOCS_ROOT / "en/docs"

    NONEXISTING_TARGETS: set[Path] = set()
    EXTERNAL_URLS: set[link] = set()
    EXTERNAL_URLS: set[Link] = set()


    @dataclass
    class FileResults:
    file: Path
    matches: dict[line_number, list[tuple[link, bool]]] = field(default_factory=dict)
    class FileLinks:
    filepath: Path
    links: dict[LineNumber, list[tuple[Link, bool]]] = field(default_factory=dict)

    def __str__(self):
    result = [str(self.file), "\n"]
    for lnum, matches in self.matches.items():
    minus_filter = [match[0] for match in matches if match[1]]
    if minus_filter:
    result = [" ", str(self.filepath), "\n"]
    for lnum, links in self.links.items():
    not_filtered = [match[0] for match in links if match[1]]
    if not_filtered:
    result.extend(
    [
    " ",
    "[",
    str(lnum),
    "] ",
    ", ".join(minus_filter),
    ", ".join(not_filtered),
    "\n",
    ]
    )
    if len(result) > 2:
    if len(result) > 3:
    return "".join(result)
    return ""


    def iter_markdowns(path: raw_path) -> Generator[Path, Any, None]:
    def iter_markdowns(path: Path) -> Generator[Path, Any, None]:
    for root, _, files in os.walk(path):
    root = Path(root)
    for file in files:
    if file.endswith(".md"):
    yield root / file


    def iter_relevant_lines(file: Path) -> Generator[tuple[line_number, line], Any, None]:
    with file.open("r", encoding="utf-8") as f:
    def iter_lines_and_context(
    filepath: Path,
    ) -> Generator[tuple[LineNumber, Line, bool], Any, None]:
    with filepath.open("r", encoding="utf-8") as f:
    in_code = False
    for lnum, line in enumerate(f, start=1):
    if line.lstrip().startswith("```"):
    in_code = not in_code
    continue
    if not in_code:
    yield (lnum, line)
    yield (lnum, line, in_code)


    def iter_file_results(
    path: str,
    ignore: list[Callable[[link], bool]] = [],
    linkpat: re.Pattern[str] = re.compile(
    root: Path,
    ignore: list[Callable[[Link], bool]] = [],
    linkpattern: re.Pattern[str] = re.compile(
    r"""
    \[ [^\]]* \]
    \(
    @@ -75,111 +81,116 @@ def iter_file_results(
    """,
    re.VERBOSE,
    ),
    ) -> Generator[FileResults, Any, None]:
    for file in iter_markdowns(path):
    file_results = FileResults(file=file)
    for lnum, line in iter_relevant_lines(file):
    for match in linkpat.finditer(line):
    check_target_exists(file, match.group(1))
    if lnum not in file_results.matches:
    in_code_linkpattern: re.Pattern[str] = re.compile(
    r"""
    [{]!> [^\S\n]*
    ( [\S]+?[.]py )
    [^\S\n]* ![}]
    """,
    re.VERBOSE,
    ),
    ) -> Generator[FileLinks, Any, None]:
    for filepath in iter_markdowns(root):
    results = FileLinks(filepath=filepath.resolve(strict=True))
    for lnum, line, in_code in iter_lines_and_context(filepath):
    pattern = in_code_linkpattern if in_code else linkpattern
    for match in pattern.finditer(line):
    link = match.group(1)
    check_target_exists(filepath, link, in_code)
    if lnum not in results.links:
    line_results = []
    file_results.matches[lnum] = line_results
    line_results.append((match.group(1), True)) # type: ignore
    results.links[lnum] = line_results
    line_results.append((link, True)) # type: ignore
    for filter in ignore:
    for lnum, matches in file_results.matches.items():
    for index, match in enumerate(matches):
    for lnum, links in results.links.items():
    for index, match in enumerate(links):
    if match[1] and filter(match[0]):
    matches[index] = (match[0], False)
    if file_results.matches:
    yield file_results
    links[index] = (match[0], False)
    if results.links:
    yield results


    def check_target_exists(
    file: Path,
    url: link,
    strip_hash_pat: re.Pattern[str] = re.compile(
    link: Link,
    in_code: bool,
    without_hash_pattern: re.Pattern[str] = re.compile(
    r"""
    ^
    ( [^#]* )
    (?: [#].* )?
    ( [^#]* ) # path part before the hash
    (?: [#].* )? # hash
    $
    """,
    re.VERBOSE,
    ),
    replace_pat: re.Pattern[str] = re.compile(
    lang_id_pattern: re.Pattern[str] = re.compile(
    r"""
    [a-z]{2}/docs/
    /[a-z]{2}/docs/ # matches the two digit language identifier
    """,
    re.VERBOSE,
    ),
    ):
    if url.startswith("https://"):
    EXTERNAL_URLS.add(url)
    return
    match = strip_hash_pat.match(url)
    if not match:
    raise Exception("could not match url, this should not happen")
    if in_code:
    reference_point = CODE_EXAMPLES_REFERENCE_POINT
    cleanlink = link
    else:
    targetpath = match.group(1).strip()
    if not targetpath:
    if link.startswith("https://"):
    EXTERNAL_URLS.add(link)
    return
    hashless_url = file.parent / targetpath
    reference_point = file.parent
    match = without_hash_pattern.match(link)
    if not match:
    raise Exception("could not match link, this should not happen")
    cleanlink = match.group(1).strip()
    if not cleanlink:
    return
    joined = reference_point / cleanlink
    try:
    joined.resolve(strict=True)
    except FileNotFoundError:
    try:
    hashless_url.resolve(strict=True)
    joined = Path(lang_id_pattern.sub("/en/docs/", str(joined))).resolve(
    strict=True
    )
    except FileNotFoundError:
    try:
    hashless_url = Path(
    replace_pat.sub("en/docs/", str(hashless_url))
    ).resolve(strict=True)
    except FileNotFoundError:
    NONEXISTING_TARGETS.add(hashless_url.resolve(strict=False))


    # @dataclass
    # class IsPng:
    # desc: str = "is a link to a .png"

    # def __call__(self, url: link):
    # if url.endswith(".png"):
    # return True
    # return False
    NONEXISTING_TARGETS.add(joined.resolve(strict=False))


    @dataclass
    class IsExternal:
    desc: str = "is a https link to an external url"
    desc: str = "a https link to an external url"

    def __call__(self, url: link):
    if url.startswith("https://"):
    def __call__(self, link: Link):
    if link.startswith("https://"):
    return True
    return False


    @dataclass
    class WellFormed:
    desc: str = "is a local relative link to a .md or a .png"
    desc: str = "a local relative link to a .md or a .png"

    word: str = r"""
    [a-z][a-z0-9]*
    (?:
    [a-z][a-z0-9]*
    -
    [-_]
    [a-z0-9]+
    )*
    [a-z][a-z0-9]*
    """

    path: str = rf"""
    (?: [.][.]/ )* (?: {word}/ )*
    """

    filename: str = rf"""
    {word}[.](?: md|png )
    {word}[.](?: md|png|py )
    """

    hash: str = rf"""
    (?: [#][^#]+ )
    """

    ok_url_pat: re.Pattern[str] = re.compile(
    wellformed_pat: re.Pattern[str] = re.compile(
    rf"""
    ^
    (?:
    @@ -196,31 +207,28 @@ class WellFormed:
    re.VERBOSE,
    )

    def __call__(self, url: link):
    if self.ok_url_pat.match(url):
    def __call__(self, link: Link):
    if self.wellformed_pat.match(link):
    return True
    return False


    if __name__ == "__main__":
    counter = 0
    filters = [IsExternal(), WellFormed()]
    for file_results in iter_file_results(SEARCH_HERE, ignore=filters):
    file_results = str(file_results)
    if file_results:
    print(file_results)
    counter += 1
    if not counter:
    print(f"All links under {SEARCH_HERE} match one of these criteria:")
    all_results: list[str] = []
    for file_result in iter_file_results(DOCS_ROOT, ignore=filters):
    file_result = str(file_result)
    if file_result:
    all_results.append(file_result)
    if all_results:
    print("\nThese links are not well-formed:\n")
    print("\n".join(all_results))
    else:
    print(f"\nEach link under {DOCS_ROOT} is:\n")
    for filter in filters:
    print(f" * {filter.desc}")

    # if EXTERNAL_URLS:
    # print("\nExternal URLs:")
    # for url in sorted(EXTERNAL_URLS):
    # print(f" {url}")
    print(f"* {filter.desc}")

    if NONEXISTING_TARGETS:
    print("\nThese non-existing targets are referenced by some links:")
    print("\nThese files are referenced in links but do not exist:")
    for url in sorted(NONEXISTING_TARGETS):
    print(f" {url}")
  8. Nils Lindemann revised this gist Oct 28, 2023. 1 changed file with 35 additions and 35 deletions.
    70 changes: 35 additions & 35 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -63,6 +63,37 @@ def iter_relevant_lines(file: Path) -> Generator[tuple[line_number, line], Any,
    yield (lnum, line)


    def iter_file_results(
    path: str,
    ignore: list[Callable[[link], bool]] = [],
    linkpat: re.Pattern[str] = re.compile(
    r"""
    \[ [^\]]* \]
    \(
    ( [^)]* )
    \)
    """,
    re.VERBOSE,
    ),
    ) -> Generator[FileResults, Any, None]:
    for file in iter_markdowns(path):
    file_results = FileResults(file=file)
    for lnum, line in iter_relevant_lines(file):
    for match in linkpat.finditer(line):
    check_target_exists(file, match.group(1))
    if lnum not in file_results.matches:
    line_results = []
    file_results.matches[lnum] = line_results
    line_results.append((match.group(1), True)) # type: ignore
    for filter in ignore:
    for lnum, matches in file_results.matches.items():
    for index, match in enumerate(matches):
    if match[1] and filter(match[0]):
    matches[index] = (match[0], False)
    if file_results.matches:
    yield file_results


    def check_target_exists(
    file: Path,
    url: link,
    @@ -104,37 +135,6 @@ def check_target_exists(
    NONEXISTING_TARGETS.add(hashless_url.resolve(strict=False))


    def iter_file_matches(
    path: str,
    ignore: list[Callable[[link], bool]] = [],
    linkpat: re.Pattern[str] = re.compile(
    r"""
    \[ [^\]]* \]
    \(
    ( [^)]* )
    \)
    """,
    re.VERBOSE,
    ),
    ) -> Generator[FileResults, Any, None]:
    for file in iter_markdowns(path):
    file_results = FileResults(file=file)
    for lnum, line in iter_relevant_lines(file):
    for match in linkpat.finditer(line):
    check_target_exists(file, match.group(1))
    if lnum not in file_results.matches:
    line_results = []
    file_results.matches[lnum] = line_results
    line_results.append((match.group(1), True)) # type: ignore
    for filter in ignore:
    for lnum, matches in file_results.matches.items():
    for index, match in enumerate(matches):
    if match[1] and filter(match[0]):
    matches[index] = (match[0], False)
    if file_results.matches:
    yield file_results


    # @dataclass
    # class IsPng:
    # desc: str = "is a link to a .png"
    @@ -205,10 +205,10 @@ def __call__(self, url: link):
    if __name__ == "__main__":
    counter = 0
    filters = [IsExternal(), WellFormed()]
    for file_matches in iter_file_matches(SEARCH_HERE, ignore=filters):
    file_matches = str(file_matches)
    if file_matches:
    print(file_matches)
    for file_results in iter_file_results(SEARCH_HERE, ignore=filters):
    file_results = str(file_results)
    if file_results:
    print(file_results)
    counter += 1
    if not counter:
    print(f"All links under {SEARCH_HERE} match one of these criteria:")
  9. Nils Lindemann revised this gist Oct 28, 2023. 1 changed file with 6 additions and 6 deletions.
    12 changes: 6 additions & 6 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -147,7 +147,7 @@ def iter_file_matches(

    @dataclass
    class IsExternal:
    desc: str = "is a link to an external file"
    desc: str = "is a https link to an external url"

    def __call__(self, url: link):
    if url.startswith("https://"):
    @@ -156,8 +156,8 @@ def __call__(self, url: link):


    @dataclass
    class WellFormatted:
    desc: str = "is a link which is well formatted"
    class WellFormed:
    desc: str = "is a local relative link to a .md or a .png"

    word: str = r"""
    (?:
    @@ -204,14 +204,14 @@ def __call__(self, url: link):

    if __name__ == "__main__":
    counter = 0
    filters = [IsExternal(), WellFormatted()]
    filters = [IsExternal(), WellFormed()]
    for file_matches in iter_file_matches(SEARCH_HERE, ignore=filters):
    file_matches = str(file_matches)
    if file_matches:
    print(file_matches)
    counter += 1
    if not counter:
    print(f"All links under {SEARCH_HERE} match one of those criteria:")
    print(f"All links under {SEARCH_HERE} match one of these criteria:")
    for filter in filters:
    print(f" * {filter.desc}")

    @@ -221,6 +221,6 @@ def __call__(self, url: link):
    # print(f" {url}")

    if NONEXISTING_TARGETS:
    print("\nThese non-existing Targets are referenced by some links:")
    print("\nThese non-existing targets are referenced by some links:")
    for url in sorted(NONEXISTING_TARGETS):
    print(f" {url}")
  10. Nils Lindemann created this gist Oct 28, 2023.
    226 changes: 226 additions & 0 deletions linkchecker.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,226 @@
    import os
    import re
    from dataclasses import dataclass, field
    from pathlib import Path
    from typing import Any, Callable, Generator

    # Type aliases; they only make the annotations in this file more descriptive.
    link = str  # target text of a markdown link
    raw_path = str  # directory path given as a plain string
    line_number = int  # line number within a markdown file (counted from 1)
    line = str  # text of one line read from a markdown file

    # config

    # Directory that is searched for markdown files.
    SEARCH_HERE = "../docs"

    # end config

    # Resolved paths of local link targets that could not be found on disk.
    NONEXISTING_TARGETS: set[Path] = set()
    # Links that start with "https://".
    EXTERNAL_URLS: set[link] = set()


    @dataclass
    class FileResults:
        """Links found in one markdown file, grouped by line number.

        ``matches`` maps a line number to the ``(link, flag)`` pairs seen on
        that line. Only pairs whose flag is true are included when the object
        is rendered with ``str()``.
        """

        file: Path
        matches: dict[line_number, list[tuple[link, bool]]] = field(default_factory=dict)

        def __str__(self):
            """Return the file name followed by one line per reported line number.

            Returns an empty string when no pair in ``matches`` is flagged true.
            """
            report_lines = []
            for number, entries in self.matches.items():
                pending = [target for target, flagged in entries if flagged]
                if not pending:
                    continue
                report_lines.append(" [" + str(number) + "] " + ", ".join(pending) + "\n")
            if not report_lines:
                return ""
            return str(self.file) + "\n" + "".join(report_lines)


    def iter_markdowns(path: raw_path) -> Generator[Path, Any, None]:
        """Yield the path of every file below ``path`` whose name ends in ``.md``.

        The tree is traversed with ``os.walk``; each yielded value is the
        walked directory joined with the file name.
        """
        for dirname, _subdirs, filenames in os.walk(path):
            directory = Path(dirname)
            yield from (
                directory / filename
                for filename in filenames
                if filename.endswith(".md")
            )


    def iter_relevant_lines(file: Path) -> Generator[tuple[line_number, line], Any, None]:
        """Yield ``(line number, line)`` for each line outside fenced code blocks.

        A line whose first non-blank characters are three backticks toggles the
        "inside a code block" state and is itself never yielded. Line numbers
        start at 1 and count every line of the file, including skipped ones.
        """
        inside_fence = False
        with file.open("r", encoding="utf-8") as handle:
            for number, text in enumerate(handle, start=1):
                if text.lstrip().startswith("```"):
                    inside_fence = not inside_fence
                elif not inside_fence:
                    yield (number, text)


    def check_target_exists(
        file: Path,
        url: link,
        strip_hash_pat: re.Pattern[str] = re.compile(
            r"""
            ^
            ( [^#]* )
            (?: [#].* )?
            $
            """,
            re.VERBOSE,
        ),
        replace_pat: re.Pattern[str] = re.compile(
            r"""
            [a-z]{2}/docs/
            """,
            re.VERBOSE,
        ),
    ):
        """Record ``url`` as external, or check that its local target exists.

        Links starting with ``https://`` are added to ``EXTERNAL_URLS`` and not
        checked further. For any other link the part before the first ``#`` is
        stripped and joined onto the directory of ``file``. If that path does
        not resolve, a second attempt is made with every ``replace_pat`` match
        in the path replaced by ``en/docs/``. When both attempts fail, the
        first path (resolved non-strictly) is added to ``NONEXISTING_TARGETS``.

        Raises:
            Exception: if ``strip_hash_pat`` does not match ``url``.
        """
        if url.startswith("https://"):
            EXTERNAL_URLS.add(url)
            return

        parsed = strip_hash_pat.match(url)
        if not parsed:
            raise Exception("could not match url, this should not happen")

        relative_target = parsed.group(1).strip()
        if not relative_target:
            # Nothing before the hash, so there is no path to look up.
            return

        candidate = file.parent / relative_target
        try:
            candidate.resolve(strict=True)
            return
        except FileNotFoundError:
            pass

        # Second attempt: the same path with the replacement applied.
        fallback = Path(replace_pat.sub("en/docs/", str(candidate)))
        try:
            fallback.resolve(strict=True)
        except FileNotFoundError:
            NONEXISTING_TARGETS.add(candidate.resolve(strict=False))


    def iter_file_matches(
        path: str,
        ignore: "list[Callable[[link], bool]] | None" = None,
        linkpat: re.Pattern[str] = re.compile(
            r"""
            \[ [^\]]* \]
            \(
            ( [^)]* )
            \)
            """,
            re.VERBOSE,
        ),
    ) -> Generator[FileResults, Any, None]:
        """Yield one ``FileResults`` per markdown file that contains links.

        Args:
            path: Directory handed to ``iter_markdowns``.
            ignore: Callables that receive a link and return true when the
                link should no longer be reported. ``None`` (the default)
                means no filtering.
            linkpat: Pattern whose first group captures the link target.

        Yields:
            A ``FileResults`` for every file in which ``linkpat`` matched at
            least once. Links accepted by a filter stay in the result but are
            flagged ``False``.
        """
        # A None default avoids sharing one list object between calls.
        filters = [] if ignore is None else ignore
        for file in iter_markdowns(path):
            file_results = FileResults(file=file)
            for lnum, line in iter_relevant_lines(file):
                for match in linkpat.finditer(line):
                    target = match.group(1)
                    check_target_exists(file, target)
                    # setdefault creates the per-line list on first use.
                    file_results.matches.setdefault(lnum, []).append((target, True))
            for accepts in filters:
                for entries in file_results.matches.values():
                    for index, (target, pending) in enumerate(entries):
                        # Only links not yet accepted are offered to the filter.
                        if pending and accepts(target):
                            entries[index] = (target, False)
            if file_results.matches:
                yield file_results


    # @dataclass
    # class IsPng:
    # desc: str = "is a link to a .png"

    # def __call__(self, url: link):
    # if url.endswith(".png"):
    # return True
    # return False


    @dataclass
    class IsExternal:
        """Callable check that accepts links starting with ``https://``."""

        desc: str = "is a link to an external file"

        def __call__(self, url: link):
            """Return ``True`` when ``url`` begins with ``https://``, else ``False``."""
            return url.startswith("https://")


    @dataclass
    class WellFormatted:
        """Callable check that accepts links matching ``ok_url_pat``.

        The pattern is assembled from the snippet attributes below and compiled
        with ``re.VERBOSE``, so whitespace inside the snippets is not part of
        the pattern. It accepts either a hash on its own, or an optional run of
        ``../`` and lowercase directory names followed by a ``.md`` / ``.png``
        file name and an optional hash.
        """

        desc: str = "is a link which is well formatted"

        # Lowercase alphanumeric parts joined by single hyphens.
        word: str = r"""
            (?:
                [a-z][a-z0-9]*
                -
            )*
            [a-z][a-z0-9]*
        """

        # Any number of "../" followed by any number of "<word>/".
        path: str = rf"""
            (?: [.][.]/ )* (?: {word}/ )*
        """

        filename: str = rf"""
            {word}[.](?: md|png )
        """

        # A "#" followed by at least one character that is not "#".
        hash: str = r"""
            (?: [#][^#]+ )
        """

        ok_url_pat: re.Pattern[str] = re.compile(
            rf"""
            ^
            (?:
                (?:
                    {path}
                    {filename}
                    {hash}?
                )
                |
                {hash}
            )
            $
            """,
            re.VERBOSE,
        )

        def __call__(self, url: link):
            """Return ``True`` when ``ok_url_pat`` matches ``url``, else ``False``."""
            return self.ok_url_pat.match(url) is not None


    if __name__ == "__main__":
        # Print a report for every file that still has unaccepted links.
        filters = [IsExternal(), WellFormatted()]
        anything_reported = False
        for found in iter_file_matches(SEARCH_HERE, ignore=filters):
            report = str(found)
            if not report:
                continue
            print(report)
            anything_reported = True

        # With nothing to report, list the criteria every link satisfied.
        if not anything_reported:
            print(f"All links under {SEARCH_HERE} match one of those criteria:")
            for criterion in filters:
                print(f" * {criterion.desc}")

        # Disabled: listing of the collected external URLs.
        # if EXTERNAL_URLS:
        # print("\nExternal URLs:")
        # for url in sorted(EXTERNAL_URLS):
        # print(f" {url}")

        if NONEXISTING_TARGETS:
            print("\nThese non-existing Targets are referenced by some links:")
            print("\n".join(f" {target}" for target in sorted(NONEXISTING_TARGETS)))