#!/usr/bin/env ipython
from glob import glob
import os
import re
# from urlparse import urlparse
from urllib.parse import urlparse
from ipdb import set_trace as debug
from pprint import pprint as pp

"""
Documentation site generators/frameworks that we have used do not have the best
support for link validation. That is, they may correctly identify broken links when
the URL does not exist, but they don't even try to check whether the URL *fragment*
exists. This is a major usability issue, because when doc sections get moved around
between pages, or the section headings get renamed, the anchor links / fragments get
renamed as well, resulting in broken internal links on the site, which is quite
unprofessional.

It is often tricky to keep this in mind while in the middle of "refactoring" docs,
so instead of trying to remember, I wrote this script to scan the entire docs repo
for issues with URLs and a few other minor things.

This was originally written for Hugo, then modified to work for Docusaurus. The
general approach is probably applicable to Jekyll, but would need to be updated to
account for the directory structure.
"""

# TODO use an md parser
# TODO handle relative paths robustly
# TODO: check for external->internal links
# TODO: check if all files are discoverable via nav sidebar
# TODO make sure links are local-absolute and end in a slash
#   end in slash: that would avoid a redirect (giving us referrers in analytics and
#     lower latency), and I think the anchors get wiped during the redirect. This
#     could be done with a clever sed line.
#   local-absolute (/docs/introduction instead of ../../docs/introduction): for
#     consistency, and it would probably be easier to find and change later.

content_dir = os.getenv('HOME') + '/src/molecula-docs/docs/'

scan_dirs = [
    content_dir,
]

ignore_files = [
    ('docs', 'style-guide'),  # contains informative examples of broken links
]

file_pattern = '*.md'

# finds '[X](Y)', where X excludes ']', Y excludes ')', and X, Y are captured groups
markdown_link_regex = r'\[([^\]]*)\]\(([^)]*)\)'


def main():
    links, anchors = scan_files(scan_dirs, file_pattern)
    # debug()
    sidebars = load_sidebars()
    # print('XXX links')
    # print_links(links)
    # print('XXX anchors')
    # print_anchors(anchors)
    # debug()
    # print('XXX problems')
    find_problems(links, anchors)
    # find_unreachable_pages(pages, sidebars)


def load_sidebars():
    return []


def find_links_to_duplicates(file_data):
    # check if the anchor that a link links to has duplicate lines...
    pass


def internal_missing_slash(link):
    # if not link['is_pilosa']:
    #     # only want to check internal
    #     return False
    if link['is_image']:
        # don't care about images
        return False
    if link['url'].startswith('mailto'):
        # don't care about mail links
        return False
    parsed = urlparse(link['url'])
    if parsed.path == '' and parsed.fragment:
        # same-page fragment, not really a link
        return False
    elif not parsed.path.endswith('/'):
        return True
    return False


def internal_unmatched(link, anchors):
    # this isn't perfect, because it doesn't scan everything
    # TODO check that the page exists (how?)
    if link['is_anchor'] and link['netloc'] == '' and link['anchor_key'] not in anchors:
        return True
    return False


def find_problems(links, anchors):
    print('checking links in:')
    for d in scan_dirs:
        print(' %s' % d)
    print('bad links to other sections are false positives.\n')
    error_count = 0
    problem_files_count = 0
    for file_key, file_links in links.items():
        if file_key in ignore_files:
            continue
        errors = []
        for link in file_links:
            if link['scheme'] in ['mailto', 'tel']:
                # ignore these
                continue
            if not link['url'].startswith('http') and '.md' in link['raw']:
                error_type = '.md'
                errors.append('%s (%d) : %s -- %s' % (error_type, link['line'], link['anchor_key'], link['raw']))
            if (not link['url'].startswith('http') and not link['url'].startswith('#')
                    and not link['url'].startswith('/') and not link['url'].startswith('localhost')):
                error_type = 'no leading slash'
                errors.append('%s (%d) : %s -- %s' % (error_type, link['line'], link['anchor_key'], link['raw']))
            if internal_unmatched(link, anchors):
                # debug()
                error_type = 'bad link'
                errors.append('%s (%d) : %s -- %s' % (error_type, link['line'], link['anchor_key'], link['raw']))
            # if internal_missing_slash(link):
            #     errors.append('missing slash: %s -- %s' % (link['anchor_key'], link['raw']))
        if errors:
            problem_files_count += 1
            print('/'.join(file_key))
            for e in errors:
                error_count += 1
                print(' %s' % e)
    print('found %d errors across %d files' % (error_count, problem_files_count))


def print_anchors(anchors, filt_func=None):
    filt_func = filt_func or true_filter
    for anchor_key, anchor in anchors.items():
        if filt_func(anchor):
            print(' %d:%s' % (anchor['line'], anchor['raw']))


def print_links(links, filt_func=None):
    filt_func = filt_func or true_filter
    for file_key, file_links in links.items():
        print(file_key)
        for link in file_links:
            if filt_func(link):
                print(' %d:%s' % (link['line'], link['raw']))


def ref_filter(link):
    return link['is_ref']


def anchor_filter(link):
    return link['is_anchor']


def true_filter(x):
    return True


def get_file_data(fname, file_key):
    with open(fname) as f:
        lines = f.read().strip().split('\n')
    code_fence = False
    links = []
    anchors = {}
    # print(fname)
    for n, line in enumerate(lines, 1):
        # find anchors (any element with an `id` attribute in html)
        if line.startswith('```'):
            code_fence = not code_fence
        if code_fence:
            continue
        slugs = []
        if line.startswith('#'):
            # section header automatically gets an `id`
            slugs = [slugify(line)]
        elif 'id="' in line:
            # other things can have `id`s manually added (like in the pilosa.com glossary)
            slugs = re.findall('id="([^"]*)"', line)
        for slug in slugs:
            anchor_key = tuple(list(file_key) + [slug])
            # debug()
            if anchor_key in anchors:
                # same slug seen earlier in this file; record the duplicate line
                if 'duplicate_lines' in anchors[anchor_key]:
                    anchors[anchor_key]['duplicate_lines'].append(n)
                else:
                    anchors[anchor_key]['duplicate_lines'] = [n]
                continue
            anchors[anchor_key] = {
                'raw': line,
                'file': fname,
                'line': n,
                'slug': slug,
            }
            # print(anchor_key)

        # find links
        matches = re.findall(markdown_link_regex, line)
        for match in matches:
            url = match[1]
            if ' ' in url:
                # '[text](url "title")' style link; split the title off the url
                parts = url.split(' ')
                url = parts[0]
                title = ' '.join(parts[1:])
            else:
                title = ''
            parsed = urlparse(url)
            is_ref = '{{< ref ' in url
            if is_ref:
                print('!!!!!!!!!!! ref link found: %s:%d' % (fname, n))
            is_relative = parsed.netloc == ''
            anchor = parsed.fragment
            is_anchor = '#' in url
            is_pilosa = 'pilosa.com' in url or is_relative
            is_image = ('.gif' in url.lower() or '.jpg' in url.lower()
                        or '.png' in url.lower() or '.svg' in url.lower())
            anchor_key = None
            if not is_image and not parsed.netloc:
                if not parsed.path:
                    page_key = file_key
                else:
                    # TODO un-hack
                    # this "aliases" docs/latest to docs, so I don't have to deal with
                    # the multiple docs directories
                    parsed_path = parsed.path.replace('docs/latest', 'docs')
                    path_parts = [p for p in parsed_path.split('/') if p not in ['..', '']]
                    if len(path_parts) == 1:
                        path_parts = [file_key[0]] + path_parts
                    page_key = tuple(path_parts)
                anchor_key = tuple(list(page_key) + [anchor])
            # print(' %s %s' % (right_pad(url, 40), anchor_key))
            links.append({
                'raw': '[%s](%s)' % match,
                'file': fname,
                'line': n,
                'scheme': parsed.scheme,
                'netloc': parsed.netloc,
                'text': match[0],
                'title': title,
                'url': url,
                'is_ref': is_ref,
                'is_relative': is_relative,
                'is_anchor': is_anchor,
                'is_pilosa': is_pilosa,
                'is_image': is_image,
                'anchor': anchor,
                'anchor_key': anchor_key,
            })
    return links, anchors


def right_pad(s, n):
    spaces = ' ' * (n - len(s))
    return s + spaces


def slugify(text):
    # if it's a link, remove the url and link syntax, leaving only the display text
    m = re.search(r'\[(.*)\]\((.*)\)', text)
    if m:
        text = '# ' + m.groups()[0]
    text = text.lower().replace('#', '').strip()
    text = re.sub('[ _]', '-', text)
    text = re.sub(r'[^0-9a-z\-]', '', text)
    return text


def scan_files(scan_dirs, file_pattern):
    # returns two dicts:
    # links = {          # grouped by file
    #     file_key: [    # file_key is a tuple of path elements, like ('explanations', 'architecture')
    #         link_data,
    #         ...
    #     ],
    # }
    # anchors = {        # all in one dict for easy search
    #     anchor_key: anchor_data,
    #     ...
    # }
    links = {}
    anchors = {}
    files = {}
    print('scanning files...')
    for dir in scan_dirs:
        # files = glob(dir + '/' + file_pattern)
        files = glob(dir + '/**/' + file_pattern, recursive=True)
        for file in files:
            # file_key = tuple(file[:-3].split('/')[-2:])  # for having only two path components
            file_key = tuple(file[len(dir):].split('/'))
            # get rid of '.md' suffix for file key, because links shouldn't be using it.
            file_key = tuple([part.replace('.md', '') for part in file_key])
            # filekey = tuple(file[len(dir):].split('/'))
            # TODO: fix filekey, return files list with filename, markdown id, path key;
            #       use those to determine if the file is present in the sidebar
            print(' %s %s' % (file, file_key))
            l, a = get_file_data(file, file_key)
            links[file_key] = l
            anchors.update(a)
    return links, anchors


if __name__ == '__main__':
    main()
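
# Usage note (a sketch, not part of the checker itself): the parsing helpers above can
# be exercised interactively to see what gets extracted. The sample strings are made
# up, but the results match what the code returns:
#
#   >>> re.findall(markdown_link_regex, 'see [the docs](/docs/intro/#setup)')
#   [('the docs', '/docs/intro/#setup')]
#   >>> slugify('## Getting Started: Part 1')
#   'getting-started-part-1'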