#!/usr/bin/env python
'''
check_link.py

Parses a URL, and checks the links in it for validity.
Normally, does not check links to the source URL's own pages.
Normally, only reports problem links.

Usage: check_link.py [-i] [-v] [-b BASEURL] <website>...

Options:
    -b BASEURL --base=BASEURL  checking multiple pages on a site?
                               set the base, and the subpages to check
                               ('/' will check BASEURL)
    -i --internal              also check links internal to the site;
    -v --verbose               report all link outcomes, good and bad;
    -h --help                  Show this message and exit;

Examples:
    check_link.py -b https://mysite.io books blog   # check only 2 subpages
    check_link.py -b https://mysite.io / /blog      # check the home page too
'''
import sys
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
from colorama import init as colorama_init
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer

# this is a modification of a gist;
# it got me quickly started for now.
# Original:
# git@gist.github.com:2872d7f994d192188970408980267e6e.git


def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session

    msg = None  # the normal "ok" is no message

    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head

    try:
        # NOTE: amazon denies requests from python scripts, so we use
        # a session with an updated 'User-Agent' throughout ('session');
        # amazon.com remembers if the session.get() was from a python agent,
        # and then denies the session.get() with a 503 - Service Unavailable,
        # even if it updated its 'user-agent'
        #
        # OPTIMIZATION:
        # we try a light-weight session.head() call first;
        # if it fails for a domain w/ 405 (Method Not Allowed), then
        # we retry with a full session.get(), and record the netloc so
        # we always take the long way for that domain from then on;
        resp = retrieve(address)
        if resp.status_code == 405:
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}{Fore.RESET}'

    if resp.status_code in \
            [301, 308, 400, 401, 402, 403, 404, 405, 408, 409, 410,
             501, 502, 503]:
        msg = f'{resp.status_code} - {resp.reason} => {address}'
        # TODO: scrub other permanent redirection codes to include in this:
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n{" "*19}NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - ' \
              f'{resp.reason} => {address}{Fore.RESET}'

    return msg


def pattern_adjust(link_address, rbase=None):
    '''
    returns an "adjusted" (address, netloc) pair;
    doesn't follow local addresses,
    unless the option to follow internal addresses is set
    '''
    global website
    global INTERNAL

    # if we're checking local, might as well
    # check on-page, too - for typos
    if link_address[0] == '#' and not INTERNAL:   # local
        return (None, None)

    # create a local static var:
    # - depends on global "website"
    # if 'rbase' not in pattern_adjust.__dict__ \
    #     pattern_adjust.rbase = urlsplit(website)

    r = urlsplit(link_address)

    # don't follow local:
    if not INTERNAL and \
            r.netloc == rbase.netloc:
        return (None, None)

    # NOTE: I don't really understand
    # what this is doing, so annotating:
    # if relative URL (local)
    # TODO: I am getting convinced what this wants to do
    #       should be done w/ a simple urljoin()
    #       I'm also thinking this code branch isn't traversed;
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
        # reconstitute - it won't be a full path
        d = urlunsplit(r)
        # This if seems exceedingly wonky
        if d.startswith('//'):
            # if it starts with '//', throw that away...
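            # (for reference: urlsplit('//example.com/x') yields scheme=''
            # and netloc='example.com', and urlunsplit() gives back the
            # protocol-relative '//example.com/x' - the return below just
            # forces an https:// scheme onto it)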
            # m = re.search('(?<=//)\S+', d)
            # d = m.group(0)
            # TODO: if r.netloc is empty, then this
            #       could result in an incorrect URL:
            #       => if address = foo.com/something - then ok
            #       => if address relative: ./static/something - then trouble
            return ("https://" + d[2:], r.netloc)
        # no leading '//' means a relative path (r.netloc is empty);
        # resolve it against the site when checking internal links,
        # otherwise skip it - without this return the function would
        # fall off the end and hand the caller a bare None
        return (urljoin(website, link_address), r.netloc) \
            if INTERNAL else (None, None)
    elif r.scheme == '' and r.netloc == '':
        # is this what I want to do?
        # would I rather do urljoin(urlunsplit(rbase), link_address)?
        return (urljoin(website, link_address), r.netloc) \
            if INTERNAL else (None, None)
    else:
        return (link_address, r.netloc)


def string_trunc(s, field_width=73, fill='.'):
    '''
    usage:
        s, f, w = string_trunc(longurl)
        print(f']{s:{f}<{w}}[')
    returns: a truncated (if needed) string, a fill char,
             and a matching field_width
    '''
    str_width = len(s)
    if str_width > field_width:
        # room for 3 fill chars
        return s[:field_width-3], fill, field_width
    else:
        # this doesn't work: need real values
        # return s, None, None
        # a width of zero seems to skip padding, but the caller needs
        # the real str_width back to clear the progress line;
        # the fill could be anything (another "0", but...)
        return s, " ", str_width


def progress(msg):
    ''' hack to print progress '''
    if 'w' not in progress.__dict__:
        progress.w = 0

    # clear previous progress line
    print(f'\r{" "*progress.w}', end='', file=sys.stderr)

    s, f, w = string_trunc(msg)
    print(f'\r{s:{f}<{w}}', end='', file=sys.stderr)
    progress.w = w


def extract_link(address):
    global link_status
    global session

    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}

    # the partitioned pieces of the URL we're checking
    rbase = urlsplit(address)

    response = session.get(address)
    for key, value in tags.items():
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                # I'm jonesin' for some progress indicators
                progress(link[value])
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        # the '\r' is a hack to keep stdout readable
                        # alongside the progress messages
                        print('\r', end='', file=sys.stderr)
                        print(link_status[p])


if __name__ == "__main__":
    arguments = docopt(__doc__)
    BASEURL = arguments['--base']
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<website>']  # "https://davericho.com/books/"

    colorama_init()

    # to facilitate checking each link only once
    link_status = {}

    # sites which don't accept 'head' requests (dynamic);
    # populated with results of urlsplit().netloc
    # (a set, since check() adds to it on a 405)
    FULL_GET = {'www.amazon.com'}

    # for places like amazon.com, which will deny python scripts:
    # Now - use session throughout this script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})

    for website in websites:
        if BASEURL:
            website = urljoin(BASEURL, website)
        print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
        extract_link(website)