quick check for valid links on a webpage
#!/usr/bin/env python
'''
check_link.py

Parses a URL, and checks the links in it for validity.
Normally, does not check links internal to the target site.
Normally, only reports problem links.

Usage:
    check_link.py [-i] [-v] <url>...

Options:
    -i --internal  check links internal to the site, also;
    -v --verbose   report all link outcomes, good and bad;
    -h --help      show this message and exit;
'''
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
from colorama import init as colorama_init
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer

# This is a modification of a gist; it got me quickly started for now.
# Original:
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git

def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session
    msg = None  # the normal "ok" is no message
    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head
    try:
        # NOTE: amazon.com denies requests from python scripts, so we use
        # a session with an updated 'User-Agent' throughout ('session');
        # amazon.com remembers when a session.get() came from a python agent,
        # and then denies later session.get() calls with a
        # 503 - Service Unavailable, even after the 'User-Agent' is updated.
        # OPTIMIZATION:
        # we try a light-weight session.head() call first;
        # if it fails for a domain with a 405 (Method Not Allowed), then
        # we retry with a full session.get(), and remember the domain so
        # we always take the long way for it afterwards.
        resp = retrieve(address)
        if resp.status_code == 405:
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}'
    if resp.status_code in [301, 308,
                            400, 401, 402, 403, 404, 405, 408, 409, 410,
                            501, 502, 503]:
        msg = f'{resp.status_code} - {resp.reason} => {address}'
        # TODO: scrub other permanent redirection codes to include in this:
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - {resp.reason} => {address}'
    return msg

def pattern_adjust(link_address, rbase=None):
    '''
    Returns the "adjusted" (address, netloc) for a link;
    returns (None, None) for links that should not be followed,
    i.e. addresses local to the site, unless the --internal
    option is set.
    '''
    global website
    global INTERNAL
    if not link_address or link_address.startswith('#'):  # in-page anchor
        return (None, None)
    # TODO: relative links are resolved against the global "website";
    #       this has potential to break if we ever recurse beyond the
    #       top-level URLs given on the command line.
    r = urlsplit(link_address)
    # don't follow links internal to the site, unless asked to:
    if not INTERNAL and r.netloc == rbase.netloc:
        return (None, None)
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
        # scheme-less link: either protocol-relative ('//host/path')
        # or site-relative ('/path', './path')
        d = urlunsplit(r)
        if d.startswith('//'):
            # protocol-relative: assume https
            return ("https://" + d[2:], r.netloc)
        # site-relative: resolve against the site being checked;
        # these are internal, so only follow them with --internal
        return (urljoin(website, link_address), rbase.netloc) \
            if INTERNAL else (None, None)
    elif r.scheme == '' and r.netloc == '':
        # e.g. bare query strings ('?page=2')
        return (urljoin(website, link_address), rbase.netloc) \
            if INTERNAL else (None, None)
    else:
        # absolute link: check it as-is
        return (link_address, r.netloc)

def extract_link(address):
    global link_status
    global session
    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    # the partitioned pieces of the URL we're checking
    rbase = urlsplit(address)
    response = session.get(address)
    for key, value in tags.items():
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        print(link_status[p])

if __name__ == "__main__":
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']   # e.g. "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once:
    link_status = {}
    # sites which don't accept 'head' requests (dynamic);
    # a set of urlsplit().netloc values, added to by check():
    FULL_GET = {'www.amazon.com'}
    # for places like amazon.com, which deny the default python user agent,
    # use this session throughout the script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---')
        extract_link(website)
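
As a side note, the address handling in pattern_adjust leans entirely on the standard library's urlsplit and urljoin. The short sketch below is not part of the script; it uses made-up URLs to show how those two calls take apart and resolve the three kinds of links the function distinguishes: absolute, protocol-relative, and site-relative.

# Standalone sketch (made-up URLs) of the stdlib calls pattern_adjust relies on.
from urllib.parse import urljoin, urlsplit

page = "https://example.com/books/index.html"   # a hypothetical page being checked

# Absolute link: has a scheme and a netloc; checked as-is.
r = urlsplit("https://other.org/post")
print(r.scheme, r.netloc, r.path)       # -> https other.org /post

# Protocol-relative link: no scheme, but a netloc; check() needs a scheme prepended.
r = urlsplit("//cdn.example.net/logo.png")
print(r.netloc, r.path)                 # -> cdn.example.net /logo.png

# Site-relative link: no scheme, no netloc; resolve against the page it came from.
print(urljoin(page, "covers/1.jpg"))    # -> https://example.com/books/covers/1.jpg
print(urljoin(page, "/about"))          # -> https://example.com/about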
This is a bit of a work in progress: a script to check, non-recursively, the links on a URL for validity. It's just starting to come along.