quick check for valid links on a webpage
#!/usr/bin/env python
'''
check_link.py

Parses a URL, and checks the links in it for validity.
Normally, does not check links internal to the target site.
Normally, only reports problem links.

Usage:
    check_link.py [-i] [-v] <url>...

Options:
    -i --internal  check links internal to the site, also;
    -v --verbose   report all link outcomes, good and bad;
    -h --help      show this message and exit;
'''
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
from colorama import init as colorama_init
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer

# This is a modification of a gist; it got me quickly started for now.
# Original:
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git

def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session
    msg = None  # the normal "ok" is no message
    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head
    try:
        # NOTE: amazon.com denies requests from python scripts, so we use
        # a session with an updated 'User-Agent' throughout ('session');
        # amazon.com remembers when a session.get() came from a python agent,
        # and then denies later session.get() calls with a
        # 503 - Service Unavailable, even after the 'User-Agent' is updated.
        # OPTIMIZATION:
        # we try a light-weight session.head() call first;
        # if it fails for a domain with a 405 (Method Not Allowed), then
        # we retry with a full session.get(), and remember the domain so
        # we always take the long way for it afterwards.
        resp = retrieve(address)
        if resp.status_code == 405:
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}'
    if resp.status_code in [301, 308,
                            400, 401, 402, 403, 404, 405, 408, 409, 410,
                            501, 502, 503]:
        msg = f'{resp.status_code} - {resp.reason} => {address}'
        # TODO: scrub other permanent redirection codes to include in this:
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - {resp.reason} => {address}'
    return msg

def pattern_adjust(link_address, rbase=None):
    '''
    Returns the "adjusted" (address, netloc) for a link;
    returns (None, None) for links that should not be followed,
    i.e. addresses local to the site, unless the --internal
    option is set.
    '''
    global website
    global INTERNAL
    if not link_address or link_address.startswith('#'):  # in-page anchor
        return (None, None)
    # TODO: relative links are resolved against the global "website";
    #       this has potential to break if we ever recurse beyond the
    #       top-level URLs given on the command line.
    r = urlsplit(link_address)
    # don't follow links internal to the site, unless asked to:
    if not INTERNAL and r.netloc == rbase.netloc:
        return (None, None)
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
        # scheme-less link: either protocol-relative ('//host/path')
        # or site-relative ('/path', './path')
        d = urlunsplit(r)
        if d.startswith('//'):
            # protocol-relative: assume https
            return ("https://" + d[2:], r.netloc)
        # site-relative: resolve against the site being checked;
        # these are internal, so only follow them with --internal
        return (urljoin(website, link_address), rbase.netloc) \
            if INTERNAL else (None, None)
    elif r.scheme == '' and r.netloc == '':
        # e.g. bare query strings ('?page=2')
        return (urljoin(website, link_address), rbase.netloc) \
            if INTERNAL else (None, None)
    else:
        # absolute link: check it as-is
        return (link_address, r.netloc)

def extract_link(address):
    global link_status
    global session
    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    # the partitioned pieces of the URL we're checking
    rbase = urlsplit(address)
    response = session.get(address)
    for key, value in tags.items():
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        print(link_status[p])

if __name__ == "__main__":
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']   # e.g. "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once:
    link_status = {}
    # sites which don't accept 'head' requests (dynamic);
    # a set of urlsplit().netloc values, added to by check():
    FULL_GET = {'www.amazon.com'}
    # for places like amazon.com, which deny the default python user agent,
    # use this session throughout the script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---')
        extract_link(website)
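
As a side note, the address handling in pattern_adjust leans entirely on the standard library's urlsplit and urljoin. The short sketch below is not part of the script; it uses made-up URLs to show how those two calls take apart and resolve the three kinds of links the function distinguishes: absolute, protocol-relative, and site-relative.

# Standalone sketch (made-up URLs) of the stdlib calls pattern_adjust relies on.
from urllib.parse import urljoin, urlsplit

page = "https://example.com/books/index.html"   # a hypothetical page being checked

# Absolute link: has a scheme and a netloc; checked as-is.
r = urlsplit("https://other.org/post")
print(r.scheme, r.netloc, r.path)       # -> https other.org /post

# Protocol-relative link: no scheme, but a netloc; check() needs a scheme prepended.
r = urlsplit("//cdn.example.net/logo.png")
print(r.netloc, r.path)                 # -> cdn.example.net /logo.png

# Site-relative link: no scheme, no netloc; resolve against the page it came from.
print(urljoin(page, "covers/1.jpg"))    # -> https://example.com/books/covers/1.jpg
print(urljoin(page, "/about"))          # -> https://example.com/about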
This is a bit of a work in progress: a script to check, non-recursively, the links on a URL for validity. It's just starting to come along.