Last active: December 30, 2017 21:40
Revisions
yarko revised this gist
Dec 30, 2017. 1 changed file with 3 additions and 1 deletion.
@@ -87,7 +87,9 @@ def pattern_adjust(link_address, rbase=None):
    global website
    global INTERNAL
    # if we're checking local, might as well
    # check on-page, too - for typos
    if link_address[0] == '#' and not INTERNAL:  # local
        return (None, None)
    # create a local static var:
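For context on this change: a '#'-only href is a same-page anchor, which urlsplit reports as a bare fragment. A minimal standalone sketch of that test; the function name and the check_internal flag are illustrative, not part of the gist:

from urllib.parse import urlsplit

def is_fragment_only(link, check_internal=False):
    # '#section' style links point at the current page; skip them
    # unless we are also validating on-page anchors (check_internal).
    parts = urlsplit(link)
    return (not parts.scheme and not parts.netloc and not parts.path
            and bool(parts.fragment) and not check_internal)

print(is_fragment_only('#books'))          # True  - same-page anchor, skipped
print(is_fragment_only('/books#reviews'))  # False - has a path, still checked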
yarko revised this gist
Dec 28, 2017. 1 changed file with 1 addition and 4 deletions.
@@ -92,8 +92,6 @@ def pattern_adjust(link_address, rbase=None):
    # create a local static var:
    # - depends on global "website"
    # if 'rbase' not in pattern_adjust.__dict__ \
    #    pattern_adjust.rbase = urlsplit(website)
@@ -102,7 +100,6 @@ def pattern_adjust(link_address, rbase=None):
    if not INTERNAL and \
       r.netloc == rbase.netloc:
        return (None, None)
    # NOTE: I don't really understand
    #       what this is doing, so annotating:
    # if relative URL (local)
@@ -195,10 +192,10 @@ def extract_link(address):
if __name__ == "__main__":
    arguments = docopt(__doc__)
    BASEURL = arguments['--base']
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once
yarko revised this gist
Dec 28, 2017. 1 changed file with 4 additions and 4 deletions.
@@ -3,7 +3,7 @@
check_link.py

Parses a url, and checks links in it for validity.
Normally, does not check links to the source URL's pages.
Normally, only reports problem links.

Usage:
@@ -13,13 +13,13 @@
  -b BASEURL --base= BASEURL  checking multiple pages on a site?
                              set the base, and supages to check
                              ('/' will check BASEURL)
  -i --internal  also check links internal to the site;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;

Examples:
  check_link.py -b https://mysite.io books blog  # check only 2 subpages
  check_link.py -b https://mysite.io / /blog     # check home page too
'''
import sys
import requests
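The Usage/Options text above is exactly what docopt parses into the arguments dict read in __main__. A minimal sketch of that round trip, assuming a hypothetical mini_check.py module name:

'''Usage: mini_check.py [-i] [-v] [-b BASEURL] <url>...

Options:
  -b BASEURL --base=BASEURL  base site to resolve subpages against
  -i --internal              also check links internal to the site
  -v --verbose               report all link outcomes, good and bad
'''
from docopt import docopt

if __name__ == '__main__':
    args = docopt(__doc__)
    # e.g. "mini_check.py -b https://mysite.io books blog" yields:
    # {'--base': 'https://mysite.io', '--internal': False,
    #  '--verbose': False, '<url>': ['books', 'blog']}
    print(args)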
yarko revised this gist
Dec 28, 2017. 1 changed file with 14 additions and 4 deletions.
@@ -7,12 +7,19 @@
Normally, only reports problem links.

Usage:
  check_link.py [-i] [-v] [-b BASEURL] <url>...

Options:
  -b BASEURL --base= BASEURL  checking multiple pages on a site?
                              set the base, and supages to check
                              ('/' will check BASEURL)
  -i --internal  check links internal to the site, also;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;

Examples:
  check_link.py -b https://mysite.io /books /blog  # check only 2 subpages
  check_link.py -b https://mysite.io / /blog       # check home page too
'''
import sys
import requests
@@ -191,6 +198,7 @@ def extract_link(address):
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # "https://davericho.com/books/"
    BASEURL = arguments['--base']
    colorama_init()
    # to facilitate checking each link only once
@@ -203,5 +211,7 @@ def extract_link(address):
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        if BASEURL:
            website = urljoin(BASEURL, website)
        print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
        result = extract_link(website)
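The new BASEURL handling leans on urljoin semantics, where '/' resolves to the site root and other paths resolve against the base. A quick sketch with an illustrative base URL:

from urllib.parse import urljoin

base = 'https://mysite.io'
for page in ('/', '/books', 'blog'):
    print(urljoin(base, page))
# https://mysite.io/
# https://mysite.io/books
# https://mysite.io/blog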
yarko revised this gist
Dec 28, 2017. 1 changed file with 1 addition and 1 deletion.
@@ -63,7 +63,7 @@ def check(address, netloc):
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n{" "*19}NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - ' \
              f'{resp.reason} => {address}{Fore.RESET}'
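The 301 branch resolves a possibly relative Location header against the original address. A small sketch of that resolution; the URLs and the header value are made up, and the split(';') mirrors the gist's defensive handling of parameterized header values:

from urllib.parse import urljoin

address = 'http://example.com/old-page'
location = '/new-page; rel=canonical'   # hypothetical raw header value
newaddress = urljoin(address, location.split(';')[0])
print(newaddress)                       # http://example.com/new-page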
yarko revised this gist
Dec 28, 2017. 1 changed file with 47 additions and 3 deletions.
@@ -14,6 +14,7 @@
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;
'''
import sys
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
@@ -51,7 +52,7 @@ def check(address, netloc):
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}{Fore.RESET}'
    if resp.status_code in \
            [301, 308,
@@ -64,7 +65,8 @@ def check(address, netloc):
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - ' \
              f'{resp.reason} => {address}{Fore.RESET}'
    return msg
@@ -122,6 +124,44 @@ def pattern_adjust(link_address, rbase=None):
        return (link_address, r.netloc)


def string_trunc(s, field_width=73, fill='.'):
    '''
    usage:
        s, f, w = string_trunc(longurl)
        print(f']{s:{f}<{w}}[')
    returns:
        a truncated (if needed) string, a fill char,
        and a matching field_width
    '''
    str_width = len(s)
    if str_width > field_width:
        # room for 3 fill chars
        return s[:field_width-3], fill, field_width
    else:
        # this doesn't work: need real values
        # return s, None, None
        # width of zero seems to not do width,
        # but I need returned str_width to clear progress line;
        # fill could be anything (another "0", but...)
        return s, " ", str_width


def progress(msg):
    '''
    hack to print progress
    '''
    if 'w' not in progress.__dict__:
        progress.w = 0
    # clear previous progress line
    print(f'\r{" "*progress.w}', end='', file=sys.stderr)
    s, f, w = string_trunc(msg)
    print(f'\r{s:{f}<{w}}', end='', file=sys.stderr)
    progress.w = w


def extract_link(address):
    global link_status
    global session
@@ -135,10 +175,14 @@ def extract_link(address):
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                # I'm jonesin' for some progress indicators
                progress(link[value])
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        # the '\r' is a hack to show stdout ok w/ progress msgs
                        print('\r', end='', file=sys.stderr)
                        print(link_status[p])
@@ -159,5 +203,5 @@ def extract_link(address):
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---{Fore.RESET}')
        result = extract_link(website)
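The progress hack added here relies on f-string format specs whose fill character and field width are themselves variables, which is what string_trunc's return values feed. A tiny standalone sketch of just that padding behaviour:

def show(s, fill, width):
    # pad (or leave) s to 'width' columns using 'fill', bracketed so
    # the effective field is visible
    print(f']{s:{fill}<{width}}[')

show('short-url', '.', 20)   # ]short-url...........[
show('short-url', ' ', 9)    # ]short-url[  width == len(s), nothing added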
yarko revised this gist
Dec 27, 2017. 1 changed file with 5 additions and 4 deletions.
@@ -21,9 +21,9 @@
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer

# this is a modification of a gist;
# it got me quickly started for now.
# Original:
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git
@@ -116,7 +116,8 @@ def pattern_adjust(link_address, rbase=None):
    elif r.scheme == '' and r.netloc == '':
        # is this what I want to do?
        # would I rather do urljoin(urlunsplit(rbase), link_address)?
        return (urljoin(website, link_address), r.netloc) \
            if INTERNAL else (None, None)
    else:
        return (link_address, r.netloc)
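The question in the comment (urljoin against the global website string versus against the reassembled rbase) comes down to urlsplit/urlunsplit round-tripping. A quick sketch with an illustrative page URL and relative link:

from urllib.parse import urljoin, urlsplit, urlunsplit

website = 'https://mysite.io/books/'       # illustrative page being scanned
rbase = urlsplit(website)

link = './covers/front.png'                # a relative on-page link
print(urljoin(website, link))              # https://mysite.io/books/covers/front.png
print(urljoin(urlunsplit(rbase), link))    # same result: urlunsplit(urlsplit(x)) == x here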
yarko revised this gist
Dec 27, 2017. 1 changed file with 2 additions and 1 deletion.
@@ -148,13 +148,14 @@ def extract_link(address):
    websites = arguments['<url>']  # "https://davericho.com/books/"
    colorama_init()
    # to facilitate checking each link only once
    link_status = {}
    # sites which don't accept 'head' requests (dynamic)
    # populate with results of urlsplit().netloc
    FULL_GET = ('www.amazon.com',)
    # for places like amazon.com, which will deny python scripts:
    # Now - use session throughout this script!
    session = requests.Session()
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---')
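The FULL_GET collection and the shared session with a rewritten User-Agent are the gist's workaround for hosts that reject HEAD requests or default Python agents. A sketch of that HEAD-then-GET fallback in isolation; the URL and agent string are placeholders, and a set is used here so the membership can grow:

import requests
from urllib.parse import urlsplit

session = requests.Session()
session.headers.update({'User-Agent': 'test'})   # anything but the default python agent

FULL_GET = {'www.amazon.com'}                    # hosts known to require a full GET

def fetch_status(address):
    netloc = urlsplit(address).netloc
    retrieve = session.get if netloc in FULL_GET else session.head
    resp = retrieve(address)
    if resp.status_code == 405:                  # host refuses HEAD: retry with GET, remember it
        resp = session.get(address)
        FULL_GET.add(netloc)
    return resp.status_code

# print(fetch_status('https://example.com/'))    # e.g. 200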
yarko revised this gist
Dec 27, 2017. 1 changed file with 112 additions and 53 deletions.
@@ -7,96 +7,155 @@
Normally, only reports problem links.

Usage:
  check_link.py [-i] [-v] <url>...

Options:
  -i --internal  check links internal to the site, also;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;
'''
import requests
from urllib.parse import urljoin, urlsplit, urlunsplit
from docopt import docopt
from colorama import init as colorama_init
from colorama import Fore
from bs4 import BeautifulSoup, SoupStrainer

# this is a modification of a pretty terrible gist,
# but it got me quickly started for now.
# Original (I may delete this link eventually):
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git


def check(address, netloc):
    global VERBOSE
    global FULL_GET
    global session

    msg = None  # the normal "ok" is no message
    # optimize which retrieve we use:
    retrieve = session.get if netloc in FULL_GET else session.head
    try:
        # NOTE: amazon denies requests from python scripts, so we use
        #  a session with an updated 'User-Agent' throughout ('session')
        #  amazon.com remembers if the session.get() was from a python agent,
        #  and then denies the session.get(), even if it updated
        #  its 'user-agent' with a 503 - Service Unavailable
        # OPTIMIZATION:
        #  we try a light-weight session.head() call first;
        #  if it fails for a domain w/ 405 (Method Not Allowed), then
        #  we retry with a full session.get(), and log the location so
        #  we always try the long way;
        resp = retrieve(address)
        if resp.status_code == 405:
            resp = session.get(address)
            FULL_GET.add(netloc)
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}'
    if resp.status_code in \
            [301, 308,
             400, 401, 402, 403, 404, 405, 408, 409, 410,
             501, 502, 503]:
        msg = f'{resp.status_code} - {resp.reason} => {address}'
        # TODO: scrub other permanent redirection codes to include in this:
        if resp.status_code == 301:
            newaddress = urljoin(address,
                                 resp.headers["Location"].split(";")[0])
            msg += f'\n NEW: => {newaddress}'
    elif VERBOSE:
        msg = f'{Fore.GREEN}{resp.status_code} - {resp.reason} => {address}'
    return msg


def pattern_adjust(link_address, rbase=None):
    '''
    returns "adjusted" address and netloc;
    don't follow local addresses, unless option set to follow internal addresses
    '''
    global website
    global INTERNAL
    if link_address[0] == '#':  # local
        return (None, None)
    # create a local static var:
    # - depends on global "website"
    # TODO: this has potential to break when
    #       we loop over multiple URLs
    # if 'rbase' not in pattern_adjust.__dict__ \
    #    pattern_adjust.rbase = urlsplit(website)
    r = urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
       r.netloc == rbase.netloc:
        return (None, None)  # TODO: this could possibly follow local links
    # NOTE: I don't really understand
    #       what this is doing, so annotating:
    # if relative URL (local)
    # TODO: I am getting convinced what this wants to do
    #       should be done w/ a simple urljoin()
    #       I'm also thinking this code branch isn't traversed;
    if r.scheme == '' and (r.netloc != '' or r.path != ''):
        # reconstitute - it won't be a full path
        d = urlunsplit(r)
        # This if seems exceedingly wonky
        if d.startswith('//'):
            # if it starts with '//', throw that away...
            # m = re.search('(?<=//)\S+', d)
            # d = m.group(0)
            # TODO: if r.netloc is empty, then this
            #       could result in an incorrect URL:
            #       => if address = foo.com/something - then ok
            #       => if address relaive: ./static/something - then trouble
            return ("https://" + d[2:], r.netloc)
    elif r.scheme == '' and r.netloc == '':
        # is this what I want to do?
        # would I rather do urljoin(urlunsplit(rbase), link_address)?
        return (urljoin(website, link_address), r.netloc) if INTERNAL else (None, None)
    else:
        return (link_address, r.netloc)


def extract_link(address):
    global link_status
    global session

    tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
    # the partitioned pieces of URL we're checking
    rbase = urlsplit(address)
    response = session.get(address)
    for key, value in tags.items():
        for link in BeautifulSoup(response.content, "html.parser",
                                  parse_only=SoupStrainer(key)):
            if link.has_attr(value):
                p, netloc = pattern_adjust(link[value], rbase)
                if p and p not in link_status:
                    link_status[p] = check(p, netloc)
                    if link_status[p]:
                        print(link_status[p])


if __name__ == "__main__":
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    websites = arguments['<url>']  # "https://davericho.com/books/"
    colorama_init()
    link_status = {}
    # sites which don't accept 'head' requests (dynamic)
    # populate with results of urlsplit().netloc
    FULL_GET = ('www.amazon.com',)
    session = requests.Session()
    # for places like amazon.com, which will deny python scripts:
    # Now - use session throughout this script!
    session.headers.update({'User-Agent': 'test'})
    for website in websites:
        print(f'{Fore.CYAN}--- checking links on {website} ---')
        result = extract_link(website)
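For reference, the tags dict plus SoupStrainer restricts each BeautifulSoup pass to one element type, which is how extract_link walks a page. A self-contained sketch over static, made-up HTML:

from bs4 import BeautifulSoup, SoupStrainer

html = '''<html><body>
  <a href="https://example.com/">example</a>
  <img src="/static/logo.png">
  <script src="/static/app.js"></script>
</body></html>'''

tags = {'a': 'href', 'img': 'src', 'script': 'src', 'link': 'href'}
for key, value in tags.items():
    # parse_only keeps only the elements we care about for this pass
    for link in BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer(key)):
        if link.has_attr(value):
            print(key, '->', link[value])
# a -> https://example.com/
# img -> /static/logo.png
# script -> /static/app.js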
yarko renamed this gist
Dec 27, 2017. 1 changed file with 0 additions and 0 deletions.
File renamed without changes.
yarko revised this gist
Dec 27, 2017. 1 changed file with 87 additions and 58 deletions.
@@ -1,73 +1,102 @@
#!/usr/bin/env python
'''
check_link.py

Parses a url, and checks links in it for validity.
Normally, does not check links to the target URL.
Normally, only reports problem links.

Usage:
  check_link.py [-i] [-v] <url>

Options:
  -i --internal  check links internal to the site, also;
  -v --verbose   report all link outcomes, good and bad;
  -h --help      Show this message and exit;
'''
import requests
from urllib.parse import *
from docopt import docopt
import colorama,re,queue,threading
from colorama import Fore
from bs4 import BeautifulSoup,SoupStrainer

# this is a modification of a pretty terrible gist,
# but it got me quickly started for now.
# Original (I may delete this link eventually):
#   git@gist.github.com:2872d7f994d192188970408980267e6e.git


def check(address):
    global VERBOSE
    try:
        resp=requests.head(address)
        if resp.status_code in [400,404,403,408,409,501,502,503]:
            return f'{Fore.RED}{resp.status_code} - {resp.reason} --> {address}'
        elif VERBOSE:
            return f'{Fore.GREEN}no problem in --> {address}'
        else:
            return None
    except Exception as e:
        return f'{Fore.YELLOW}{e} - {address}'
        pass


def pattern_adjust(link_address):
    '''
    don't follow local addresses, unless
    specifically set option to follow internal addresses
    '''
    if re.match('^#' ,link_address):  # local
        return 0
    # create a local static var:
    # - depends on global "website"
    if 'rbase' not in pattern_adjust.__dict__:
        global website
        global INTERNAL
        pattern_adjust.rbase = urlsplit(website)
    r=urlsplit(link_address)
    # don't follow local:
    if not INTERNAL and \
       r.netloc == pattern_adjust.rbase.netloc:
        return 0  # TODO: this could possibly follow local links
    if r.scheme=='' and (r.netloc!='' or r.path!=''):
        d=urlunsplit(r)
        if re.match('^//' ,d):
            m= re.search('(?<=//)\S+', d)
            d=m.group(0)
            return "https://"+d
    elif r.scheme=='' and r.netloc=='':
        return websit+link_address if INTERNAL else 0
    else:
        return link_address


def extract_link(address):
    global link_status
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    response=requests.get(address)
    # for key,value in iter(list(tags.items())):
    for key, value in tags.items():
        try:
            for link in BeautifulSoup(response.content,"html.parser",parse_only=SoupStrainer(key)):
                if link.has_attr(value):
                    p=pattern_adjust(link[value])
                    if p and not p in link_status:
                        link_status[p] = check(p)
                        if link_status[p]:
                            print(link_status[p])
        except Exception as e:
            print((e,address))


if __name__=="__main__":
    arguments = docopt(__doc__)
    INTERNAL = arguments['--internal']
    VERBOSE = arguments['--verbose']
    colorama.init()
    link_status = {}
    website = arguments['<url>']  # "https://davericho.com/books/"
    result = extract_link(website)
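The 'local static var' trick in pattern_adjust stores state on the function object's own __dict__ so the base URL is split only once. A tiny sketch of the pattern in isolation, with an illustrative site URL:

from urllib.parse import urlsplit

def base_netloc(link, site='https://mysite.io/books/'):
    # cache the split of the site on the function object the first time through
    if 'rbase' not in base_netloc.__dict__:
        base_netloc.rbase = urlsplit(site)
    return urlsplit(link).netloc == base_netloc.rbase.netloc

print(base_netloc('https://mysite.io/blog'))   # True  - same host as the cached base
print(base_netloc('https://example.com/'))     # False - external host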
hackerdem created this gist
May 11, 2016.
@@ -0,0 +1,73 @@
from bs4 import BeautifulSoup,SoupStrainer
import urllib.request
import colorama,re,queue,threading
from colorama import Fore
from urllib.parse import *

class check_link():
    def __init__(self,address):
        self.address=address

    def check(self,address):
        try:
            req=urllib.request.Request(url=address)
            resp=urllib.request.urlopen(req)
            if resp.status in [400,404,403,408,409,501,502,503]:print (Fore.RED+resp.status+"-"+resp.reason+"-->"+address)
            else:
                print (Fore.GREEN+"no problem in-->"+address)
        except Exception as e:
            print (Fore.YELLOW+"{}-{}".format(e,address))
            pass

def pattern_adjust(a):
    try:
        if re.match('^#' ,a):return 0
        r=urlsplit(a)
        if r.scheme=='' and (r.netloc!='' or r.path!=''):
            d=urlunsplit(r)
            if re.match('^//' ,d):
                m= re.search('(?<=//)\S+', d)
                d=m.group(0)
                m="https://"+d
                return m
        elif r.scheme=='' and r.netloc=='':
            return address+a
        else:return a
    except Exception as e:
        pass

def extract_link(address):
    tags= {'a':'href', 'img':'src', 'script':'src', 'link':'href' }
    for key,value in iter(tags.items()):
        try:
            res=urllib.request.urlopen(address)
            response=res.read().decode('utf-8')  #needs improvement
            for link in BeautifulSoup(response,"html.parser",parse_only=SoupStrainer(key)):
                if link.has_attr(value):
                    p=pattern_adjust(link[value])
                    if p!=0 and str(p)!='None':
                        newcheck=check_link(p)
                        newcheck.check(p)
                        if p not in hyperlinks:
                            hyperlinks.add(p)
                            if website.split('.')[1] in p:  #needs improvement
                                if not website.endswith(('.png','.jpeg','.js','jpg')):
                                    q.put(p)
        except Exception as e:
            print (e,address)

def threader():
    while True:
        value=q.get()
        result=extract_link(value)
        q.task_done()

if __name__=="__main__":
    colorama.init()
    q=queue.Queue()
    global hyperlinks,website
    hyperlinks=set()
    website=input("Please enter the website address: ")
    for x in range(30):
        t=threading.Thread(target=threader)
        t.deamon=True
        t.start()
    q.put(website.strip())
    q.join()
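The original's concurrency model is the standard queue.Queue plus worker threads with q.task_done() and q.join(). A self-contained sketch of that pattern; the work function is a stand-in rather than the link checker, and note that the gist's 't.deamon' spelling only sets an unused attribute, so its threads are not actually daemonic:

import queue
import threading

q = queue.Queue()

def worker():
    while True:
        item = q.get()          # blocks until a task is available
        print('processing', item)
        q.task_done()           # lets q.join() know this task finished

for _ in range(4):
    t = threading.Thread(target=worker)
    t.daemon = True             # correct spelling: worker threads die with the main thread
    t.start()

for item in ('a', 'b', 'c'):
    q.put(item)
q.join()                        # wait until every queued task is marked done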