Skip to content

Instantly share code, notes, and snippets.

@eriksachse
Created September 27, 2025 08:29
Show Gist options
  • Save eriksachse/43ea634b02fc1902d1ed1ecf20864aaf to your computer and use it in GitHub Desktop.
Save eriksachse/43ea634b02fc1902d1ed1ecf20864aaf to your computer and use it in GitHub Desktop.

Revisions

  1. eriksachse created this gist Sep 27, 2025.
    58 changes: 58 additions & 0 deletions webscrape.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,58 @@
    # me - this DAT.
    # webClientDAT - The connected Web Client DAT
    # statusCode - The status code of the response, formatted as a dictionary with two key-value pairs: 'code', 'message'.
    # headerDict - The header of the response from the server formatted as a dictionary. Only sent once when streaming.
    # data - The data of the response
    # id - The request's unique identifier

    def onConnect(webClientDAT, id):
        """Connection callback for the Web Client DAT; currently a no-op.

        Args:
            webClientDAT: The connected Web Client DAT.
            id: The request's unique identifier.
        """
        return None

    def onDisconnect(webClientDAT, id):
        """Disconnection callback for the Web Client DAT; currently a no-op.

        Args:
            webClientDAT: The connected Web Client DAT.
            id: The request's unique identifier.
        """
        return None


    from html.parser import HTMLParser


    class ImageParser(HTMLParser):
        """Collect the ``src`` of every large, non-icon ``<img>`` tag fed to it.

        After ``feed()`` has been called, ``image_urls`` holds the accepted
        ``src`` values in document order. An image is rejected when:

        * it has no (or an empty) ``src``;
        * its ``src`` ends with ``.svg`` or contains ``Searchtool``;
        * its declared ``width`` or ``height`` is below ``MIN_DIMENSION``.
          A missing or non-integer dimension (e.g. ``"100%"``, ``"auto"``)
          counts as 0, so such images are rejected as well.
        """

        # Images whose declared width or height is below this are skipped.
        MIN_DIMENSION = 50

        def __init__(self):
            super().__init__()
            # Accepted image sources, in the order they appear in the markup.
            self.image_urls = []

        @staticmethod
        def _dimension(value):
            """Return *value* as an int, or 0 if it is missing or not an integer."""
            try:
                return int(value)
            except (TypeError, ValueError):
                # None (absent / valueless attribute) or text such as "100%".
                return 0

        def handle_starttag(self, tag, attrs):
            """Record the ``src`` of an ``<img>`` tag that passes all filters.

            Args:
                tag: Lower-cased tag name supplied by ``HTMLParser``.
                attrs: List of ``(name, value)`` pairs; ``value`` is ``None``
                    for an attribute written without a value.
            """
            if tag != "img":
                return

            attr_map = dict(attrs)

            # A valueless attribute (<img src>) is reported as None, so
            # normalise to '' before calling string methods on it.
            src = attr_map.get('src') or ''
            if not src:
                # Nothing to download; do not record an empty URL.
                return

            # 1. Skip .svg icons
            if src.endswith('.svg') or 'Searchtool' in src:
                return

            # 2. Skip small images
            width = self._dimension(attr_map.get('width'))
            height = self._dimension(attr_map.get('height'))
            if width < self.MIN_DIMENSION or height < self.MIN_DIMENSION:
                return

            self.image_urls.append(src)



    def _response_charset(headerDict, default='utf-8'):
        """Return the charset named in the Content-Type header, or *default*."""
        for key, value in (headerDict or {}).items():
            # Header names are case-insensitive, so do not rely on exact casing.
            if str(key).lower() != 'content-type':
                continue
            # Parameters follow the media type, separated by ';'.
            for param in str(value).split(';')[1:]:
                name, _, charset = param.partition('=')
                if name.strip().lower() == 'charset':
                    charset = charset.strip().strip('"\'')
                    if charset:
                        return charset
        return default


    def onResponse(webClientDAT, statusCode, headerDict, data, id):
        """Parse an HTML response and list its image URLs in the 'image_urls' table.

        The body is decoded using the charset from the Content-Type header
        (UTF-8 when none is given), scanned with ``ImageParser``, and every
        accepted image URL is written as one row of the ``image_urls``
        operator, replacing its previous contents.

        Args:
            webClientDAT: The connected Web Client DAT.
            statusCode: Dictionary with the keys 'code' and 'message'.
            headerDict: Response headers as a dictionary.
            data: The response body (byte data).
            id: The request's unique identifier.
        """
        # get the encoding from the header
        encoding = _response_charset(headerDict)

        # data is byte data, so decode it to a string
        if isinstance(data, (bytes, bytearray)):
            try:
                html = data.decode(encoding)
            except (LookupError, UnicodeDecodeError):
                # Unknown codec name or bytes that do not match the declared
                # charset: keep going with a lossy UTF-8 decode rather than
                # aborting the callback.
                html = data.decode('utf-8', errors='replace')
        else:
            # NOTE(review): assumes non-bytes data is already text - confirm.
            html = str(data)

        # parse the html to get the image urls
        parser = ImageParser()
        parser.feed(html)
        # Flush anything the parser is still buffering.
        parser.close()

        # write the image urls to a table
        table = op('image_urls')
        table.clear()
        # NOTE(review): dumps the raw response body; presumably leftover
        # debugging output - confirm whether it is still wanted.
        debug(data)
        for url in parser.image_urls:
            table.appendRow(url)