Skip to content

Instantly share code, notes, and snippets.

@eriksachse
Last active October 5, 2025 06:57
Show Gist options
  • Save eriksachse/ca188cd885dce195e1e473c21360835e to your computer and use it in GitHub Desktop.
Save eriksachse/ca188cd885dce195e1e473c21360835e to your computer and use it in GitHub Desktop.
from html.parser import HTMLParser
class ImageParser(HTMLParser):
    """Collect ``[image URL, caption]`` rows from ``wp-caption`` blocks.

    A block starts at a ``<div>`` whose ``class`` contains ``wp-caption`` and
    ends at its matching ``</div>``. Inside a block, the last accepted
    ``<img>`` supplies the URL and the text of the
    ``<p class="...wp-caption-text...">`` element supplies the caption.
    Finished rows are appended to ``image_rows``.
    """

    # An image is skipped only when BOTH dimensions are known and either one
    # is below this value (filters icons and UI glyphs).
    MIN_DIMENSION = 50

    def __init__(self):
        super().__init__()
        self.image_rows = []  # list of [url, caption]
        self.current_img = None
        self.in_caption_div = False
        self.in_caption_p = False
        self.current_caption = ''
        # Number of currently open <div> elements counted from the
        # wp-caption div itself, so nested divs do not end the block early.
        self._div_depth = 0

    @staticmethod
    def _to_int(value):
        """Return ``int(value)``, or 0 when the value is missing or not a plain integer."""
        try:
            return int(value)
        except (TypeError, ValueError):
            # e.g. "100%", "auto", or a valueless attribute (None):
            # treat the dimension as unknown instead of crashing the parse.
            return 0

    def _reset_block(self):
        """Clear all per-block state once a caption block has ended."""
        self.in_caption_div = False
        self.in_caption_p = False
        self.current_img = None
        self.current_caption = ''
        self._div_depth = 0

    def handle_starttag(self, tag, attrs):
        """Track entry into caption blocks, candidate images and caption paragraphs."""
        attr_dict = dict(attrs)
        # HTMLParser reports valueless attributes (``<div class>``) as None.
        css_class = attr_dict.get('class') or ''

        if tag == "div":
            if self.in_caption_div:
                # Nested div: remember it so that its </div> is not mistaken
                # for the end of the caption block.
                self._div_depth += 1
            elif 'wp-caption' in css_class:
                self.in_caption_div = True
                self._div_depth = 1
                self.current_img = None
                self.current_caption = ''
            return

        if not self.in_caption_div:
            return

        if tag == "img":
            src = attr_dict.get('src') or ''
            if src.endswith('.svg') or 'Searchtool' in src:
                return
            width = self._to_int(attr_dict.get('width', 0))
            height = self._to_int(attr_dict.get('height', 0))
            if width and height and (width < self.MIN_DIMENSION
                                     or height < self.MIN_DIMENSION):
                return
            self.current_img = src
        elif tag == "p" and 'wp-caption-text' in css_class:
            self.in_caption_p = True
            self.current_caption = ''

    def handle_endtag(self, tag):
        """Close the caption paragraph or, on the matching ``</div>``, emit the row."""
        if tag == "p" and self.in_caption_p:
            self.in_caption_p = False
            return

        if tag != "div" or not self.in_caption_div:
            return

        self._div_depth -= 1
        if self._div_depth > 0:
            return  # a nested div closed; the caption block is still open

        if self.current_img:  # store row [img, caption]
            self.image_rows.append([self.current_img, self.current_caption.strip()])
        self._reset_block()

    def handle_data(self, data):
        """Accumulate text that appears inside the caption paragraph."""
        if self.in_caption_p:
            self.current_caption += data
def _response_charset(headerDict, default='utf-8'):
    """Return the charset named in the Content-Type header, or *default*."""
    content_type = ''
    # Header names are case-insensitive in HTTP, so do not rely on the exact
    # spelling of the key.
    for key, value in headerDict.items():
        if str(key).lower() == 'content-type':
            content_type = str(value)
            break
    # Parameters follow the media type, separated by ';', e.g.
    # 'text/html; charset="UTF-8"; boundary=x'.
    for param in content_type.split(';')[1:]:
        name, _, value = param.partition('=')
        if name.strip().lower() == 'charset':
            return value.strip().strip('"\'') or default
    return default


def onResponse(webClientDAT, statusCode, headerDict, data, id):
    """Parse the response body and fill the ``image_urls`` table with image rows.

    The body is decoded using the charset from the Content-Type header
    (UTF-8 when absent or unknown), scanned with ``ImageParser``, and each
    ``[url, caption]`` row is appended to the operator returned by
    ``op('image_urls')`` after clearing it.

    NOTE(review): assumes ``data`` is ``bytes`` and ``headerDict`` is a
    mapping of header names to values -- confirm against the host callback
    documentation. The parameter names (including ``id``) are kept as-is
    because they are part of the callback signature.
    """
    charset = _response_charset(headerDict)
    try:
        # errors='replace' keeps a single bad byte from discarding the page.
        html = data.decode(charset, errors='replace')
    except LookupError:
        # The server announced a codec Python does not know; fall back.
        html = data.decode('utf-8', errors='replace')

    parser = ImageParser()
    parser.feed(html)

    table = op('image_urls')
    table.clear()
    # write URL + caption to table
    for row in parser.image_rows:
        table.appendRow(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment