from html.parser import HTMLParser


class ImageParser(HTMLParser):
    """Collect [image URL, caption] pairs from ``wp-caption`` div blocks.

    A block starts at a ``<div>`` whose ``class`` attribute contains
    ``wp-caption`` and ends at that div's matching ``</div>``. Inside a
    block, the last acceptable ``<img>`` supplies the URL and the text of a
    ``<p>`` whose class contains ``wp-caption-text`` supplies the caption.
    Finished pairs are appended to ``image_rows``; blocks without an
    acceptable image produce no row.
    """

    # Images declaring a width or height below this many pixels are skipped.
    MIN_DIMENSION = 50

    def __init__(self):
        super().__init__()
        self.image_rows = []  # list of [url, caption]
        self.current_img = None
        self.in_caption_div = False
        self.in_caption_p = False
        self.current_caption = ''
        # Number of currently open <div> tags counted from the wp-caption
        # div itself, so nested divs do not terminate the block early.
        self._div_depth = 0

    @staticmethod
    def _parse_dimension(value):
        """Return *value* as an int, or 0 when it is not a plain integer.

        Values such as ``"100%"``, ``"auto"`` or a valueless attribute
        (``None``) are treated as "unknown size" instead of raising.
        """
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def _consider_image(self, attr_dict):
        """Remember the image's ``src`` unless it looks like an icon."""
        src = attr_dict.get('src') or ''
        if not src:
            # A missing or valueless src carries no URL worth storing.
            return
        if src.endswith('.svg') or 'Searchtool' in src:
            return

        width = self._parse_dimension(attr_dict.get('width', 0))
        height = self._parse_dimension(attr_dict.get('height', 0))
        # Only filter by size when both dimensions are known and non-zero.
        too_small = width < self.MIN_DIMENSION or height < self.MIN_DIMENSION
        if width and height and too_small:
            return

        self.current_img = src

    def handle_starttag(self, tag, attrs):
        """Track entry into caption divs, caption paragraphs and images."""
        attr_dict = dict(attrs)
        # A valueless attribute (<div class>) is reported as None.
        css_class = attr_dict.get('class') or ''

        if tag == 'div':
            if self.in_caption_div:
                # Nested div: remember it so its </div> is not mistaken
                # for the end of the caption block.
                self._div_depth += 1
            elif 'wp-caption' in css_class:
                self.in_caption_div = True
                self._div_depth = 1
                self.current_img = None
                self.current_caption = ''
            return

        if not self.in_caption_div:
            return

        if tag == 'img':
            self._consider_image(attr_dict)
        elif tag == 'p' and 'wp-caption-text' in css_class:
            self.in_caption_p = True
            self.current_caption = ''

    def handle_endtag(self, tag):
        """Close the caption paragraph, or finish the caption block."""
        if tag == 'p' and self.in_caption_p:
            self.in_caption_p = False
            return

        if tag != 'div' or not self.in_caption_div:
            return

        self._div_depth -= 1
        if self._div_depth > 0:
            # Only a nested div was closed; the caption block continues.
            return

        if self.current_img:
            # store row [img, caption]
            self.image_rows.append(
                [self.current_img, self.current_caption.strip()]
            )
        self.in_caption_div = False
        self.in_caption_p = False
        self.current_img = None
        self.current_caption = ''

    def handle_data(self, data):
        """Accumulate text while inside the caption paragraph."""
        if self.in_caption_p:
            self.current_caption += data


def _charset_from_headers(header_dict, default='utf-8'):
    """Return the charset named in the Content-Type header, or *default*.

    The header name and the ``charset`` parameter name are matched
    case-insensitively; surrounding quotes and any parameters that follow
    the charset are ignored.
    """
    content_type = ''
    for key, value in header_dict.items():
        if str(key).lower() == 'content-type':
            content_type = str(value)
            break

    # Skip the media type itself; inspect only the ";"-separated parameters.
    for param in content_type.split(';')[1:]:
        name, _, value = param.partition('=')
        if name.strip().lower() == 'charset':
            return value.strip().strip('"\'') or default
    return default


def onResponse(webClientDAT, statusCode, headerDict, data, id):
    """Parse the response body and fill the ``image_urls`` table.

    The body is decoded using the charset from the Content-Type header
    (UTF-8 when none is given), scanned with ``ImageParser``, and every
    [url, caption] pair found is appended as a row after the table has
    been cleared.

    Args:
        webClientDAT: Not used by this function.
        statusCode: Not used by this function.
        headerDict: Mapping of response header names to values.
        data: Response body as bytes.
        id: Not used by this function.

    NOTE(review): ``op`` is not defined in this file; the callback
    signature suggests a TouchDesigner Web Client DAT callback where ``op``
    is a global. Confirm against the host environment.
    """
    encoding = _charset_from_headers(headerDict)
    try:
        html = data.decode(encoding)
    except (LookupError, UnicodeDecodeError):
        # Unknown or wrong charset: fall back to a lossy UTF-8 decode so a
        # mislabelled page still yields its image rows.
        html = data.decode('utf-8', errors='replace')

    parser = ImageParser()
    parser.feed(html)
    parser.close()  # flush any text still buffered by the parser

    table = op('image_urls')
    table.clear()

    # write URL + caption to table
    for row in parser.image_rows:
        table.appendRow(row)