from html.parser import HTMLParser

class ImageParser(HTMLParser):
    """Collect ``[image URL, caption]`` rows from ``wp-caption`` blocks.

    A caption block is a ``<div>`` whose ``class`` attribute contains
    ``wp-caption``.  Inside such a block the parser remembers the last
    acceptable ``<img>`` source and the text of the
    ``<p class="...wp-caption-text...">`` paragraph.  When the block's own
    closing ``</div>`` is reached, a ``[src, caption]`` row is appended to
    ``image_rows`` (only if an image was found).

    Images are skipped when their ``src`` is empty, ends with ``.svg``,
    contains ``Searchtool``, or when both dimensions are declared and
    either is below ``MIN_DIMENSION`` pixels.

    ``<div>`` elements nested inside a caption block are tracked by depth,
    so only the closing tag that matches the opening ``wp-caption`` div
    ends the block.  A ``wp-caption`` div nested inside another one is
    treated as an ordinary nested div.
    """

    # Images declaring a width or height below this value are treated as
    # icons/decorations and ignored.
    MIN_DIMENSION = 50

    def __init__(self):
        super().__init__()
        self.image_rows = []  # list of [url, caption]
        self.current_img = None
        self.in_caption_div = False
        self.in_caption_p = False
        self.current_caption = ''
        # Number of currently open <div> elements counted from (and
        # including) the wp-caption div; 0 while outside a caption block.
        self._div_depth = 0

    @staticmethod
    def _parse_dimension(value):
        """Return *value* as an int, or 0 when it is missing or non-numeric.

        HTML width/height attributes frequently hold values such as
        ``"100%"`` or ``"auto"``; those are reported as 0 ("unknown") so
        that the size filter simply does not apply to them.
        """
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def _consider_image(self, attr_dict):
        """Remember the image source unless it looks like an icon."""
        # A valueless attribute (``<img src>``) is reported as None.
        src = attr_dict.get('src') or ''
        if not src:
            # Nothing usable; keep any image already found in this block.
            return
        if src.endswith('.svg') or 'Searchtool' in src:
            return
        width = self._parse_dimension(attr_dict.get('width'))
        height = self._parse_dimension(attr_dict.get('height'))
        # Only filter by size when both dimensions are actually known.
        if width and height and (
            width < self.MIN_DIMENSION or height < self.MIN_DIMENSION
        ):
            return
        self.current_img = src

    def handle_starttag(self, tag, attrs):
        """Track entry into caption blocks, their images and caption text."""
        attr_dict = dict(attrs)
        # ``class`` may be absent or valueless (None); normalise to ''.
        css_class = attr_dict.get('class') or ''

        if tag == "div":
            if self.in_caption_div:
                # Nested div: remember it so its </div> does not end the block.
                self._div_depth += 1
            elif 'wp-caption' in css_class:
                self.in_caption_div = True
                self._div_depth = 1
                self.current_img = None
                self.current_caption = ''
            return

        if not self.in_caption_div:
            return

        if tag == "img":
            self._consider_image(attr_dict)
        elif tag == "p" and 'wp-caption-text' in css_class:
            self.in_caption_p = True
            self.current_caption = ''

    def handle_endtag(self, tag):
        """Close the caption paragraph or flush the finished caption block."""
        if tag == "p" and self.in_caption_p:
            self.in_caption_p = False

        if tag != "div" or not self.in_caption_div:
            return

        self._div_depth -= 1
        if self._div_depth > 0:
            # Closed a nested div; the caption block itself is still open.
            return

        if self.current_img:  # store row [img, caption]
            self.image_rows.append(
                [self.current_img, self.current_caption.strip()]
            )
        # Reset everything, including the paragraph flag: HTML allows the
        # closing </p> to be omitted, and a stale flag would keep capturing
        # text after the block.
        self.in_caption_div = False
        self.in_caption_p = False
        self._div_depth = 0
        self.current_img = None
        self.current_caption = ''

    def handle_data(self, data):
        """Accumulate text while inside the caption paragraph."""
        if self.in_caption_p:
            self.current_caption += data


def _decode_response_body(headerDict, data, default='utf-8'):
    """Decode the response bytes using the charset from the Content-Type header.

    The header name is matched case-insensitively and the ``charset``
    parameter is parsed properly, so trailing parameters or quotes
    (``text/html; charset="utf-8"; foo=bar``) do not end up in the codec
    name.  Unknown codecs fall back to *default*, and undecodable bytes are
    replaced rather than raising, so one bad byte does not abort the
    callback.
    """
    content_type = ''
    for key, value in headerDict.items():
        if str(key).lower() == 'content-type':
            content_type = str(value)
            break

    charset = default
    # Parameters follow the media type, separated by ';'.
    for param in content_type.split(';')[1:]:
        name, _, value = param.partition('=')
        if name.strip().lower() == 'charset':
            charset = value.strip().strip('"\'') or default
            break

    try:
        return data.decode(charset, errors='replace')
    except LookupError:
        # The server announced a codec Python does not know.
        return data.decode(default, errors='replace')


def onResponse(webClientDAT, statusCode, headerDict, data, id):
    """Parse the response HTML and fill the ``image_urls`` table.

    The body in ``data`` is decoded according to ``headerDict`` (UTF-8 when
    no charset is given), scanned with ``ImageParser``, and every
    ``[url, caption]`` row found is written to the operator returned by
    ``op('image_urls')`` after that table has been cleared.

    ``webClientDAT``, ``statusCode`` and ``id`` are accepted to match the
    callback signature but are not used here.
    NOTE(review): the table is rebuilt regardless of ``statusCode`` --
    confirm whether error responses should leave the old rows in place.
    """
    html = _decode_response_body(headerDict, data)

    parser = ImageParser()
    parser.feed(html)

    # Parse first, clear afterwards: if parsing fails the old rows survive.
    table = op('image_urls')
    table.clear()

    # write URL + caption to table
    for row in parser.image_rows:
        table.appendRow(row)