#!/usr/bin/env python
# encoding: utf-8
"""Demonstrate XPath queries with lxml on a live HTML page and an XML sample.

The HTML half downloads a blog front page with ``requests`` and queries it
through ``lxml.html``; the XML half parses ``xml_sample`` with ``lxml.etree``
and shows how namespaces affect what an XPath expression matches.
"""

import lxml.etree
import lxml.html
import requests

BLOG_URL = "http://www.ianhopkinson.org.uk"

# Without a timeout ``requests`` waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

# NOTE(review): the markup of this sample was reconstructed from what the
# queries in _demo_xml() require: a ``setCount`` attribute, ``Title`` elements
# in the ``foo`` namespace (http://www.foo.com) and ``Title`` elements in the
# default namespace (http://www.bah.com). The exact element layout and the
# attribute values are assumptions -- confirm against the original sample.
xml_sample = """
<foo:Results xmlns:foo="http://www.foo.com" xmlns="http://www.bah.com">
  <foo:Recordset setCount="2">
    <foo:Record setEntry="0">
      <foo:Title>First title</foo:Title>
      <foo:Title>Second title</foo:Title>
    </foo:Record>
    <foo:Record setEntry="1">
      <Title>Third title</Title>
      <Title>Fourth title</Title>
    </foo:Record>
  </foo:Recordset>
</foo:Results>
""".encode("utf-8")


def _demo_html():
    """Fetch the blog front page and print the results of several XPath queries.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: if one of the queries matches nothing (for instance
            because the page layout has changed), since the first match is
            indexed directly.
    """
    print("Demonstrating xpath on HTML")
    print("===========================")

    response = requests.get(BLOG_URL, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on 4xx/5xx instead of running the queries on an error page.
    response.raise_for_status()
    root = lxml.html.fromstring(response.content)

    # Absolute path from the document root.
    title = root.xpath('/html/body/div/div/div[2]/h1')
    print("My blog title is: '{}'".format(title[0].text.strip()))

    # ``//`` matches at any depth; text_content() also gathers the text of
    # descendant elements, whereas .text is only the element's leading text.
    title = root.xpath('//div[2]/h1')
    print(
        "We can use the // shortcut to get the same thing more easily: "
        "'{}'".format(title[0].text_content().strip())
    )

    # Selecting an attribute yields the attribute values, not elements.
    ids = root.xpath('//li/@id')
    print(
        "We can get the id attributes of all the <li> elements. "
        "There are {} of them, the first one is {}".format(len(ids), ids[0])
    )

    tagcloud = root.xpath('//*[@class="tagcloud"]')
    print(
        "We can get the parent element of the tagcloud using an attribute "
        "selector: {}".format(tagcloud)
    )

    title = root.xpath("//h1[contains(., 'SomeBeans')]")
    print(
        "Another way to get the title is to select by element text content: "
        "'{}'".format(title[0].text.strip())
    )

    subtitle = root.xpath('//h1[contains(@class,"header_title")]/../h2')
    print(
        "We can use the .. operator is select the subtitle: "
        "'{}'".format(subtitle[0].text.strip())
    )

    subtitle = root.xpath(
        '//h1[contains(@class,"header_title")]/following-sibling::h2'
    )
    print(
        "Or we can use following-sibling to same effect: "
        "'{}'".format(subtitle[0].text.strip())
    )


def _demo_xml():
    """Parse ``xml_sample`` and print the results of namespace-aware queries.

    Raises:
        lxml.etree.XMLSyntaxError: if ``xml_sample`` is not well-formed XML.
        IndexError: if the sample contains no ``setCount`` attribute.
    """
    print("\nDemonstrating xpath on XML")
    print("============================")
    print("Processing XML is pretty similar except for namespaces")

    # XPath has no notion of a default namespace, so every namespace used in
    # a query needs an explicit prefix; the prefix names are arbitrary.
    foo_namespaces = {"foo": "http://www.foo.com"}
    bah_namespaces = {"bah": "http://www.bah.com"}

    root = lxml.etree.fromstring(xml_sample)

    record_count = root.xpath('//@setCount')[0]
    print("Attributes are easy, this is the @setCount: {}".format(record_count))

    print(
        "These are the elements defined by the XML string at the top of "
        "this program:"
    )
    # iter() replaces the deprecated getiterator() alias.
    for element in root.iter():
        print(element.tag)

    print("We can select elements by defining a namespace in our queries")
    for record in root.xpath('//foo:Title', namespaces=foo_namespaces):
        print(record.text)

    print("Without defining the default namespace, we get nothing")
    for record in root.xpath('//Title'):
        print(record.text)

    print("With the default namespace, we get something")
    for record in root.xpath('//bah:Title', namespaces=bah_namespaces):
        print(
            "Element name: {}, element text '{}'".format(
                record.tag, record.text
            )
        )


def main():
    """Run the HTML demonstration followed by the XML demonstration."""
    _demo_html()
    _demo_xml()


if __name__ == "__main__":
    main()