-
-
Save IanHopkinson/ad45831a2fb73f537a79 to your computer and use it in GitHub Desktop.
| #!/usr/bin/env python | |
| # encoding: utf-8 | |
| import lxml.etree | |
| import lxml.html | |
| import requests | |
# Sample document used by the XML half of the demo. It mixes two namespaces:
# elements written with the ``foo:`` prefix belong to http://www.foo.com, while
# the unprefixed <Record>/<Title> elements fall into the default namespace
# http://www.bah.com. Encoded to bytes because the text carries an XML
# declaration with an explicit encoding.
xml_sample = """<?xml version="1.0" encoding="UTF-8"?>
<foo:Results xmlns:foo="http://www.foo.com" xmlns="http://www.bah.com">
<foo:Recordset setCount="2">
<foo:Record setEntry="0">
<foo:Title>First title</foo:Title>
</foo:Record>
<foo:Record setEntry="1">
<foo:Title>Second title</foo:Title>
</foo:Record>
<Record setEntry="2">
<Title>Third title</Title>
</Record>
<Record setEntry="3">
<Title>Fourth title</Title>
</Record>
</foo:Recordset>
</foo:Results>
""".encode("utf-8")
def _demo_html_xpath():
    """Fetch the blog home page and print the results of several XPath queries.

    Raises:
        requests.RequestException: if the page cannot be fetched (including a
            timeout or a non-2xx HTTP status).
        IndexError: if one of the ``[0]`` lookups finds no matching element,
            e.g. because the live page's markup differs from what the queries
            expect.
    """
    print("Demonstrating xpath on HTML")
    print("===========================")
    # A timeout keeps the demo from hanging on an unresponsive server, and
    # raise_for_status() reports HTTP errors instead of parsing an error page.
    r = requests.get("http://www.ianhopkinson.org.uk", timeout=30)
    r.raise_for_status()
    root = lxml.html.fromstring(r.content)

    # Absolute path from the document root.
    title = root.xpath('/html/body/div/div/div[2]/h1')
    print("My blog title is: '{}'".format(title[0].text.strip()))

    # '//' matches at any depth, so the leading path can be dropped.
    title = root.xpath('//div[2]/h1')
    print("We can use the // shortcut to get the same thing more easily: '{}'".format(title[0].text_content().strip()))

    # '@id' selects attribute values rather than elements.
    ids = root.xpath('//li/@id')
    print("We can get the id attributes of all the <li> elements. There are {} of them, the first one is {}".format(len(ids), ids[0]))

    # '*' matches any element name; the predicate filters on an attribute.
    tagcloud = root.xpath('//*[@class="tagcloud"]')
    print("We can get the parent element of the tagcloud using an attribute selector: {}".format(tagcloud))

    # contains(., ...) tests the element's string value.
    title = root.xpath("//h1[contains(., 'SomeBeans')]")
    print("Another way to get the title is to select by element text content: '{}'".format(title[0].text.strip()))

    # '..' steps up to the parent, from which the sibling <h2> is selected.
    subtitle = root.xpath('//h1[contains(@class,"header_title")]/../h2')
    print("We can use the .. operator is select the subtitle: '{}'".format(subtitle[0].text.strip()))

    # The following-sibling axis reaches the same <h2> without going via the parent.
    subtitle = root.xpath('//h1[contains(@class,"header_title")]/following-sibling::h2')
    print("Or we can use following-sibling to same effect: '{}'".format(subtitle[0].text.strip()))


def _demo_xml_xpath():
    """Parse the module-level ``xml_sample`` and print namespace-aware XPath results."""
    print("\nDemonstrating xpath on XML")
    print("============================")
    print("Processing XML is pretty similar except for namespaces")
    namespace = "http://www.foo.com"
    # Prefix -> URI mapping handed to xpath(); the prefix only has to match the
    # one used in the query string.
    nsmap = {"foo": namespace}
    root = lxml.etree.fromstring(xml_sample)

    record_count = root.xpath('//@setCount')[0]
    print("Attributes are easy, this is the @setCount: {}".format(record_count))

    print("These are the elements defined by the XML string at the top of this program:")
    # iter() replaces the deprecated getiterator(); it walks the subtree in
    # document order, starting with the root element itself.
    for element in root.iter():
        print(element.tag)

    print("We can select elements by defining a namespace in our queries")
    records = root.xpath('//foo:Title', namespaces=nsmap)
    for record in records:
        print(record.text)

    print("Without defining the default namespace, we get nothing")
    # An unprefixed name in XPath means "no namespace", so the <Title> elements
    # living in the document's default namespace are not matched.
    records = root.xpath('//Title')
    for record in records:
        print(record.text)

    print("With the default namespace, we get something")
    # The default namespace must be bound to an explicit prefix for XPath.
    records = root.xpath('//bah:Title', namespaces={"bah": "http://www.bah.com"})
    for record in records:
        print("Element name: {}, element text '{}'".format(record.tag, record.text))


def main():
    """Run the HTML XPath demo followed by the XML XPath demo, printing results to stdout."""
    _demo_html_xpath()
    _demo_xml_xpath()
# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Gotcha. I thought that might be the case.
I forked your gist and made some changes. Then I added on some other examples of processing XML that contains QTI (Question & Test Interoperability) data. Experimenting with lxml.etree, I found that the default, unnamed namespace in the XML is available in the tree's data in nsmap[None]. See my lxml-test-etree.py, line 11…
`defaultNamespace = {'_': root.nsmap[None]}`
I found that naming it `_` makes it convenient to refer to it in the XPath statement, as on line 23…
`items = root.xpath('//_:item', namespaces=defaultNamespace)`
I wanted to get that namespace used by default when `xpath()` is called. I tried setting the key to `None` or using `root.nsmap` itself, but those caused an error ("TypeError: empty namespace prefix is not supported in XPath").
I'd like to not need the `_:` prefix on the element name, but at least it's minimally obtrusive. Trying to set a truly default namespace is a lost cause, apparently. The lxml FAQ asks, "How can I specify a default namespace for XPath expressions?", and the short answer is: "You can't." 🤷
As it turns out, I may prefer using lxml.objectify rather than lxml.etree, but I need to investigate a little more before I know for sure. See my lxml-test-objectify.py, for example.
@lsloan I think that is probably a hangover from an earlier version of the code; in this version it serves no purpose. The lxml documentation uses that style of namespace definition. I probably intended to use it down at line 74 and then forgot!