"""Exploratory scraper for the listing page at canardscanins.ca.

The script downloads the listing page, maps every linked URL to the text of
its anchor in ``parks``, pretty-prints that mapping, then fetches one
arbitrary entry and prints a few fragments of its page (the element that
follows the first image and the element that follows the first ``<b>`` whose
text contains "Fondation").
"""
from __future__ import print_function

import pprint

import requests
from lxml import etree, html

try:
    from urllib.parse import urljoin  # Python 3
except ImportError:  # Python 2
    from urlparse import urljoin

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30

user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
]

# Only the first user agent is used for now.
headers = {'User-Agent': user_agents[0]}

baseurl = "http://www.canardscanins.ca"
listing_url = baseurl + '/canins/portail.php?action=liste'

# Download the listing page and fail loudly on an HTTP error rather than
# scraping an error page.
response = requests.get(listing_url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()

# Convert to an lxml element tree.
tree = html.fromstring(response.text)

# Read the href and the text from the SAME <a> element. Querying '//a/@href'
# and '//a/text()' separately and pairing the results by index goes out of
# step (or raises IndexError) as soon as one anchor has no text, several text
# nodes, or no href.
anchors = tree.xpath('//a[@href]')
links = [anchor.get('href') for anchor in anchors]
titles = [anchor.text_content() for anchor in anchors]
print(titles)

# urljoin resolves relative, root-relative and absolute hrefs alike; plain
# string concatenation with baseurl only worked for hrefs starting with '/'.
parks = {
    urljoin(listing_url, href): {"Name": title}
    for href, title in zip(links, titles)
}
pprint.pprint(parks)

# TODO: only one arbitrary entry is inspected; an earlier comment here
# ("for each key,value in parks") suggests a loop over all entries was planned.
if parks:
    key, value = parks.popitem()
    page = requests.get(key, headers=headers, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    ptree = html.fromstring(page.text)

    # Print the text and tail of the element right after the first image,
    # when the page has both.
    imgs = ptree.xpath('//img')
    after_img = imgs[0].getnext() if imgs else None
    if after_img is not None:
        print(after_img.text)
        print(after_img.tail)

    # Extract <b> nodes whose text contains "Fondation" and dump the element
    # that follows the first one, when there is one.
    founded = ptree.xpath('.//b[contains(text(),"Fondation")]')
    print(len(founded))
    after_label = founded[0].getnext() if founded else None
    if after_label is not None:
        print(etree.tostring(after_label))