funkey7dan · November 27, 2022 16:06
diff --git a/allroyall.py b/allroyall.py
 from bs4 import BeautifulSoup
 from lxml import etree
 import requests
 import time


 def get_spouses(seen,spouse_list):
    """Add spouse pages that mention a reigning King or Queen to ``seen``.

    Each URL in ``spouse_list`` that is not already in ``seen`` is downloaded
    once. If the page text contains ``'was Queen'`` or ``'was King'``, the URL
    is added to ``seen``.

    Args:
        seen: Set of URLs already collected; it is updated in place.
        spouse_list: Iterable of absolute page URLs to examine. It is not
            modified.

    Returns:
        The same ``seen`` set that was passed in.
    """
    # Work on a copy: pop() below would otherwise empty the caller's list.
    to_crawl = list(spouse_list)
    # URLs already downloaded in this call, so duplicates in spouse_list
    # that did not match are not fetched again.
    checked = set()
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen or next_crawl in checked:
            continue
        checked.add(next_crawl)
        res = requests.get(next_crawl)
        soup = BeautifulSoup(res.text,features = 'lxml')
        doc = etree.HTML(str(soup))
        # Test the root element itself. A trailing /text() step would only
        # select text nodes that are direct children of the root element, so
        # the outcome would hinge on the parser's whitespace handling rather
        # than on what the page says.
        if doc.xpath("/*[contains(.,'was Queen') or contains(.,'was King')]"):
            seen.add(next_crawl)
    return seen
    


 def get_royal_family():
    """Crawl Wikipedia from George V and record the pages that were reached.

    Starting at the George V article, every page is downloaded and the
    ``/wiki/`` links found next to table header cells containing ``Issue``
    are queued for crawling. Links found next to header cells containing
    ``Spouse`` are collected separately and handed to ``get_spouses`` once
    the crawl has finished.

    The resulting set of URLs and its size are printed, and the URLs are
    written to ``out.txt`` in the current directory, one per line.
    """
    to_crawl = ["https://en.wikipedia.org/wiki/George_V"]
    seen = set()
    spouse_list = []
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen:
            continue
        # Wait one second before every request (presumably to avoid
        # hammering the server).
        time.sleep(1)
        res = requests.get(next_crawl)
        soup = BeautifulSoup(res.text,features = 'lxml')
        doc = etree.HTML(str(soup))
        family_list = doc.xpath(
            "//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
        spouse_list_temp = doc.xpath(
            "//tr/th[contains(.,'Spouse')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
        # The hrefs are site-relative, so prefix the host to get full URLs.
        spouse_list += [f'https://en.wikipedia.org{link}' for link in spouse_list_temp]
        to_crawl += [f'https://en.wikipedia.org{link}' for link in family_list]
        seen.add(next_crawl)
    seen = get_spouses(seen,spouse_list)
    print(seen)
    print(len(seen))
    with open(file="out.txt",mode="w",encoding="utf-8") as f:
        # '\n' is the newline escape; '/n' would write the two literal
        # characters and leave the whole file on a single line.
        f.writelines([x+'\n' for x in seen])

 # Module-level call: the crawl starts whenever this file is run or imported.
 get_royal_family()
	from bs4 import BeautifulSoup
	from lxml import etree
	import requests
	import time


	def get_spouses(seen,spouse_list):
	    """Add spouse pages that mention a reigning King or Queen to ``seen``.

	    Each URL in ``spouse_list`` that is not already in ``seen`` is
	    downloaded once. If the page text contains ``'was Queen'`` or
	    ``'was King'``, the URL is added to ``seen``.

	    Args:
	        seen: Set of URLs already collected; it is updated in place.
	        spouse_list: Iterable of absolute page URLs to examine. It is
	            not modified.

	    Returns:
	        The same ``seen`` set that was passed in.
	    """
	    # Work on a copy: pop() below would otherwise empty the caller's list.
	    to_crawl = list(spouse_list)
	    # URLs already downloaded in this call, so duplicates in spouse_list
	    # that did not match are not fetched again.
	    checked = set()
	    while to_crawl:
	        next_crawl = to_crawl.pop()
	        if next_crawl in seen or next_crawl in checked:
	            continue
	        checked.add(next_crawl)
	        res = requests.get(next_crawl)
	        soup = BeautifulSoup(res.text,features = 'lxml')
	        doc = etree.HTML(str(soup))
	        # Test the root element itself. A trailing /text() step would only
	        # select text nodes that are direct children of the root element,
	        # so the outcome would hinge on the parser's whitespace handling
	        # rather than on what the page says.
	        if doc.xpath("/*[contains(.,'was Queen') or contains(.,'was King')]"):
	            seen.add(next_crawl)
	    return seen



	def get_royal_family():
	    """Crawl Wikipedia from George V and record the pages that were reached.

	    Starting at the George V article, every page is downloaded and the
	    ``/wiki/`` links found next to table header cells containing ``Issue``
	    are queued for crawling. Links found next to header cells containing
	    ``Spouse`` are collected separately and handed to ``get_spouses`` once
	    the crawl has finished.

	    The resulting set of URLs and its size are printed, and the URLs are
	    written to ``out.txt`` in the current directory, one per line.
	    """
	    to_crawl = ["https://en.wikipedia.org/wiki/George_V"]
	    seen = set()
	    spouse_list = []
	    while to_crawl:
	        next_crawl = to_crawl.pop()
	        if next_crawl in seen:
	            continue
	        # Wait one second before every request (presumably to avoid
	        # hammering the server).
	        time.sleep(1)
	        res = requests.get(next_crawl)
	        soup = BeautifulSoup(res.text,features = 'lxml')
	        doc = etree.HTML(str(soup))
	        family_list = doc.xpath(
	            "//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
	        spouse_list_temp = doc.xpath(
	            "//tr/th[contains(.,'Spouse')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
	        # The hrefs are site-relative, so prefix the host to get full URLs.
	        spouse_list += [f'https://en.wikipedia.org{link}' for link in spouse_list_temp]
	        to_crawl += [f'https://en.wikipedia.org{link}' for link in family_list]
	        seen.add(next_crawl)
	    seen = get_spouses(seen,spouse_list)
	    print(seen)
	    print(len(seen))
	    with open(file="out.txt",mode="w",encoding="utf-8") as f:
	        # '\n' is the newline escape; '/n' would write the two literal
	        # characters and leave the whole file on a single line.
	        f.writelines([x+'\n' for x in seen])

	# Module-level call: the crawl starts whenever this file is run or imported.
	get_royal_family()
No results found