"""Crawl Wikipedia starting from George V and record the pages reached.

The crawl follows links found in table rows whose header cell contains
'Issue', collects links from rows whose header cell contains 'Spouse', keeps
those spouse pages whose text contains 'was Queen' or 'was King', and writes
every retained URL to ``out.txt`` (one per line).
"""
import time

from bs4 import BeautifulSoup
from lxml import etree
import requests

WIKIPEDIA_ROOT = "https://en.wikipedia.org"
START_URL = "https://en.wikipedia.org/wiki/George_V"

# Pause before every request so the crawl does not hammer the server.
REQUEST_DELAY_SECONDS = 1
# Without a timeout a stalled connection would block the crawl forever.
REQUEST_TIMEOUT_SECONDS = 30

ISSUE_LINKS_XPATH = (
    "//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href[contains(.,'/wiki/')]"
)
SPOUSE_LINKS_XPATH = (
    "//tr/th[contains(.,'Spouse')]//following-sibling::*//a/@href[contains(.,'/wiki/')]"
)
# NOTE(review): the trailing ``/text()`` selects only text nodes that are
# direct children of the root element, so a match also depends on such nodes
# being present in the parsed tree -- confirm this is what is intended.
MONARCH_XPATH = "/*[contains(.,'was Queen') or contains(.,'was King')]/text()"


def _fetch_tree(url):
    """Download *url* (after the politeness delay) and return its lxml tree."""
    time.sleep(REQUEST_DELAY_SECONDS)
    res = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    # The markup is normalised by BeautifulSoup first and then re-parsed by
    # lxml so that XPath queries can be run against it.
    soup = BeautifulSoup(res.text, features="lxml")
    return etree.HTML(str(soup))


def _absolute(links):
    """Prefix every site-relative link in *links* with the Wikipedia root."""
    return [f"{WIKIPEDIA_ROOT}{link}" for link in links]


def get_spouses(seen, spouse_list):
    """Add to *seen* every spouse page that mentions 'was Queen' / 'was King'.

    Args:
        seen: set of URLs already accepted; it is updated in place.
        spouse_list: list of candidate spouse page URLs. It is not modified.

    Returns:
        The same *seen* set, including any newly accepted spouse URLs.
    """
    # Work on a copy so the caller's list is not drained by pop().
    to_crawl = list(spouse_list)
    # URLs already fetched in this call, whether or not they matched, so a
    # spouse linked from several pages is downloaded only once.
    checked = set()
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen or next_crawl in checked:
            continue
        checked.add(next_crawl)
        doc = _fetch_tree(next_crawl)
        if doc.xpath(MONARCH_XPATH):
            seen.add(next_crawl)
    return seen


def get_royal_family():
    """Crawl from George V, print the collected URLs and save them.

    Every page reached through 'Issue' rows is recorded. Links from 'Spouse'
    rows are gathered along the way and filtered by ``get_spouses``. The
    resulting set and its size are printed, and the URLs are written to
    ``out.txt`` in the current working directory, one per line.
    """
    to_crawl = [START_URL]
    seen = set()
    spouse_list = []
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen:
            continue
        doc = _fetch_tree(next_crawl)
        spouse_list += _absolute(doc.xpath(SPOUSE_LINKS_XPATH))
        to_crawl += _absolute(doc.xpath(ISSUE_LINKS_XPATH))
        seen.add(next_crawl)

    seen = get_spouses(seen, spouse_list)
    print(seen)
    print(len(seen))
    with open(file="out.txt", mode="w", encoding="utf-8") as f:
        # Terminate each URL with a real newline; the previous '/n' literal
        # ran all URLs together on a single line.
        f.writelines(f"{url}\n" for url in seen)


# NOTE(review): this runs the crawl on import as well as when executed as a
# script; kept as-is to preserve existing behaviour. Consider guarding it
# with ``if __name__ == "__main__":``.
get_royal_family()