Skip to content

Instantly share code, notes, and snippets.

@funkey7dan
Last active November 27, 2022 16:06
Show Gist options
  • Select an option

  • Save funkey7dan/66f67945127935a614e6a4f8c9b4c76a to your computer and use it in GitHub Desktop.

Select an option

Save funkey7dan/66f67945127935a614e6a4f8c9b4c76a to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
from lxml import etree
import requests
import time
def get_spouses(seen, spouse_list):
    """Add spouse pages that pass the "was Queen"/"was King" check to ``seen``.

    Every URL in ``spouse_list`` that is not already in ``seen`` is fetched
    and parsed. If the XPath check below yields at least one node, the URL
    is added to ``seen``.

    Args:
        seen: Set of already-accepted page URLs. Updated in place.
        spouse_list: Iterable of absolute page URLs to check. Not modified.

    Returns:
        The same ``seen`` set that was passed in.
    """
    # Work on a copy so the caller's list is not emptied as a side effect.
    pending = list(spouse_list)
    # URLs already fetched by this call; rejected pages never enter ``seen``,
    # so without this set duplicates would be downloaded again.
    checked = set()
    while pending:
        next_crawl = pending.pop()
        if next_crawl in seen or next_crawl in checked:
            continue
        checked.add(next_crawl)
        # Same one-second pause between requests as get_royal_family uses.
        time.sleep(1)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.get(next_crawl, timeout=30)
        soup = BeautifulSoup(res.text, features='lxml')
        doc = etree.HTML(str(soup))
        # NOTE(review): this selects direct text nodes of the root element
        # when the root's text content contains either phrase; it does not
        # look at a specific sentence or section of the page -- confirm that
        # this is the intended test.
        matches = doc.xpath("/*[contains(.,'was Queen') or contains(.,'was King')]/text()")
        if matches:
            seen.add(next_crawl)
    return seen
def get_royal_family():
    """Crawl Wikipedia pages starting from George V and record the URLs.

    Starting at the George V article, each fetched page is scanned for
    links found after table header cells containing 'Issue' (queued for
    further crawling) and after header cells containing 'Spouse' (collected
    and passed to ``get_spouses`` once the crawl is finished).

    The resulting set of URLs and its size are printed, and the URLs are
    written to ``out.txt`` one per line. Returns ``None``.
    """
    to_crawl = ["https://en.wikipedia.org/wiki/George_V"]
    seen = set()
    spouse_list = []
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen:
            continue
        # Pause between requests so the crawl does not hammer the server.
        time.sleep(1)
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.get(next_crawl, timeout=30)
        soup = BeautifulSoup(res.text, features='lxml')
        doc = etree.HTML(str(soup))
        family_list = doc.xpath(
            "//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
        spouse_list_temp = doc.xpath(
            "//tr/th[contains(.,'Spouse')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
        # The hrefs are prefixed with the site origin to form absolute URLs.
        spouse_list += [f'https://en.wikipedia.org{link}' for link in spouse_list_temp]
        to_crawl += [f'https://en.wikipedia.org{link}' for link in family_list]
        seen.add(next_crawl)
    seen = get_spouses(seen, spouse_list)
    print(seen)
    print(len(seen))
    with open(file="out.txt", mode="w", encoding="utf-8") as f:
        # Terminate each URL with a real newline; the previous '/n' literal
        # wrote every URL onto a single line separated by the text "/n".
        f.writelines([x + '\n' for x in seen])
# Start the crawl at module load; this call is unguarded, so it also runs on import.
get_royal_family()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment