Last active
November 27, 2022 16:06
-
-
Save funkey7dan/66f67945127935a614e6a4f8c9b4c76a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from bs4 import BeautifulSoup | |
| from lxml import etree | |
| import requests | |
| import time | |
def get_spouses(seen, spouse_list):
    """Add spouse pages that pass the King/Queen check to ``seen``.

    Every URL in ``spouse_list`` that is not already in ``seen`` is
    downloaded and parsed. The URL is added to ``seen`` when the XPath
    query below yields at least one node.

    Args:
        seen: Set of URLs accepted so far. It is updated in place.
        spouse_list: Iterable of absolute page URLs to examine. It is
            copied first, so the caller's list is not emptied.

    Returns:
        The same ``seen`` set that was passed in.
    """
    # Work on a copy: popping from the caller's list would silently drain it.
    to_crawl = list(spouse_list)
    # URLs already downloaded in this call, whether or not they matched, so
    # that duplicates in spouse_list are not fetched a second time.
    checked = set()
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen or next_crawl in checked:
            continue
        checked.add(next_crawl)
        # Same one-second pause between requests as the crawl loop in
        # get_royal_family.
        time.sleep(1)
        # A timeout keeps one unresponsive server from hanging the crawl.
        res = requests.get(next_crawl, timeout=30)
        soup = BeautifulSoup(res.text, features='lxml')
        doc = etree.HTML(str(soup))
        # NOTE(review): the expression is anchored at the document root
        # ("/*") and then takes only that element's direct text nodes, so it
        # may match less often than intended -- confirm against real pages.
        if doc.xpath("/*[contains(.,'was Queen') or contains(.,'was King')]/text()"):
            seen.add(next_crawl)
    return seen
def get_royal_family():
    """Crawl Wikipedia from the George V page and save the visited URLs.

    Starting at ``https://en.wikipedia.org/wiki/George_V``, each page is
    downloaded and searched for table header cells whose text contains
    'Issue' or 'Spouse'. Links found next to an 'Issue' header are queued
    for crawling; links found next to a 'Spouse' header are collected and
    passed to ``get_spouses`` once the crawl is finished.

    The resulting set of URLs and its size are printed, and the URLs are
    written to ``out.txt``, one per line.

    Returns:
        None.
    """
    to_crawl = ["https://en.wikipedia.org/wiki/George_V"]
    seen = set()
    spouse_list = []
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen:
            continue
        # Pause between requests so the crawl does not hammer the server.
        time.sleep(1)
        # A timeout keeps one unresponsive server from hanging the crawl.
        res = requests.get(next_crawl, timeout=30)
        soup = BeautifulSoup(res.text, features='lxml')
        doc = etree.HTML(str(soup))
        family_list = doc.xpath(
            "//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
        spouse_list_temp = doc.xpath(
            "//tr/th[contains(.,'Spouse')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
        # The extracted hrefs are prefixed with the site origin to form
        # absolute URLs.
        spouse_list += [f'https://en.wikipedia.org{link}' for link in spouse_list_temp]
        to_crawl += [f'https://en.wikipedia.org{link}' for link in family_list]
        seen.add(next_crawl)
    seen = get_spouses(seen, spouse_list)
    print(seen)
    print(len(seen))
    with open(file="out.txt", mode="w", encoding="utf-8") as f:
        # Terminate each URL with a real newline; the previous '/n' literal
        # wrote the two characters '/' and 'n' and produced a single line.
        f.writelines(x + '\n' for x in seen)
# Run the crawl only when this file is executed as a script, so that importing
# the module does not start network requests or overwrite out.txt.
if __name__ == "__main__":
    get_royal_family()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment