Last active
November 27, 2022 16:06
-
-
Save funkey7dan/66f67945127935a614e6a4f8c9b4c76a to your computer and use it in GitHub Desktop.
Revisions
-
funkey7dan revised this gist
Nov 27, 2022. 1 changed file with 0 additions and 16 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -3,22 +3,6 @@ import requests import time def get_spouses(seen,spouse_list): to_crawl = spouse_list -
funkey7dan created this gist
Nov 27, 2022. There are no files selected for viewing
"""Crawl Wikipedia for the descendants of George V and their royal spouses.

Starting from the George V article, the crawler follows every ``/wiki/`` link
found in table rows whose header contains "Issue", collects the ``/wiki/``
links found in rows whose header contains "Spouse", keeps the spouse pages
whose text contains "was Queen" or "was King", and writes every collected URL
to ``out.txt`` (one per line).
"""
# NOTE: the gist viewer flagged the original file as containing hidden or
# bidirectional Unicode text that may be interpreted differently than it
# appears; see the NOTE(review) on _MONARCH_XPATH below.

import time

import requests
from bs4 import BeautifulSoup
from lxml import etree

# Seconds to wait before every HTTP request, so the crawl stays polite.
_REQUEST_DELAY_SECONDS = 1
# Seconds after which a stalled HTTP request is abandoned instead of hanging.
_REQUEST_TIMEOUT_SECONDS = 30

# hrefs of /wiki/ links in the cells that follow an "Issue" / "Spouse" header.
_ISSUE_XPATH = "//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href[contains(.,'/wiki/')]"
_SPOUSE_XPATH = "//tr/th[contains(.,'Spouse')]//following-sibling::*//a/@href[contains(.,'/wiki/')]"

# Selects the root element only when the page text contains one of the phrases.
# The original expression ended in "/text()", which counted the *direct text
# children of <html>* and therefore depended on incidental whitespace nodes
# rather than on the predicate itself.
# NOTE(review): the scraped source had a line break inside this literal,
# between "was " and "King" (possibly a hidden Unicode character); it is
# reconstructed here with a single plain space -- confirm against the gist.
_MONARCH_XPATH = "/*[contains(.,'was Queen') or contains(.,'was King')]"


def _fetch_doc(url):
    """Download *url* and return the page parsed as an lxml element tree.

    Sleeps ``_REQUEST_DELAY_SECONDS`` before the request. The HTML is first
    normalised through BeautifulSoup and then re-parsed with lxml so that
    XPath queries can be run on it.
    """
    time.sleep(_REQUEST_DELAY_SECONDS)
    res = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    soup = BeautifulSoup(res.text, features="lxml")
    return etree.HTML(str(soup))


def get_spouses(seen, spouse_list):
    """Add to *seen* every spouse page that mentions "was Queen"/"was King".

    Args:
        seen: set of URLs already collected; updated in place.
        spouse_list: list of absolute spouse-page URLs to inspect. The list
            itself is left untouched.

    Returns:
        The same *seen* set, for convenience.
    """
    # Work on a copy so the caller's list is not emptied as a side effect.
    to_crawl = list(spouse_list)
    # URLs already downloaded here, so duplicate non-royal spouses are not
    # fetched again (they never end up in *seen*).
    checked = set()
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen or next_crawl in checked:
            continue
        checked.add(next_crawl)
        doc = _fetch_doc(next_crawl)
        if doc.xpath(_MONARCH_XPATH):
            seen.add(next_crawl)
    return seen


def get_royal_family():
    """Crawl from George V through "Issue" links and save the URLs found.

    Prints the resulting set and its size, then writes the URLs to
    ``out.txt`` in the current working directory, one URL per line.
    """
    to_crawl = ["https://en.wikipedia.org/wiki/George_V"]
    seen = set()
    spouse_list = []
    while to_crawl:
        next_crawl = to_crawl.pop()
        if next_crawl in seen:
            continue
        doc = _fetch_doc(next_crawl)
        family_links = doc.xpath(_ISSUE_XPATH)
        spouse_links = doc.xpath(_SPOUSE_XPATH)
        # The hrefs are site-relative ("/wiki/..."); make them absolute.
        spouse_list += [f'https://en.wikipedia.org{link}' for link in spouse_links]
        to_crawl += [f'https://en.wikipedia.org{link}' for link in family_links]
        seen.add(next_crawl)
    seen = get_spouses(seen, spouse_list)
    print(seen)
    print(len(seen))
    with open(file="out.txt", mode="w", encoding="utf-8") as f:
        # "\n", not "/n": each URL goes on its own line. Sorted so the file
        # content is reproducible between runs.
        f.writelines([x + "\n" for x in sorted(seen)])


# Script entry point: the crawl starts as soon as the file is executed.
get_royal_family()