Skip to content

Instantly share code, notes, and snippets.

@funkey7dan
Last active November 27, 2022 16:06
Show Gist options
  • Select an option

  • Save funkey7dan/66f67945127935a614e6a4f8c9b4c76a to your computer and use it in GitHub Desktop.

Select an option

Save funkey7dan/66f67945127935a614e6a4f8c9b4c76a to your computer and use it in GitHub Desktop.

Revisions

  1. funkey7dan revised this gist Nov 27, 2022. 1 changed file with 0 additions and 16 deletions.
    16 changes: 0 additions & 16 deletions allroyall.py
    Original file line number Diff line number Diff line change
    @@ -3,22 +3,6 @@
    import requests
    import time

    # #page = requests.get("https://en.wikipedia.org/wiki/Elizabeth_II")
    # #page = requests.get("https://en.wikipedia.org/wiki/Anne,_Princess_Royal")
    # page = requests.get("https://en.wikipedia.org/wiki/Prince_Harry,_Duke_of_Sussex")
    # #page = requests.get("https://en.wikipedia.org/wiki/Charles_III")

    # soup = BeautifulSoup(page.text,features='lxml')
    # doc = etree.HTML(str(soup))
    # #print(set(doc.xpath('//a/@href[contains(.,"/wiki/Prince")]'))) #lotta princes
    # #print(set(doc.xpath('//a/@href[contains(.,"/wiki/King") and not(contains(.,"Kingdom"))]'))) # returns some kings
    # #print(doc.xpath('//a/@title[contains(.,"son")]/../@href[contains(.,"wiki")]'))
    # toprint = doc.xpath(("//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href"))
    # print(toprint)
    # #print(doc.xpath('//li[contains(.,"son")]//a/@href[contains(.,"wiki")]')) # returns '/wiki/William,_Prince_of_Wales', '/wiki/Prince_Harry,_Duke_of_Sussex'
    # #print(doc.xpath('//tr[contains(.,"father")]//a/@href[contains(.,"wiki")]')) # returns /wiki/Prince_Philip,_Duke_of_Edinburgh

    # #print(len(doc.xpath('//*[contains(.,"member of the British royal family") or contains(.,"member of the royal family") or contains(.,"Queen of the United Kingdom") or contains(.,"King of the United Kingdom")]/text()'))>0)

    def get_spouses(seen,spouse_list):
    to_crawl = spouse_list
  2. funkey7dan created this gist Nov 27, 2022.
    62 changes: 62 additions & 0 deletions allroyall.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,62 @@
    from bs4 import BeautifulSoup
    from lxml import etree
    import requests
    import time

    # #page = requests.get("https://en.wikipedia.org/wiki/Elizabeth_II")
    # #page = requests.get("https://en.wikipedia.org/wiki/Anne,_Princess_Royal")
    # page = requests.get("https://en.wikipedia.org/wiki/Prince_Harry,_Duke_of_Sussex")
    # #page = requests.get("https://en.wikipedia.org/wiki/Charles_III")

    # soup = BeautifulSoup(page.text,features='lxml')
    # doc = etree.HTML(str(soup))
    # #print(set(doc.xpath('//a/@href[contains(.,"/wiki/Prince")]'))) #lotta princes
    # #print(set(doc.xpath('//a/@href[contains(.,"/wiki/King") and not(contains(.,"Kingdom"))]'))) # returns some kings
    # #print(doc.xpath('//a/@title[contains(.,"son")]/../@href[contains(.,"wiki")]'))
    # toprint = doc.xpath(("//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href"))
    # print(toprint)
    # #print(doc.xpath('//li[contains(.,"son")]//a/@href[contains(.,"wiki")]')) # returns '/wiki/William,_Prince_of_Wales', '/wiki/Prince_Harry,_Duke_of_Sussex'
    # #print(doc.xpath('//tr[contains(.,"father")]//a/@href[contains(.,"wiki")]')) # returns /wiki/Prince_Philip,_Duke_of_Edinburgh

    # #print(len(doc.xpath('//*[contains(.,"member of the British royal family") or contains(.,"member of the royal family") or contains(.,"Queen of the United Kingdom") or contains(.,"King of the United Kingdom")]/text()'))>0)

    def get_spouses(seen, spouse_list):
        """Add spouse pages whose text mentions "was Queen" or "was King" to ``seen``.

        Every URL in ``spouse_list`` that is not already in ``seen`` is fetched
        and parsed once. When the text of the page contains "was Queen" or
        "was King", the URL is added to ``seen``.

        Args:
            seen: set of page URLs collected so far; updated in place.
            spouse_list: iterable of absolute page URLs to check. It is left
                unmodified.

        Returns:
            The same ``seen`` set that was passed in.
        """
        # URLs already fetched in this call, so a spouse that appears several
        # times in spouse_list but does not match is not downloaded again.
        checked = set()
        # Iterate over a copy so the caller's list is not drained as a side effect.
        for url in list(spouse_list):
            if url in seen or url in checked:
                continue
            checked.add(url)
            time.sleep(1)  # pause between requests to avoid hammering the server
            res = requests.get(url)
            soup = BeautifulSoup(res.text, features='lxml')
            doc = etree.HTML(str(soup))
            # Test the string value of the root element (i.e. all text on the
            # page). A trailing "/text()" step would only select text nodes that
            # are direct children of the root element, so it is deliberately
            # omitted and the predicate is evaluated as a boolean instead.
            mentions_monarch = doc.xpath(
                "boolean(/*[contains(.,'was Queen') or contains(.,'was King')])")
            if mentions_monarch:
                seen.add(url)
        return seen



    def get_royal_family():
        """Crawl Wikipedia infobox links starting from George V and save the URLs.

        Starting at the George V article, each page is fetched and the links
        found after the "Issue" and "Spouse" table headers are collected.
        "Issue" links are queued for crawling; "Spouse" links are gathered and
        handed to ``get_spouses`` once the crawl is finished. The resulting set
        of URLs is printed together with its size and written to ``out.txt``,
        one URL per line.

        Returns:
            None. Results are printed and written to ``out.txt`` in the current
            working directory.
        """
        base_url = "https://en.wikipedia.org"
        to_crawl = [f"{base_url}/wiki/George_V"]
        seen = set()
        spouse_list = []
        while to_crawl:
            next_crawl = to_crawl.pop()
            if next_crawl in seen:
                continue
            time.sleep(1)  # pause between requests to avoid hammering the server
            res = requests.get(next_crawl)
            soup = BeautifulSoup(res.text, features='lxml')
            doc = etree.HTML(str(soup))
            # hrefs of article links that follow the "Issue" / "Spouse" header cells.
            family_links = doc.xpath(
                "//tr/th[contains(.,'Issue')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
            spouse_links = doc.xpath(
                "//tr/th[contains(.,'Spouse')]//following-sibling::*//a/@href[contains(.,'/wiki/')]")
            # The hrefs are site-relative paths; prefix them to get absolute URLs.
            spouse_list += [f'{base_url}{link}' for link in spouse_links]
            to_crawl += [f'{base_url}{link}' for link in family_links]
            seen.add(next_crawl)
        seen = get_spouses(seen, spouse_list)
        print(seen)
        print(len(seen))
        with open(file="out.txt", mode="w", encoding="utf-8") as f:
            # Terminate each URL with a real newline ('\n', not the literal
            # two-character text '/n'); sort so the file content is stable
            # between runs.
            f.writelines(url + '\n' for url in sorted(seen))

    # Script entry point: the crawl starts as soon as this module is executed or imported.
    get_royal_family()