Skip to content

Instantly share code, notes, and snippets.

@KernelPanicAUS
Created June 16, 2020 11:11
Show Gist options
  • Save KernelPanicAUS/429e88c65c2556a22d486b2dfff994b7 to your computer and use it in GitHub Desktop.
Save KernelPanicAUS/429e88c65c2556a22d486b2dfff994b7 to your computer and use it in GitHub Desktop.

Revisions

  1. KernelPanicAUS created this gist Jun 16, 2020.
    65 changes: 65 additions & 0 deletions dl.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,65 @@
    from lxml import html
    import requests
    import os

    # Page listing the books, grouped under one <h2> heading per category.
    root_url = "https://hnarayanan.github.io/springer-books/"

    # Heading text of the category whose books are downloaded.
    category = "Computer Science"

    # Target directory: "<cwd>/downloads/<category, lower-cased, spaces -> _>".
    download_path = (
        os.getcwd() + "/downloads/" + "_".join(category.lower().split(" "))
    )

    # Browser-like headers sent with every request to link.springer.com.
    # NOTE(review): the Cookie below is a captured session value and has
    # presumably expired -- confirm whether it is still required.
    # NOTE(review): "br" is advertised in Accept-Encoding; confirm the
    # environment can actually decode brotli responses.
    headers = {
        "Host": "link.springer.com",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "Referer": "https://link.springer.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Cookie": "sim-inst-token=1::1592330235412:4c812914; trackid=1897426dcf7a4d8f9c24344cd; recaptcha=8K1/HkRi4MLVzheCLwngJ5CMeCvMypHSbPY0yWv2KFc=",
        "Upgrade-Insecure-Requests": "1",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }


    def download_file(url, filename, *, timeout=(10, 60)):
        """Stream one file from link.springer.com into ``download_path``.

        Args:
            url: Site-relative URL of the file; it is appended to
                ``https://link.springer.com``.
            filename: Base name of the local file, without extension. Any
                ``/`` in it is replaced by ``_``.
            timeout: Timeout in seconds handed to ``requests.get`` (a single
                number or a ``(connect, read)`` tuple). Without one, a
                stalled connection would block the run indefinitely.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not respond within
                ``timeout``.
        """
        # Derive the extension from the path part only, so a query string or
        # fragment cannot leak into the local file name.
        url_path = url.split("?", 1)[0].split("#", 1)[0]
        extension = url_path.rsplit(".", 1)[-1]
        local_filename = f"{download_path}/{filename.replace('/', '_')}.{extension}"
        with requests.get(
            f"https://link.springer.com{url}",
            headers=headers,
            stream=True,
            timeout=timeout,
        ) as r:
            r.raise_for_status()
            # Write in chunks so the whole file is never held in memory.
            with open(local_filename, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)


    def extract_download_links(link, *, timeout=30):
        """Visit one book page and download every file it links to.

        The page title (lower-cased, spaces replaced by ``_``) is used as the
        base file name for each download.

        Args:
            link: Absolute URL of the book page.
            timeout: Timeout in seconds for fetching the book page, so a
                stalled connection cannot block the run indefinitely.

        Raises:
            requests.Timeout: If the page does not respond within ``timeout``.
        """
        print(f"Visiting link {link}")

        response = requests.get(link, timeout=timeout)
        book_page = html.fromstring(response.content)

        # A page without the expected title element is skipped instead of
        # raising IndexError and aborting the remaining books.
        titles = book_page.xpath('//div[@class="page-title"]/h1[1]/text()')
        if not titles:
            print(f"No title found at {link}, skipping")
            return
        title = titles[0]

        links = book_page.xpath('//a[contains(@class,"c-button__icon-right")][*]/@href')
        print(f"Title [{title}]")

        print(links)
        print("=======================")

        # The base file name is the same for every link on the page.
        base_name = title.lower().replace(" ", "_")
        for book_link in links:
            download_file(book_link, base_name)


    def main():
        """Download every book listed under ``category`` on ``root_url``.

        Fetches the page at ``root_url``, makes sure ``download_path`` exists,
        collects the book links found in the cards that follow the ``<h2>``
        whose text equals ``category``, and processes each link with
        ``extract_download_links``.

        Raises:
            requests.Timeout: If the index page does not respond in time.
        """
        # A timeout keeps a stalled connection from blocking the run forever.
        page = requests.get(root_url, timeout=30)
        tree = html.fromstring(page.content)

        # exist_ok avoids the race between checking for the directory and
        # creating it.
        os.makedirs(download_path, exist_ok=True)

        links = tree.xpath(
            f"//h2[text() ='{category}']/following-sibling::div[@class='row mt-2'][1]/div[*]/div[@class='card mb-2']/div/a/@href"
        )
        for link in links:
            extract_download_links(link)


    if __name__ == "__main__":
        main()