KernelPanicAUS · June 16, 2020 11:11 · Jun 16, 2020
diff --git a/dl.py b/dl.py
@@ -0,0 +1,65 @@
+from lxml import html
+import requests
+import os
+
+# Index page that is scanned for book links.
+root_url = "https://hnarayanan.github.io/springer-books/"
+# Text of the <h2> heading on the index page whose books are downloaded.
+category = "Computer Science"
+# Files are saved under <current working directory>/downloads/<category>,
+# with the category lower-cased and its spaces replaced by underscores.
+download_path = os.path.join(
+    os.getcwd(), "downloads", category.lower().replace(" ", "_")
+)
+# Browser-like headers sent with every file download.
+# "Host" is deliberately not set: requests derives it from the URL, whereas a
+# pinned value would also be sent to any other host a redirect leads to.
+# "Accept-Encoding" does not offer "br": requests only decodes Brotli when an
+# optional Brotli package is installed, and an undecoded body would otherwise
+# be written to disk as the downloaded file.
+headers = {
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Accept-Encoding": "gzip, deflate",
+    "Referer": "https://link.springer.com/",
+    "DNT": "1",
+    "Connection": "keep-alive",
+    # NOTE(review): these look like session-specific values captured from a
+    # browser; they will presumably expire and arguably should not be kept in
+    # source control - confirm whether the downloads work without them.
+    "Cookie": "sim-inst-token=1::1592330235412:4c812914; trackid=1897426dcf7a4d8f9c24344cd; recaptcha=8K1/HkRi4MLVzheCLwngJ5CMeCvMypHSbPY0yWv2KFc=",
+    "Upgrade-Insecure-Requests": "1",
+    "Pragma": "no-cache",
+    "Cache-Control": "no-cache",
+}
+
+
+def download_file(url, filename, timeout=60):
+    """Download one file from link.springer.com into ``download_path``.
+
+    The data is streamed to ``<target>.part`` and renamed to its final name
+    only once the transfer has finished, so an interrupted download never
+    leaves a truncated file that looks complete.
+
+    Args:
+        url: Link to the file. A site-relative link is resolved against
+            ``https://link.springer.com``; an absolute URL is used as-is.
+        filename: Base name (without extension) of the saved file. Any ``/``
+            is replaced with ``_`` so the name stays inside ``download_path``.
+        timeout: Seconds to wait for the connection and for each chunk of
+            data before giving up.
+
+    Raises:
+        requests.HTTPError: The server answered with an error status.
+        requests.RequestException: The request failed or timed out.
+    """
+    # Imported locally so this function does not depend on the file's
+    # top-level import block.
+    from urllib.parse import urljoin, urlsplit
+
+    # Take the extension from the URL path only, so a query string or a link
+    # without any "." cannot end up in the file name.
+    extension = os.path.splitext(urlsplit(url).path)[1]
+    local_filename = os.path.join(
+        download_path, f"{filename.replace('/', '_')}{extension}"
+    )
+    partial_filename = f"{local_filename}.part"
+
+    try:
+        with requests.get(
+            urljoin("https://link.springer.com", url),
+            headers=headers,
+            stream=True,
+            timeout=timeout,
+        ) as r:
+            r.raise_for_status()
+            with open(partial_filename, "wb") as f:
+                for chunk in r.iter_content(chunk_size=8192):
+                    f.write(chunk)
+        os.replace(partial_filename, local_filename)
+    except BaseException:
+        # Also covers Ctrl-C: drop the incomplete file, then re-raise.
+        if os.path.exists(partial_filename):
+            os.remove(partial_filename)
+        raise
+
+
+def extract_download_links(link):
+    """Fetch one book page and download every file it links to.
+
+    Prints the page URL, the book title and the list of download links, then
+    calls ``download_file`` once per link. Files are named after the title,
+    lower-cased and with spaces replaced by underscores. A page without a
+    title heading is reported and skipped.
+
+    Args:
+        link: URL of the book page.
+
+    Raises:
+        requests.HTTPError: The book page answered with an error status.
+        requests.RequestException: The request failed or timed out.
+    """
+    print(f"Visiting link {link}")
+
+    # Without a timeout a stalled connection would block the run forever.
+    response = requests.get(link, timeout=60)
+    response.raise_for_status()
+    book_page = html.fromstring(response.content)
+
+    titles = book_page.xpath('//div[@class="page-title"]/h1[1]/text()')
+    if not titles:
+        # No usable name for the files; report it instead of raising a bare
+        # IndexError from the lookup.
+        print(f"No title found on {link}, skipping")
+        return
+    # Surrounding whitespace/newlines must not end up in the file name.
+    title = titles[0].strip()
+
+    # dict.fromkeys drops repeated hrefs while keeping their order, so the
+    # same file is not downloaded more than once.
+    links = list(
+        dict.fromkeys(
+            book_page.xpath('//a[contains(@class,"c-button__icon-right")][*]/@href')
+        )
+    )
+    print(f"Title [{title}]")
+
+    print(links)
+    print("=======================")
+    base_name = title.lower().replace(" ", "_")
+    for book_link in links:
+        download_file(book_link, base_name)
+
+
+def main():
+    """Download every book listed under ``category`` on the index page.
+
+    Fetches ``root_url``, makes sure ``download_path`` exists, collects the
+    links found in the section headed by ``category`` and passes each one to
+    ``extract_download_links``.
+
+    Raises:
+        requests.HTTPError: The index page answered with an error status.
+        requests.RequestException: The request failed or timed out.
+    """
+    # Without a timeout a stalled connection would block the run forever.
+    page = requests.get(root_url, timeout=60)
+    page.raise_for_status()
+    tree = html.fromstring(page.content)
+
+    # exist_ok avoids the check-then-create race of exists() + makedirs().
+    os.makedirs(download_path, exist_ok=True)
+
+    # The category is passed as an XPath variable rather than spliced into
+    # the expression, so a name containing a quote cannot break the query.
+    links = tree.xpath(
+        "//h2[text() =$category]/following-sibling::div[@class='row mt-2'][1]/div[*]/div[@class='card mb-2']/div/a/@href",
+        category=category,
+    )
+    if not links:
+        print(f"No books found for category [{category}]")
+    for link in links:
+        extract_download_links(link)
+
+
+# Start the download only when this file is run as a script, not on import.
+if __name__ == "__main__":
+    main()