yunzhan365.com PDF downloader
# Fetches yunzhan365.com book contents and saves them to a PDF.
# Really slow, but I just wanted to make this work in any way.
# Third-party modules: requests, beautifulsoup4, selenium, pillow
# Usage: python yunzhan.py <needed yunzhan book url>
from io import BytesIO
from json import dumps, loads
from math import floor
from sys import argv
from time import sleep, time

import requests
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver

if __name__ == "__main__":
    LINK = argv[1] if len(argv) > 1 else input("Link: ")
    # "basic" pages only describe the book; pull the actual reader URL out of them.
    if "yunzhan365.com/basic/" in LINK:
        print("Fixing the URL...")
        soup = BeautifulSoup(requests.get(LINK).text, "html.parser")
        book_info = soup.find("div", {"class": "book-info"})
        title = book_info.find("h1", {"class": "title"})
        LINK = title.find("a").get("href")
        print("Fixed to " + LINK)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--log-level=3")
    # Enable performance logging so the page image requests show up in driver.get_log().
    options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
    driver = webdriver.Chrome(options=options)
    driver.get(LINK)
    sleep(5)
    NUM_PAGES = driver.execute_script("return originTotalPageCount;")
    # The first few pages load up front; each flip loads the next two,
    # hence (N - 3) / 2 flips to reach the end of the book.
    flips = floor((NUM_PAGES - 3) / 2)
    if flips > 0:
        for i in range(flips):
            print("Fetching pages " + str(5 + 2 * i) + "/" + str(NUM_PAGES) + "...", end="\r")
            driver.execute_script("nextPageFun(\"mouse wheel flip\")")
            sleep(0.5)
    print("\nWriting the network log...")
    logs = driver.get_log("performance")
    with open("network_log.json", "w", encoding="utf-8") as f:
        f.write("[")
        for log in logs:
            network_log = loads(log["message"])["message"]
            if ("Network.response" in network_log["method"]
                    or "Network.request" in network_log["method"]
                    or "Network.webSocket" in network_log["method"]):
                f.write(dumps(network_log) + ",")
        f.write("{}]")
    driver.quit()
    with open("network_log.json", "r", encoding="utf-8") as f:
        logs = loads(f.read())
    print("Sorting the pages...")
    page_links = []
    for log in logs:
        try:
            url = log["params"]["request"]["url"]
            if "files/large/" in url:
                page_links.append(url.split("?")[0])
        except Exception:
            pass
    # The flipped pairs show up in the log out of order, so swap each pair back.
    if flips > 0:
        for i in range(flips):
            p1 = 3 + 2 * i
            p2 = 4 + 2 * i
            if p2 < len(page_links):
                page_links[p1], page_links[p2] = page_links[p2], page_links[p1]
    images = []
    for page in range(len(page_links)):
        print("Loading pages " + str(page + 1) + "/" + str(NUM_PAGES) + "...", end="\r")
        images.append(Image.open(BytesIO(requests.get(page_links[page]).content)).convert("RGB"))
    print("\nSaving to PDF...")
    images[0].save("result-" + str(round(time() * 1000)) + ".pdf", save_all=True, append_images=images[1:])
    print("Done!")
Rewritten and updated version in Node.js.
It also translates the page title to English and uses it in the PDF file name, in this format:
<translated-title>_<DD.MM.YYYY_HH.MM>.pdf
How to use:
- Install Node.js from the official site.
- Create an empty folder and add these two files: package.json and yunzhan.js.
- Put the code below into the corresponding files.
- run in console: npm install
- run in console: npm start (see the note below for passing the URL directly)
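Note: the script takes the book URL as its first argument and only prompts for it when none is given. npm forwards arguments placed after "--" to the script, so you can also pass the URL up front (placeholder shown, not a real link):

npm start -- <needed yunzhan book url>
# or run the script directly:
node --no-deprecation yunzhan.js <needed yunzhan book url>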
package.json
{
  "name": "url-to-pdf",
  "version": "1.0.0",
  "main": "yunzhan.js",
  "type": "module",
  "engines": {
    "node": ">=14.0.0"
  },
  "scripts": {
    "start": "node --no-deprecation yunzhan.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "description": "",
  "dependencies": {
    "@vitalets/google-translate-api": "^9.2.1",
    "axios": "^1.8.4",
    "cheerio": "^1.0.0",
    "image-size": "^2.0.2",
    "pdfkit": "^0.16.0",
    "puppeteer": "^24.6.0",
    "sharp": "^0.34.0"
  }
}
yunzhan.js
#!/usr/bin/env node
import puppeteer from 'puppeteer';
import axios from 'axios';
import sharp from 'sharp';
import PDFDocument from 'pdfkit';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { load } from 'cheerio';
import { translate } from '@vitalets/google-translate-api';
import readline from 'readline';
import sizeOfImport from 'image-size';

// Resolve __dirname in ES modules
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Ensure image-size works regardless of export style
const sizeOf = sizeOfImport.default || sizeOfImport;

// ---- Constants ----
const WAIT_AFTER_NAVIGATION = 3000; // ms to wait after navigation
const WAIT_AFTER_FLIP = 1000;       // ms to wait after each page flip
const MAX_ATTEMPTS = 50;            // maximum iterations if no new pages appear

// ---- Utility Functions ----
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

const promptInput = (query) => {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout
  });
  return new Promise((resolve) => {
    rl.question(query, (answer) => {
      rl.close();
      resolve(answer.trim());
    });
  });
};

const cleanTitle = (title) =>
  title.replace(/[^\w\s-]/g, '').trim().replace(/\s+/g, '-');

const getFormattedDate = () => {
  const now = new Date();
  const pad = (num) => String(num).padStart(2, '0');
  return `${pad(now.getDate())}.${pad(now.getMonth() + 1)}.${now.getFullYear()}_${pad(now.getHours())}.${pad(now.getMinutes())}`;
};
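// Examples (hypothetical inputs):
//   cleanTitle('My Book: 2024!')  -> 'My-Book-2024'
//   getFormattedDate()            -> e.g. '19.10.2025_13.20'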
// ---- Main Process ----
(async () => {
  // 1. Get URL from command line argument or prompt the user
  let url = process.argv[2];
  if (!url) {
    url = await promptInput('Please give me a URL for parsing from yunzhan: ');
    if (!url) {
      console.error('No URL provided. Exiting.');
      process.exit(1);
    }
  }

  // 2. Fix URL if needed
  if (url.includes('yunzhan365.com/basic/')) {
    console.log('Fixing the URL...');
    try {
      const resp = await axios.get(url);
      const $ = load(resp.data);
      const bookInfo = $('div.book-info');
      const titleElem = bookInfo.find('h1.title');
      const fixedUrl = titleElem.find('a').attr('href');
      if (fixedUrl) {
        url = fixedUrl;
        console.log('Fixed to ' + url);
      } else {
        console.error('Could not find the fixed URL.');
        process.exit(1);
      }
    } catch (err) {
      console.error('Error fetching the URL:', err);
      process.exit(1);
    }
  }

  // 3. Launch Puppeteer and navigate to the page
  console.log('Launching headless browser...');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--ignore-certificate-errors', '--disable-gpu', '--log-level=3']
  });
  const page = await browser.newPage();
  console.log('Navigating to page...');
  await page.goto(url, { waitUntil: 'networkidle2' });
  await sleep(WAIT_AFTER_NAVIGATION);

  // 4. Get total pages from the page's JS variable
  let numPages = await page.evaluate(() =>
    typeof originTotalPageCount === 'number' ? originTotalPageCount : null
  );
  if (!numPages) {
    console.log('Could not read originTotalPageCount; using fallback of 9999.');
    numPages = 9999;
  }
  console.log('Total pages (claimed):', numPages);

  // 5. Get and translate the page title
  let pageTitle = await page.title();
  console.log('Page title:', pageTitle);
  let translatedTitle = pageTitle;
  try {
    if (/[^\x00-\x7F]/.test(pageTitle)) {
      console.log('Translating title from Chinese (Simplified) to English...');
      const translation = await translate(pageTitle, { from: 'zh-CN', to: 'en' });
      translatedTitle = translation.text;
      console.log('Translated title:', translatedTitle);
    }
  } catch (err) {
    console.error('Translation error, using original title.', err);
  }

  console.log('Collecting images from DOM using page numbers...');
  // 6. Collect image URLs from the DOM using page numbers
  const pageMap = {};
  let attempts = 0;
  let prevCount = 0;
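  // Flip until every page has been seen, or stop once MAX_ATTEMPTS
  // consecutive flips add no new pages (stall detection).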
  while (attempts < MAX_ATTEMPTS) {
    const newPages = await page.evaluate(() => {
      const elems = Array.from(document.querySelectorAll('[id^="page"]'));
      return elems.map(el => {
        const num = parseInt(el.id.replace('page', ''), 10);
        const img = el.querySelector('img');
        const src = img ? img.src.split('?')[0] : null;
        return { num, src };
      }).filter(item => item.src !== null);
    });
    for (const p of newPages) {
      if (!pageMap[p.num]) {
        pageMap[p.num] = p.src;
      }
    }
    const currentCount = Object.keys(pageMap).length;
    console.log(`Total collected pages so far: ${currentCount}...`);
    if (currentCount > prevCount) {
      prevCount = currentCount;
      attempts = 0;
    } else {
      attempts++;
    }
    if (currentCount >= numPages) break;
    await page.evaluate(() => {
      if (typeof nextPageFun === 'function') {
        nextPageFun('mouse wheel flip');
      }
    });
    await sleep(WAIT_AFTER_FLIP);
  }

  let sortedPages = Object.keys(pageMap)
    .map(Number)
    .sort((a, b) => a - b);
  if (sortedPages.length > numPages) {
    sortedPages = sortedPages.slice(0, numPages);
  }
  const collectedImageURLs = sortedPages.map(num => pageMap[num]);
  console.log(`Collected ${collectedImageURLs.length} images in DOM order.`);
  await browser.close();

  // 7. Prepare PDF filename
  const safeTitle = cleanTitle(translatedTitle);
  const formattedDate = getFormattedDate();
  const resultsFolder = path.join(__dirname, 'results');
  if (!fs.existsSync(resultsFolder)) {
    fs.mkdirSync(resultsFolder);
  }
  const pdfPath = path.join(resultsFolder, `${safeTitle}_${formattedDate}.pdf`);

  // 8. Download images concurrently
  const downloadPromises = collectedImageURLs.map((link, idx) => {
    console.log(`Downloading page ${idx + 1} / ${collectedImageURLs.length}...`);
    return axios
      .get(link, { responseType: 'arraybuffer' })
      .then((res) => Buffer.from(res.data, 'binary'))
      .catch((err) => {
        console.error('Error downloading image:', link, err);
        return null;
      });
  });
  const downloaded = await Promise.all(downloadPromises);
  const imagesBuffers = downloaded.filter((buf) => buf !== null);
  if (imagesBuffers.length === 0) {
    console.error('No images were downloaded. Exiting.');
    process.exit(1);
  }

  // 9. Generate PDF using PDFKit
  console.log('Saving to PDF...');
  const doc = new PDFDocument({ autoFirstPage: false });
  const writeStream = fs.createWriteStream(pdfPath);
  doc.pipe(writeStream);
  for (const buffer of imagesBuffers) {
    let dimensions;
    try {
      dimensions = sizeOf(buffer);
    } catch (err) {
      console.error('Error getting image dimensions, skipping this page.', err);
      continue;
    }
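    // PDFKit can embed only JPEG and PNG, so convert anything else
    // (e.g. WebP) to PNG with sharp before adding the page.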
    if (!['jpg', 'jpeg', 'png'].includes(dimensions.type)) {
      try {
        const converted = await sharp(buffer).png().toBuffer();
        dimensions = sizeOf(converted);
        doc.addPage({ size: [dimensions.width, dimensions.height] });
        doc.image(converted, 0, 0, {
          width: dimensions.width,
          height: dimensions.height
        });
      } catch (err) {
        console.error('Error converting image to PNG, skipping this page.', err);
      }
    } else {
      doc.addPage({ size: [dimensions.width, dimensions.height] });
      doc.image(buffer, 0, 0, {
        width: dimensions.width,
        height: dimensions.height
      });
    }
  }
  doc.end();
  await new Promise((resolve) => writeStream.on('finish', resolve));
  console.log('Done! PDF saved as', pdfPath);
})();
You can use user scripts with Tampermonkey:
https://greasyfork.org/zh-CN/scripts/435884-wenku-doc-downloader
Thanks, can you please support URLs with a password to open? Such as https://book.yunzhan365.com/irtl/ycmy/mobile/index.html, the password is 999999.
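A minimal sketch of how the Node.js script above could be extended for password-protected books, assuming the viewer shows a password prompt on load; the #password selector and the Enter-to-submit behavior are hypothetical and must be checked against the real page:

// Hypothetical helper for password-protected books; the '#password'
// selector is an assumption, not the viewer's confirmed markup.
async function unlockBook(page, password) {
  const input = await page.$('#password'); // null if no prompt is shown
  if (!input || !password) return; // book is not protected, or nothing to type
  await input.type(password);
  await page.keyboard.press('Enter'); // assumes Enter submits the prompt
  await sleep(WAIT_AFTER_NAVIGATION); // reuse the script's sleep helper
}

// Usage, right after page.goto(url, { waitUntil: 'networkidle2' }):
//   await unlockBook(page, process.argv[3] || await promptInput('Password (if any): '));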
It worked. Thanks