Skip to content

Instantly share code, notes, and snippets.

@DMHYT
Last active October 19, 2025 13:20
Show Gist options
  • Save DMHYT/3755eec84f4384e575611d2ccf568b2f to your computer and use it in GitHub Desktop.
Save DMHYT/3755eec84f4384e575611d2ccf568b2f to your computer and use it in GitHub Desktop.
yunzhan365.com PDF downloader
# Fetches yunzhan365.com book contents and saves it to PDF.
# Really slow but I just wanted to make this work in any way.
# Third-party modules: requests, beautifulsoup4, selenium, pillow
# Usage: python yunzhan.py <needed yunzhan book url>
from io import BytesIO
from json import dumps, loads
from math import floor
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from sys import argv
from time import sleep, time
from PIL import Image
if __name__ == "__main__":
    # Accept the book URL as a CLI argument, or prompt for it interactively.
    LINK = argv[1] if len(argv) > 1 else input("Link: ")

    # "basic/" URLs are landing pages; the real flip-book URL is the href of
    # the <a> inside div.book-info > h1.title, so scrape it out first.
    if "yunzhan365.com/basic/" in LINK:
        print("Fixing the URL...")
        soup = BeautifulSoup(requests.get(LINK).text, "html.parser")
        book_info = soup.find("div", { "class": "book-info" })
        title = book_info.find("h1", { "class": "title" })
        LINK = title.find("a").get("href")
        print("Fixed to " + LINK)

    # Headless Chrome with DevTools performance logging enabled so the page
    # image URLs can be harvested from the network log afterwards.
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument("--ignore-certificate-errors")
    options.add_argument("--log-level=3")
    # BUG FIX: the original built a DesiredCapabilities dict but never passed
    # it to the driver (Selenium 4 dropped the desired_capabilities argument),
    # so performance logging was never requested. Set it on the options object.
    options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
    driver = webdriver.Chrome(options=options)
    driver.get(LINK)
    sleep(5)  # give the flip-book viewer's own JS time to initialise

    # The viewer exposes the real page count as a global JS variable.
    NUM_PAGES = driver.execute_script("return originTotalPageCount;")

    # The viewer shows two pages per "flip" and loads the first few up front,
    # so floor((N - 3) / 2) flips reveal the remaining pages.
    flips = floor((NUM_PAGES - 3) / 2)
    for i in range(flips):  # range() of a non-positive count is simply empty
        print("Fetching pages " + str(5 + 2 * i) + "/" + str(NUM_PAGES) + "...", end="\r")
        driver.execute_script("nextPageFun(\"mouse wheel flip\")")
        sleep(0.5)  # let the newly revealed pages issue their image requests

    print("\nWriting the network log...")
    logs = driver.get_log("performance")
    with open("network_log.json", "w", encoding="utf-8") as f:
        f.write("[")
        for log in logs:
            network_log = loads(log["message"])["message"]
            # Keep only request/response/websocket events; everything else
            # in the performance log is noise for our purposes.
            if ("Network.response" in network_log["method"]
                    or "Network.request" in network_log["method"]
                    or "Network.webSocket" in network_log["method"]):
                f.write(dumps(network_log) + ",")
        # Trailing empty object keeps the JSON valid despite the last comma.
        f.write("{}]")
    driver.quit()

    with open("network_log.json", "r", encoding="utf-8") as f:
        logs = loads(f.read())

    print("Sorting the pages...")
    # Full-size page scans are served from ".../files/large/"; strip the
    # query string so each page yields one canonical URL.
    page_links = []
    for log in logs:
        try:
            url = log["params"]["request"]["url"]
            if "files/large/" in url:
                page_links.append(url.split('?')[0])
        except (KeyError, TypeError):
            pass  # not a request event with a URL; skip it

    # The flip animation appears to load each spread right-page-first, so
    # every pair after the first two pages arrives swapped; swap them back.
    for i in range(flips):
        p1 = 3 + 2 * i
        p2 = 4 + 2 * i
        if p2 < len(page_links):
            page_links[p1], page_links[p2] = page_links[p2], page_links[p1]

    images = []
    # FIX: report progress against the number of links actually found, not
    # the claimed NUM_PAGES (they can differ if some requests were missed).
    for page, link in enumerate(page_links, start=1):
        print("Loading pages " + str(page) + "/" + str(len(page_links)) + "...", end="\r")
        images.append(Image.open(BytesIO(requests.get(link).content)).convert("RGB"))

    print("\nSaving to PDF...")
    # Millisecond-timestamped filename avoids clobbering earlier downloads.
    images[0].save("result-" + str(round(time() * 1000)) + ".pdf",
                   save_all=True, append_images=images[1:])
    print("Done!")
@ngvannguyen
Copy link

It worked. Thanks

@DrDager
Copy link

DrDager commented Apr 5, 2025

Rewritten and updated version in NodeJS.
It will also translate the name(page title) to English in this format:
<translated-title>_<DD.MM.YYYY_HH.MM>.pdf

How to use:

  1. Install NodeJS from official site.
  2. Open/create an empty folder and create these 2 files: package.json, yunzhan.js.
  3. Put the code below into the corresponding files.
  4. run in console: npm install
  5. run in console: npm start

package.json

{
  "name": "url-to-pdf",
  "version": "1.0.0",
  "main": "yunzhan.js",
  "type": "module",
  "engines": {
    "node": ">=14.0.0"
  },
  "scripts": {
    "start": "node --no-deprecation yunzhan.js"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "description": "",
  "dependencies": {
    "@vitalets/google-translate-api": "^9.2.1",
    "axios": "^1.8.4",
    "cheerio": "^1.0.0",
    "image-size": "^2.0.2",
    "pdfkit": "^0.16.0",
    "puppeteer": "^24.6.0",
    "sharp": "^0.34.0"
  }
}

yunzhan.js

#!/usr/bin/env node
import puppeteer from 'puppeteer';
import axios from 'axios';
import sharp from 'sharp';
import PDFDocument from 'pdfkit';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { load } from 'cheerio';
import { translate } from '@vitalets/google-translate-api';
import readline from 'readline';
import sizeOfImport from 'image-size';

// Resolve __dirname in ES modules (import.meta.url -> file path -> directory)
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Ensure image-size works regardless of export style (default vs. namespace export)
const sizeOf = sizeOfImport.default || sizeOfImport;

// ---- Constants ----
const WAIT_AFTER_NAVIGATION = 3000;  // ms to wait after navigation
const WAIT_AFTER_FLIP = 1000;        // ms to wait after each page flip
const MAX_ATTEMPTS = 50;             // max consecutive flips yielding no new pages before giving up

// ---- Utility Functions ----
// Promise-based delay: resolves (with no value) after `ms` milliseconds.
const sleep = (ms) => new Promise((done) => setTimeout(done, ms));

// Ask `query` on stdin/stdout and resolve with the user's trimmed answer.
const promptInput = (query) =>
  new Promise((resolve) => {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    });
    rl.question(query, (answer) => {
      rl.close();
      resolve(answer.trim());
    });
  });

// Sanitise a title for use as a filename: drop punctuation, trim, and
// collapse each run of whitespace into a single hyphen.
function cleanTitle(title) {
  const stripped = title.replace(/[^\w\s-]/g, '').trim();
  return stripped.replace(/\s+/g, '-');
}

// Current local time as "DD.MM.YYYY_HH.MM" (used in the output filename).
const getFormattedDate = () => {
  const d = new Date();
  const two = (n) => String(n).padStart(2, '0');
  const datePart = `${two(d.getDate())}.${two(d.getMonth() + 1)}.${d.getFullYear()}`;
  const timePart = `${two(d.getHours())}.${two(d.getMinutes())}`;
  return `${datePart}_${timePart}`;
};

// ---- Main Process ----
(async () => {
  // 1. Get URL from command line argument or prompt the user
  let url = process.argv[2];
  if (!url) {
    url = await promptInput('Please give me a URL for parsing from yunzhan: ');
    if (!url) {
      console.error('No URL provided. Exiting.');
      process.exit(1);
    }
  }

  // 2. Fix URL if needed
  // "basic/" URLs are landing pages: the actual flip-book URL is the href
  // of the <a> inside div.book-info > h1.title, so fetch and scrape it.
  if (url.includes('yunzhan365.com/basic/')) {
    console.log('Fixing the URL...');
    try {
      const resp = await axios.get(url);
      const $ = load(resp.data);
      const bookInfo = $('div.book-info');
      const titleElem = bookInfo.find('h1.title');
      const fixedUrl = titleElem.find('a').attr('href');
      if (fixedUrl) {
        url = fixedUrl;
        console.log('Fixed to ' + url);
      } else {
        console.error('Could not find the fixed URL.');
        process.exit(1);
      }
    } catch (err) {
      console.error('Error fetching the URL:', err);
      process.exit(1);
    }
  }

  // 3. Launch Puppeteer and navigate to the page
  console.log('Launching headless browser...');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--ignore-certificate-errors', '--disable-gpu', '--log-level=3']
  });
  const page = await browser.newPage();

  console.log('Navigating to page...');
  await page.goto(url, { waitUntil: 'networkidle2' });
  await sleep(WAIT_AFTER_NAVIGATION); // extra settle time for the viewer's own JS

  // 4. Get total pages from the page's JS variable
  // `originTotalPageCount` is a global set by the flip-book viewer itself.
  let numPages = await page.evaluate(() =>
    typeof originTotalPageCount === 'number' ? originTotalPageCount : null
  );
  if (!numPages) {
    // With the 9999 fallback the collection loop below effectively runs
    // until MAX_ATTEMPTS consecutive flips stop producing new pages.
    console.log('Could not read originTotalPageCount; using fallback of 9999.');
    numPages = 9999;
  }
  console.log('Total pages (claimed):', numPages);

  // 5. Get and translate the page title
  let pageTitle = await page.title();
  console.log('Page title:', pageTitle);

  let translatedTitle = pageTitle;
  try {
    // Heuristic: any non-ASCII character triggers translation; the source
    // language is assumed to be Simplified Chinese (zh-CN).
    if (/[^\x00-\x7F]/.test(pageTitle)) {
      console.log('Translating title from Chinese (Simplified) to English...');
      const translation = await translate(pageTitle, { from: 'zh-CN', to: 'en' });
      translatedTitle = translation.text;
      console.log('Translated title:', translatedTitle);
    }
  } catch (err) {
    // Translation is best-effort; the original title still works as a filename.
    console.error('Translation error, using original title.', err);
  }

  console.log('Collecting images from DOM using page numbers...');

  // 6. Collect image URLs from the DOM using page numbers
  // pageMap maps page number -> image URL; keying by number both dedupes
  // repeat sightings and gives correct ordering regardless of flip order.
  const pageMap = {};
  let attempts = 0;
  let prevCount = 0;

  while (attempts < MAX_ATTEMPTS) {
    // Each rendered page lives in an element with id "page<N>" containing
    // an <img>; strip the query string so URLs compare cleanly.
    const newPages = await page.evaluate(() => {
      const elems = Array.from(document.querySelectorAll('[id^="page"]'));
      return elems.map(el => {
        const num = parseInt(el.id.replace('page', ''), 10);
        const img = el.querySelector('img');
        const src = img ? img.src.split('?')[0] : null;
        return { num, src };
      }).filter(item => item.src !== null);
    });

    for (const p of newPages) {
      if (!pageMap[p.num]) {
        pageMap[p.num] = p.src;
      }
    }

    const currentCount = Object.keys(pageMap).length;
    console.log(`Total collected pages so far: ${currentCount}...`);

    // `attempts` counts only *consecutive* stalls: it resets every time a
    // flip reveals at least one new page.
    if (currentCount > prevCount) {
      prevCount = currentCount;
      attempts = 0;
    } else {
      attempts++;
    }

    if (currentCount >= numPages) break;

    // Advance the flip-book via its own page-turn function, if present.
    await page.evaluate(() => {
      if (typeof nextPageFun === 'function') {
        nextPageFun('mouse wheel flip');
      }
    });
    await sleep(WAIT_AFTER_FLIP);
  }

  // Numeric sort of the collected page numbers restores reading order;
  // anything beyond the claimed page count is discarded.
  let sortedPages = Object.keys(pageMap)
    .map(Number)
    .sort((a, b) => a - b);
  if (sortedPages.length > numPages) {
    sortedPages = sortedPages.slice(0, numPages);
  }
  const collectedImageURLs = sortedPages.map(num => pageMap[num]);
  console.log(`Collected ${collectedImageURLs.length} images in DOM order.`);

  await browser.close();

  // 7. Prepare PDF filename: results/<safe-title>_<DD.MM.YYYY_HH.MM>.pdf
  const safeTitle = cleanTitle(translatedTitle);
  const formattedDate = getFormattedDate();
  const resultsFolder = path.join(__dirname, 'results');
  if (!fs.existsSync(resultsFolder)) {
    fs.mkdirSync(resultsFolder);
  }
  const pdfPath = path.join(resultsFolder, `${safeTitle}_${formattedDate}.pdf`);

  // 8. Download images concurrently
  // NOTE: the "Downloading page N" log fires when the request is *started*
  // (inside map), not when it completes; all downloads run in parallel.
  const downloadPromises = collectedImageURLs.map((link, idx) => {
    console.log(`Downloading page ${idx + 1} / ${collectedImageURLs.length}...`);
    return axios
      .get(link, { responseType: 'arraybuffer' })
      .then((res) => Buffer.from(res.data, 'binary'))
      .catch((err) => {
        // A failed page is dropped (null) rather than aborting the whole book.
        console.error('Error downloading image:', link, err);
        return null;
      });
  });
  const downloaded = await Promise.all(downloadPromises);
  const imagesBuffers = downloaded.filter((buf) => buf !== null);
  if (imagesBuffers.length === 0) {
    console.error('No images were downloaded. Exiting.');
    process.exit(1);
  }

  // 9. Generate PDF using PDFKit
  console.log('Saving to PDF...');
  const doc = new PDFDocument({ autoFirstPage: false });
  const writeStream = fs.createWriteStream(pdfPath);
  doc.pipe(writeStream);

  // One PDF page per image, sized exactly to the image's pixel dimensions.
  for (const buffer of imagesBuffers) {
    let dimensions;
    try {
      dimensions = sizeOf(buffer);
    } catch (err) {
      console.error('Error getting image dimensions, skipping this page.', err);
      continue;
    }
    // Only JPEG/PNG are passed straight through (PDFKit natively embeds only
    // those formats); anything else (e.g. WebP) is converted to PNG via sharp.
    if (!['jpg', 'jpeg', 'png'].includes(dimensions.type)) {
      try {
        const converted = await sharp(buffer).png().toBuffer();
        dimensions = sizeOf(converted);
        doc.addPage({ size: [dimensions.width, dimensions.height] });
        doc.image(converted, 0, 0, {
          width: dimensions.width,
          height: dimensions.height
        });
      } catch (err) {
        console.error('Error converting image to PNG, skipping this page.', err);
      }
    } else {
      doc.addPage({ size: [dimensions.width, dimensions.height] });
      doc.image(buffer, 0, 0, {
        width: dimensions.width,
        height: dimensions.height
      });
    }
  }
  doc.end();

  // Wait until PDFKit has flushed everything to disk before reporting success.
  await new Promise((resolve) => writeStream.on('finish', resolve));
  console.log('Done! PDF saved as', pdfPath);
})();

@brucmao
Copy link

brucmao commented Aug 25, 2025

you can use User scripts with tampermonkey
https://greasyfork.org/zh-CN/scripts/435884-wenku-doc-downloader

@wangwanjie
Copy link

#!/usr/bin/env node
import puppeteer from 'puppeteer';
import axios from 'axios';
import sharp from 'sharp';
import PDFDocument from 'pdfkit';
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import { load } from 'cheerio';
import { translate } from '@vitalets/google-translate-api';
import readline from 'readline';
import sizeOfImport from 'image-size';

// Resolve __dirname in ES modules (import.meta.url -> file path -> directory)
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Ensure image-size works regardless of export style (default vs. namespace export)
const sizeOf = sizeOfImport.default || sizeOfImport;

// ---- Constants ----
const WAIT_AFTER_NAVIGATION = 3000;  // ms to wait after navigation
const WAIT_AFTER_FLIP = 1000;        // ms to wait after each page flip
const MAX_ATTEMPTS = 50;             // max consecutive flips yielding no new pages before giving up

// ---- Utility Functions ----
// Promise-based delay: resolves (with no value) after `ms` milliseconds.
const sleep = (ms) => new Promise((done) => setTimeout(done, ms));

// Ask `query` on stdin/stdout and resolve with the user's trimmed answer.
const promptInput = (query) =>
  new Promise((resolve) => {
    const rl = readline.createInterface({
      input: process.stdin,
      output: process.stdout
    });
    rl.question(query, (answer) => {
      rl.close();
      resolve(answer.trim());
    });
  });

// Sanitise a title for use as a filename: drop punctuation, trim, and
// collapse each run of whitespace into a single hyphen.
function cleanTitle(title) {
  const stripped = title.replace(/[^\w\s-]/g, '').trim();
  return stripped.replace(/\s+/g, '-');
}

// Current local time as "DD.MM.YYYY_HH.MM" (used in the output filename).
const getFormattedDate = () => {
  const d = new Date();
  const two = (n) => String(n).padStart(2, '0');
  const datePart = `${two(d.getDate())}.${two(d.getMonth() + 1)}.${d.getFullYear()}`;
  const timePart = `${two(d.getHours())}.${two(d.getMinutes())}`;
  return `${datePart}_${timePart}`;
};

// ---- Main Process ----
(async () => {
  // 1. Get URL from command line argument or prompt the user
  let url = process.argv[2];
  if (!url) {
    url = await promptInput('Please give me a URL for parsing from yunzhan: ');
    if (!url) {
      console.error('No URL provided. Exiting.');
      process.exit(1);
    }
  }

  // 2. Fix URL if needed
  // "basic/" URLs are landing pages: the actual flip-book URL is the href
  // of the <a> inside div.book-info > h1.title, so fetch and scrape it.
  if (url.includes('yunzhan365.com/basic/')) {
    console.log('Fixing the URL...');
    try {
      const resp = await axios.get(url);
      const $ = load(resp.data);
      const bookInfo = $('div.book-info');
      const titleElem = bookInfo.find('h1.title');
      const fixedUrl = titleElem.find('a').attr('href');
      if (fixedUrl) {
        url = fixedUrl;
        console.log('Fixed to ' + url);
      } else {
        console.error('Could not find the fixed URL.');
        process.exit(1);
      }
    } catch (err) {
      console.error('Error fetching the URL:', err);
      process.exit(1);
    }
  }

  // 3. Launch Puppeteer and navigate to the page
  console.log('Launching headless browser...');
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--ignore-certificate-errors', '--disable-gpu', '--log-level=3']
  });
  const page = await browser.newPage();

  console.log('Navigating to page...');
  await page.goto(url, { waitUntil: 'networkidle2' });
  await sleep(WAIT_AFTER_NAVIGATION); // extra settle time for the viewer's own JS

  // 4. Get total pages from the page's JS variable
  // `originTotalPageCount` is a global set by the flip-book viewer itself.
  let numPages = await page.evaluate(() =>
    typeof originTotalPageCount === 'number' ? originTotalPageCount : null
  );
  if (!numPages) {
    // With the 9999 fallback the collection loop below effectively runs
    // until MAX_ATTEMPTS consecutive flips stop producing new pages.
    console.log('Could not read originTotalPageCount; using fallback of 9999.');
    numPages = 9999;
  }
  console.log('Total pages (claimed):', numPages);

  // 5. Get and translate the page title
  let pageTitle = await page.title();
  console.log('Page title:', pageTitle);

  let translatedTitle = pageTitle;
  try {
    // Heuristic: any non-ASCII character triggers translation; the source
    // language is assumed to be Simplified Chinese (zh-CN).
    if (/[^\x00-\x7F]/.test(pageTitle)) {
      console.log('Translating title from Chinese (Simplified) to English...');
      const translation = await translate(pageTitle, { from: 'zh-CN', to: 'en' });
      translatedTitle = translation.text;
      console.log('Translated title:', translatedTitle);
    }
  } catch (err) {
    // Translation is best-effort; the original title still works as a filename.
    console.error('Translation error, using original title.', err);
  }

  console.log('Collecting images from DOM using page numbers...');

  // 6. Collect image URLs from the DOM using page numbers
  // pageMap maps page number -> image URL; keying by number both dedupes
  // repeat sightings and gives correct ordering regardless of flip order.
  const pageMap = {};
  let attempts = 0;
  let prevCount = 0;

  while (attempts < MAX_ATTEMPTS) {
    // Each rendered page lives in an element with id "page<N>" containing
    // an <img>; strip the query string so URLs compare cleanly.
    const newPages = await page.evaluate(() => {
      const elems = Array.from(document.querySelectorAll('[id^="page"]'));
      return elems.map(el => {
        const num = parseInt(el.id.replace('page', ''), 10);
        const img = el.querySelector('img');
        const src = img ? img.src.split('?')[0] : null;
        return { num, src };
      }).filter(item => item.src !== null);
    });

    for (const p of newPages) {
      if (!pageMap[p.num]) {
        pageMap[p.num] = p.src;
      }
    }

    const currentCount = Object.keys(pageMap).length;
    console.log(`Total collected pages so far: ${currentCount}...`);

    // `attempts` counts only *consecutive* stalls: it resets every time a
    // flip reveals at least one new page.
    if (currentCount > prevCount) {
      prevCount = currentCount;
      attempts = 0;
    } else {
      attempts++;
    }

    if (currentCount >= numPages) break;

    // Advance the flip-book via its own page-turn function, if present.
    await page.evaluate(() => {
      if (typeof nextPageFun === 'function') {
        nextPageFun('mouse wheel flip');
      }
    });
    await sleep(WAIT_AFTER_FLIP);
  }

  // Numeric sort of the collected page numbers restores reading order;
  // anything beyond the claimed page count is discarded.
  let sortedPages = Object.keys(pageMap)
    .map(Number)
    .sort((a, b) => a - b);
  if (sortedPages.length > numPages) {
    sortedPages = sortedPages.slice(0, numPages);
  }
  const collectedImageURLs = sortedPages.map(num => pageMap[num]);
  console.log(`Collected ${collectedImageURLs.length} images in DOM order.`);

  await browser.close();

  // 7. Prepare PDF filename: results/<safe-title>_<DD.MM.YYYY_HH.MM>.pdf
  const safeTitle = cleanTitle(translatedTitle);
  const formattedDate = getFormattedDate();
  const resultsFolder = path.join(__dirname, 'results');
  if (!fs.existsSync(resultsFolder)) {
    fs.mkdirSync(resultsFolder);
  }
  const pdfPath = path.join(resultsFolder, `${safeTitle}_${formattedDate}.pdf`);

  // 8. Download images concurrently
  // NOTE: the "Downloading page N" log fires when the request is *started*
  // (inside map), not when it completes; all downloads run in parallel.
  const downloadPromises = collectedImageURLs.map((link, idx) => {
    console.log(`Downloading page ${idx + 1} / ${collectedImageURLs.length}...`);
    return axios
      .get(link, { responseType: 'arraybuffer' })
      .then((res) => Buffer.from(res.data, 'binary'))
      .catch((err) => {
        // A failed page is dropped (null) rather than aborting the whole book.
        console.error('Error downloading image:', link, err);
        return null;
      });
  });
  const downloaded = await Promise.all(downloadPromises);
  const imagesBuffers = downloaded.filter((buf) => buf !== null);
  if (imagesBuffers.length === 0) {
    console.error('No images were downloaded. Exiting.');
    process.exit(1);
  }

  // 9. Generate PDF using PDFKit
  console.log('Saving to PDF...');
  const doc = new PDFDocument({ autoFirstPage: false });
  const writeStream = fs.createWriteStream(pdfPath);
  doc.pipe(writeStream);

  // One PDF page per image, sized exactly to the image's pixel dimensions.
  for (const buffer of imagesBuffers) {
    let dimensions;
    try {
      dimensions = sizeOf(buffer);
    } catch (err) {
      console.error('Error getting image dimensions, skipping this page.', err);
      continue;
    }
    // Only JPEG/PNG are passed straight through (PDFKit natively embeds only
    // those formats); anything else (e.g. WebP) is converted to PNG via sharp.
    if (!['jpg', 'jpeg', 'png'].includes(dimensions.type)) {
      try {
        const converted = await sharp(buffer).png().toBuffer();
        dimensions = sizeOf(converted);
        doc.addPage({ size: [dimensions.width, dimensions.height] });
        doc.image(converted, 0, 0, {
          width: dimensions.width,
          height: dimensions.height
        });
      } catch (err) {
        console.error('Error converting image to PNG, skipping this page.', err);
      }
    } else {
      doc.addPage({ size: [dimensions.width, dimensions.height] });
      doc.image(buffer, 0, 0, {
        width: dimensions.width,
        height: dimensions.height
      });
    }
  }
  doc.end();

  // Wait until PDFKit has flushed everything to disk before reporting success.
  await new Promise((resolve) => writeStream.on('finish', resolve));
  console.log('Done! PDF saved as', pdfPath);
})();

Thanks! Can you please add support for URLs that require a password to open? For example https://book.yunzhan365.com/irtl/ycmy/mobile/index.html — the password is 999999.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment