#!/usr/bin/env python3
# ------------------------------------------------------------------
# Script Name:   pdfcwcount_range.py
# Description:   This script processes a range of PDF files and
#                identifies those with a character count exceeding
#                a specified threshold.
# Website:       https://gist.github.com/ostechnix
# Version:       1.0
# Usage:         py pdf_processor.py "START.pdf-END.pdf" CHAR_THRESHOLD
# Use Gist:      curl -s https://gist.github.com/alsiesta/07dcded8b0a2e05c0306d0922ad5c9bb/raw/countcharacter_in_pdf_range.py | py - "output_chunk_1.pdf-output_chunk_30.pdf" 40000
# ------------------------------------------------------------------

import os
import re
import sys
from typing import List, Optional, Tuple, Union

try:
    from PyPDF2 import PdfReader
except ImportError:
    # Keep the module importable (usage message, range parsing) without
    # PyPDF2; count_characters_in_pdf() reports the missing dependency.
    PdfReader = None

USAGE = 'Usage: py pdf_processor.py "START.pdf-END.pdf" CHAR_THRESHOLD'

# "<start>.pdf-<end>.pdf": the separator is the hyphen that follows the first
# ".pdf", so hyphens inside the file names themselves are tolerated.
_RANGE_PATTERN = re.compile(r"(.+?\.pdf)-(.+\.pdf)")


def count_characters_in_pdf(pdf_path):
    """Return the number of characters of extractable text in a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The summed length of the text extracted from every page. Pages for
        which ``extract_text()`` returns a falsy value count as zero.

    Raises:
        ImportError: If PyPDF2 is not installed.
    """
    if PdfReader is None:
        raise ImportError(
            "PyPDF2 is required to read PDF files (pip install PyPDF2)"
        )
    reader = PdfReader(pdf_path)
    # Summing per-page lengths avoids building one large throwaway string.
    return sum(len(page.extract_text() or "") for page in reader.pages)


def _split_range(range_param: str) -> Tuple[str, str]:
    """Split a ``START-END`` range string into its two file names."""
    match = _RANGE_PATTERN.fullmatch(range_param)
    if match:
        return match.group(1), match.group(2)
    # Fallback for names given without the ".pdf" extension: only an
    # unambiguous single hyphen can be accepted as the separator.
    parts = range_param.split("-")
    if len(parts) != 2:
        raise ValueError(
            f"expected 'START.pdf-END.pdf', got {range_param!r}"
        )
    return parts[0], parts[1]


def _trailing_index(pdf_name: str) -> int:
    """Return the integer after the last underscore of a PDF file name."""
    return int(pdf_name.split("_")[-1].replace(".pdf", ""))


def generate_pdf_list(range_param):
    """Expand a range such as ``"chunk_1.pdf-chunk_3.pdf"`` into file names.

    The numeric index is the part after the last underscore of each name;
    the prefix (everything before that underscore) is taken from the start
    name only.

    Args:
        range_param: Range string of the form ``START.pdf-END.pdf``.

    Returns:
        A list of ``"{prefix}_{i}.pdf"`` names for every index from the start
        index to the end index inclusive (empty if start > end).

    Raises:
        ValueError: If the range cannot be split into two names or an index
            is not an integer.
    """
    start_pdf, end_pdf = _split_range(range_param)
    start_index = _trailing_index(start_pdf)
    end_index = _trailing_index(end_pdf)
    prefix = start_pdf.rsplit("_", 1)[0]  # e.g. "output_chunk"
    return [f"{prefix}_{i}.pdf" for i in range(start_index, end_index + 1)]


def process_pdfs(input_pdfs, char_threshold) -> Union[List[Tuple[str, int]], str]:
    """Find the PDFs whose character count exceeds ``char_threshold``.

    Paths that are not existing files are reported on stdout and skipped.

    Args:
        input_pdfs: Iterable of PDF paths to inspect.
        char_threshold: Count a PDF must strictly exceed to be reported.

    Returns:
        A list of ``(pdf_path, char_count)`` tuples when at least one PDF is
        over the threshold; otherwise a human-readable message string.
    """
    pdfs_over_threshold = []
    for pdf_path in input_pdfs:
        if not os.path.isfile(pdf_path):
            print(f"File not found: {pdf_path}")
            continue
        char_count = count_characters_in_pdf(pdf_path)
        if char_count > char_threshold:
            pdfs_over_threshold.append((pdf_path, char_count))

    if pdfs_over_threshold:
        return pdfs_over_threshold
    return f"No PDF has more than {char_threshold} characters"


def main(argv: Optional[List[str]] = None) -> int:
    """Run the command-line interface and return the process exit code.

    Args:
        argv: Arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when the arguments are missing or invalid.
    """
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 2:
        print(USAGE)
        return 1

    range_param, threshold_arg = args
    try:
        char_threshold = int(threshold_arg)
    except ValueError:
        print(
            f"Invalid character threshold: {threshold_arg!r} "
            "(expected an integer)",
            file=sys.stderr,
        )
        return 1

    try:
        input_pdfs = generate_pdf_list(range_param)
    except ValueError as err:
        print(f"Invalid range {range_param!r}: {err}", file=sys.stderr)
        return 1

    result = process_pdfs(input_pdfs, char_threshold)
    if isinstance(result, list):
        for pdf, count in result:
            print(f"PDF: {pdf} has {count} characters.")
    else:
        print(result)
    return 0


if __name__ == "__main__":
    sys.exit(main())