Skip to content

Instantly share code, notes, and snippets.

@raphiz
Created September 28, 2015 19:52
Show Gist options
  • Save raphiz/3cd332d980d6f4e4cb9c to your computer and use it in GitHub Desktop.
Save raphiz/3cd332d980d6f4e4cb9c to your computer and use it in GitHub Desktop.

Revisions

  1. raphiz created this gist Sep 28, 2015.
    37 changes: 37 additions & 0 deletions pdf_remove_watermark.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,37 @@
    from PyPDF2 import PdfFileReader, PdfFileWriter
    from PyPDF2.pdf import ContentStream
    from PyPDF2.generic import TextStringObject, NameObject
    from PyPDF2.utils import b_

    # Text that identifies the watermark: any TJ text run whose first element
    # starts with this prefix is replaced.
    wm_text = 'Persönliches Exemplar von'
    # Replacement for the matched text run (empty string removes the text).
    replace_with = ''

    # The input file must stay open until the output has been written, since
    # the pages added to the writer still refer back to the reader's stream.
    with open('input.pdf', "rb") as input_stream:
        # Load the PDF into PyPDF2
        source = PdfFileReader(input_stream)
        output = PdfFileWriter()

        # For each page
        for page_number in range(source.getNumPages()):
            # Get the current page
            page = source.getPage(page_number)

            # A page without a content stream has nothing to rewrite; it is
            # still copied to the output unchanged below.
            if "/Contents" in page:
                content_object = page["/Contents"].getObject()
                content = ContentStream(content_object, source)

                # Loop over all PDF operations on the page
                for operands, operator in content.operations:
                    # You might adapt this part depending on your PDF file
                    if operator != b_("TJ"):
                        continue
                    # Skip TJ operators with no operands or an empty array
                    # instead of failing with an IndexError.
                    if not operands or not operands[0]:
                        continue
                    text = operands[0][0]
                    if isinstance(text, TextStringObject) and text.startswith(wm_text):
                        operands[0] = TextStringObject(replace_with)

                # Set the modified content as the content object on the page
                page[NameObject('/Contents')] = content

            # Add the page to the output
            output.addPage(page)

        # Write the result; the context manager flushes and closes the file.
        with open("output.pdf", "wb") as outputStream:
            output.write(outputStream)