Skip to content

Instantly share code, notes, and snippets.

@mattdeboard
Created June 21, 2013 03:45
Show Gist options
  • Save mattdeboard/5828655 to your computer and use it in GitHub Desktop.
Save mattdeboard/5828655 to your computer and use it in GitHub Desktop.

Revisions

  1. mattdeboard created this gist Jun 21, 2013.
    36 changes: 36 additions & 0 deletions extract.clj
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,36 @@

    (defn ExtractRenderListener
      "A RenderListener implementation that extracts images from a PDF and
      writes them to disk.

      `path` is used as a plain string prefix: each image is written to the
      file named by concatenating `path`, the image's reference number, a dot,
      and the image's file type. Include a trailing separator in `path` if it
      names a directory.

      Render events that carry no image are ignored. The text callbacks are
      no-ops."
      [^String path]
      (reify RenderListener
        (renderImage [_ renderInfo]
          ;; Bind the image first and only touch it when it is non-nil; asking
          ;; a nil image for its file type would throw before any guard ran.
          (when-let [image (.getImage renderInfo)]
            ;; NOTE(review): getRef is assumed to be non-nil here; iText may
            ;; return nil for inline images -- confirm and decide on a naming
            ;; scheme for that case if it matters.
            (let [ref-number (.getNumber (.getRef renderInfo))
                  file-type  (.getFileType image)
                  filename   (str path ref-number "." file-type)]
              (with-open [os (clojure.java.io/output-stream filename)]
                (.write os (.getImageAsBytes image))
                (.flush os)))))
        (beginTextBlock [_] nil)
        (endTextBlock [_] nil)
        (renderText [_ renderInfo] nil)))

    (defn MatrixRenderListener
      "Builds a RenderListener that records where each image is placed.

      For every image render event, entries 6, 7, 0 and 4 of the image's
      current transformation matrix are read (in that order) as x, y, w and h,
      and the tuple [x y w h] is conj'ed onto the collection stored in the
      `coords` ref under the keyword form of `page`. The update runs inside a
      transaction. The text callbacks are no-ops."
      [^Integer page]
      (reify RenderListener
        (renderImage [_ renderInfo]
          (let [ctm      (.getImageCTM renderInfo)
                x-pos    (.get ctm 6)
                y-pos    (.get ctm 7)
                width    (.get ctm 0)
                height   (.get ctm 4)
                page-key (keyword (str page))]
            ;; NOTE(review): if `coords` has no entry for page-key yet, conj on
            ;; nil produces a list rather than a vector -- confirm `coords` is
            ;; pre-seeded with vectors by the caller.
            (dosync
              (alter coords update-in [page-key] conj [x-pos y-pos width height])))
          nil)
        (beginTextBlock [_] nil)
        (endTextBlock [_] nil)
        (renderText [_ renderInfo] nil)))