Last active
April 15, 2025 06:29
-
-
Save LorisBachert/7b9ac408d4564caaabef to your computer and use it in GitHub Desktop.
Revisions
-
LorisBachert revised this gist
Nov 13, 2015 . 1 changed file with 1 addition and 2 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -15,8 +15,7 @@ public String extractTextOfDocument(File file) throws Exception { PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true); // To parse images in files those lines are needed ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); parseContext.set(PDFParserConfig.class, pdfConfig); -
LorisBachert revised this gist
Nov 13, 2015 . 1 changed file with 41 additions and 6 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,8 +1,43 @@ /** * Uses Tikas {@link AutoDetectParser} to extract the text of a file. * * @param document * @return The text content of a file */ @Override public String extractTextOfDocument(File file) throws Exception { InputStream fileStream = new FileInputStream(file); Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); TesseractOCRConfig config = new TesseractOCRConfig(); PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true); // Parse Dokumente rekursiv um auch Bilder innerhalb von Textdokumenten // einzulesen ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); parseContext.set(PDFParserConfig.class, pdfConfig); parseContext.set(Parser.class, parser); // need to add this to make sure // recursive parsing happens! try { parser.parse(fileStream, handler, metadata, parseContext); String text = handler.toString(); if (text.trim().isEmpty()) { logger.warn("Could not extract text of '" + document.getName() + "'"); } else { logger.debug("Successfully extracted the text of '" + document.getName() + "'"); } return text; } catch (IOException | SAXException | TikaException e) { throw new Exception("TIKA was not able to exctract text of file '" + document.getName() + "'", e); } finally { try { fileStream.close(); } catch (IOException e) { throw new Exception(e); } } } -
LorisBachert created this gist
Sep 3, 2015. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,8 @@ public String extractText(InputStream stream) throws IOException, SAXException, TikaException { AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); Metadata metadata = new Metadata(); parser.parse(stream, handler, metadata, new ParseContext()); String text = handler.toString(); return text; }