/** * Uses Tikas {@link AutoDetectParser} to extract the text of a file. * * @param document * @return The text content of a file */ @Override public String extractTextOfDocument(File file) throws Exception { InputStream fileStream = new FileInputStream(file); Parser parser = new AutoDetectParser(); Metadata metadata = new Metadata(); BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE); TesseractOCRConfig config = new TesseractOCRConfig(); PDFParserConfig pdfConfig = new PDFParserConfig(); pdfConfig.setExtractInlineImages(true); // To parse images in files those lines are needed ParseContext parseContext = new ParseContext(); parseContext.set(TesseractOCRConfig.class, config); parseContext.set(PDFParserConfig.class, pdfConfig); parseContext.set(Parser.class, parser); // need to add this to make sure // recursive parsing happens! try { parser.parse(fileStream, handler, metadata, parseContext); String text = handler.toString(); if (text.trim().isEmpty()) { logger.warn("Could not extract text of '" + document.getName() + "'"); } else { logger.debug("Successfully extracted the text of '" + document.getName() + "'"); } return text; } catch (IOException | SAXException | TikaException e) { throw new Exception("TIKA was not able to exctract text of file '" + document.getName() + "'", e); } finally { try { fileStream.close(); } catch (IOException e) { throw new Exception(e); } } }