LorisBachert · April 15, 2025 06:29 · Nov 13, 2015 · Nov 13, 2015 · Sep 3, 2015
diff --git a/TikaExtractor.java b/TikaExtractor.java
@@ -15,8 +15,7 @@ public String extractTextOfDocument(File file) throws Exception {
 	PDFParserConfig pdfConfig = new PDFParserConfig();
 	pdfConfig.setExtractInlineImages(true);
 
-	// Parse Dokumente rekursiv um auch Bilder innerhalb von Textdokumenten
-	// einzulesen
+	// These lines are needed to also parse images embedded in files
 	ParseContext parseContext = new ParseContext();
 	parseContext.set(TesseractOCRConfig.class, config);
 	parseContext.set(PDFParserConfig.class, pdfConfig);

diff --git a/TikaExtractor.java b/TikaExtractor.java
@@ -1,8 +1,43 @@
-public String extractText(InputStream stream) throws IOException, SAXException, TikaException {
-	AutoDetectParser parser = new AutoDetectParser();
-	BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
+/**
+ * Uses Tika's {@link AutoDetectParser} to extract the text of a file.
+ * 
+ * @param file the file whose text content is extracted
+ * @return The text content of a file
+ */
+@Override
+public String extractTextOfDocument(File file) throws Exception {
+	InputStream fileStream = new FileInputStream(file);
+	Parser parser = new AutoDetectParser();
 	Metadata metadata = new Metadata();
-	parser.parse(stream, handler, metadata, new ParseContext());
-	String text = handler.toString();
-	return text;
+	BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
+
+	TesseractOCRConfig config = new TesseractOCRConfig();
+	PDFParserConfig pdfConfig = new PDFParserConfig();
+	pdfConfig.setExtractInlineImages(true);
+
+	// Parse Dokumente rekursiv um auch Bilder innerhalb von Textdokumenten
+	// einzulesen
+	ParseContext parseContext = new ParseContext();
+	parseContext.set(TesseractOCRConfig.class, config);
+	parseContext.set(PDFParserConfig.class, pdfConfig);
+	parseContext.set(Parser.class, parser); // need to add this to make sure
+											// recursive parsing happens!
+	try {
+		parser.parse(fileStream, handler, metadata, parseContext);
+		String text = handler.toString();
+		if (text.trim().isEmpty()) {
+			logger.warn("Could not extract text of '" + document.getName() + "'");
+		} else {
+			logger.debug("Successfully extracted the text of '" + document.getName() + "'");
+		}
+		return text;
+	} catch (IOException | SAXException | TikaException e) {
+		throw new Exception("TIKA was not able to exctract text of file '" + document.getName() + "'", e);
+	} finally {
+		try {
+			fileStream.close();
+		} catch (IOException e) {
+			throw new Exception(e);
+		}
+	}
 }
diff --git a/TikaExtractor.java b/TikaExtractor.java
@@ -0,0 +1,8 @@
+public String extractText(InputStream stream) throws IOException, SAXException, TikaException {
+	AutoDetectParser parser = new AutoDetectParser();
+	BodyContentHandler handler = new BodyContentHandler(Integer.MAX_VALUE);
+	Metadata metadata = new Metadata();
+	parser.parse(stream, handler, metadata, new ParseContext());
+	String text = handler.toString();
+	return text;
+}