Author: jukka
Date: Wed Dec 19 02:57:58 2007
New Revision: 605511
URL: http://svn.apache.org/viewvc?rev=605511&view=rev
Log:
JCR-1247: Add Warnlog on Extraction Failure
- Catch and log exceptions in all extractors
- Introduced the required slf4j dependency
Modified:
jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java
jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java
Modified: jackrabbit/trunk/jackrabbit-text-extractors/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/pom.xml?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/pom.xml Wed Dec 19 02:57:58 2007
@@ -75,6 +75,15 @@
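<artifactId>xercesImpl</artifactId>
</dependency>
<dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>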
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/CompositeTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -24,6 +24,9 @@
import java.util.Map;
import java.util.Set;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* Composite text extractor. This class presents a unified interface
* for a set of {@link TextExtractor} instances. The composite extractor
@@ -33,6 +36,12 @@
public class CompositeTextExtractor implements TextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(CompositeTextExtractor.class);
+
+ /**
* Configured {@link TextExtractor} instances, keyed by content types.
*/
private final Map extractors = new HashMap();
@@ -77,9 +86,15 @@
throws IOException {
TextExtractor extractor = (TextExtractor) extractors.get(type);
if (extractor != null) {
- return extractor.extractText(stream, type, encoding);
+ try {
+ return extractor.extractText(stream, type, encoding);
+ } catch (RuntimeException e) {
+ logger.warn("Failed to extract text content", e);
+ return new StringReader("");
+ }
} else {
stream.close();
+ logger.info("No extractor available for content type {}", type);
return new StringReader("");
}
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/HTMLTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -16,6 +16,8 @@
*/
package org.apache.jackrabbit.extractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.helpers.DefaultHandler;
@@ -36,6 +38,12 @@
public class HTMLTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(HTMLTextExtractor.class);
+
+ /**
* Creates a new <code>HTMLTextExtractor</code> instance.
*/
public HTMLTextExtractor() {
@@ -61,8 +69,10 @@
return new StringReader(parser.getContents());
} catch (TransformerConfigurationException e) {
+ logger.warn("Failed to extract HTML text content", e);
return new StringReader("");
} catch (TransformerException e) {
+ logger.warn("Failed to extract HTML text content", e);
return new StringReader("");
} finally {
stream.close();
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -21,12 +21,15 @@
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.Reader;
import java.io.InputStream;
import java.io.IOException;
import java.io.CharArrayWriter;
import java.io.CharArrayReader;
+import java.io.StringReader;
import java.util.Iterator;
/**
@@ -35,6 +38,12 @@
public class MsExcelTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(MsExcelTextExtractor.class);
+
+ /**
* Force loading of dependent class.
*/
static {
@@ -90,6 +99,9 @@
}
return new CharArrayReader(writer.toCharArray());
+ } catch (RuntimeException e) {
+ logger.warn("Failed to extract Excel text content", e);
+ return new StringReader("");
} finally {
stream.close();
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -21,6 +21,8 @@
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.Reader;
import java.io.InputStream;
@@ -29,6 +31,7 @@
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.io.ByteArrayInputStream;
+import java.io.StringReader;
/**
* Text extractor for Microsoft PowerPoint presentations.
@@ -36,6 +39,12 @@
public class MsPowerPointTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(MsPowerPointTextExtractor.class);
+
+ /**
* Force loading of dependent class.
*/
static {
@@ -66,6 +75,9 @@
reader.read(stream);
return new InputStreamReader(
new ByteArrayInputStream(baos.toByteArray()));
+ } catch (RuntimeException e) {
+ logger.warn("Failed to extract PowerPoint text content", e);
+ return new StringReader("");
} finally {
stream.close();
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -16,6 +16,8 @@
*/
package org.apache.jackrabbit.extractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.textmining.text.extraction.WordExtractor;
import java.io.Reader;
@@ -29,6 +31,12 @@
public class MsWordTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(MsWordTextExtractor.class);
+
+ /**
* Force loading of dependent class.
*/
static {
@@ -60,6 +68,7 @@
return new StringReader(text);
} catch (Exception e) {
+ logger.warn("Failed to extract Word text content", e);
return new StringReader("");
} finally {
stream.close();
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/OpenOfficeTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -16,6 +16,8 @@
*/
package org.apache.jackrabbit.extractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@@ -38,6 +40,12 @@
public class OpenOfficeTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(OpenOfficeTextExtractor.class);
+
+ /**
* Creates a new <code>OpenOfficeTextExtractor</code> instance.
*/
public OpenOfficeTextExtractor() {
@@ -82,8 +90,10 @@
return new StringReader(contentHandler.getContent());
} catch (ParserConfigurationException e) {
+ logger.warn("Failed to extract OpenOffice text content", e);
return new StringReader("");
} catch (SAXException e) {
+ logger.warn("Failed to extract OpenOffice text content", e);
return new StringReader("");
} finally {
stream.close();
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PdfTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -19,6 +19,8 @@
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.Reader;
import java.io.InputStream;
@@ -34,6 +36,12 @@
public class PdfTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(PdfTextExtractor.class);
+
+ /**
* Force loading of dependent class.
*/
static {
@@ -80,6 +88,7 @@
} catch (Exception e) {
// it may happen that PDFParser throws a runtime
// exception when parsing certain pdf documents
+ logger.warn("Failed to extract PDF text content", e);
return new StringReader("");
} finally {
stream.close();
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/PlainTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -23,12 +23,21 @@
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
* Text extractor for plain text.
*/
public class PlainTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(PlainTextExtractor.class);
+
+ /**
* Creates a new <code>PlainTextExtractor</code> instance.
*/
public PlainTextExtractor() {
@@ -59,6 +68,7 @@
return new InputStreamReader(stream);
}
} catch (UnsupportedEncodingException e) {
+ logger.warn("Failed to extract plain text content", e);
stream.close();
return new StringReader("");
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/RTFTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -19,6 +19,10 @@
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import java.io.Reader;
import java.io.InputStream;
import java.io.IOException;
@@ -30,6 +34,12 @@
public class RTFTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(RTFTextExtractor.class);
+
+ /**
* Creates a new <code>RTFTextExtractor</code> instance.
*/
public RTFTextExtractor() {
@@ -52,7 +62,8 @@
String text = doc.getText(0, doc.getLength());
return new StringReader(text);
} catch (BadLocationException e) {
- throw new IOException(e.getMessage());
+ logger.warn("Failed to extract RTF text content", e);
+ return new StringReader("");
} finally {
stream.close();
}
Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java?rev=605511&r1=605510&r2=605511&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java (original)
+++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/XMLTextExtractor.java Wed Dec 19 02:57:58 2007
@@ -28,6 +28,8 @@
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
@@ -45,6 +47,12 @@
public class XMLTextExtractor extends AbstractTextExtractor {
/**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(XMLTextExtractor.class);
+
+ /**
* Creates a new <code>XMLTextExtractor</code> instance.
*/
public XMLTextExtractor() {
@@ -93,8 +101,10 @@
return new CharArrayReader(writer.toCharArray());
} catch (ParserConfigurationException e) {
+ logger.warn("Failed to extract XML text content", e);
return new StringReader("");
} catch (SAXException e) {
+ logger.warn("Failed to extract XML text content", e);
return new StringReader("");
} finally {
stream.close();