tika-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Grant Ingersoll <gsing...@apache.org>
Subject [OT????] java.lang.IllegalStateException: NoWriterSupplied: No writer supplied for serializer.
Date Sat, 15 Nov 2008 18:58:25 GMT
This may just show my lack of understanding of XPath, etc., but when I  
apply [1] to TestParsers, I get the following output:
----------------------------
Val: <?xml version="1.0" encoding="UTF-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
     <head>
         <title/>
     </head>
     <body>
         <p>


     Solr Version Control System





       Overview

         The Solr source code resides in the Apache Subversion (SVN)  
repository.
         The command-line SVN client can be obtained here or as an  
optional package for cygwin.
         The TortoiseSVN GUI client for Windows can be obtained here.  
There
         are also SVN plugins available for older versions of Eclipse  
and
         IntelliJ IDEA that don't have subversion support already  
included.


     Here is some more text.  It contains a link.
     Text Here


</p>
     </body>
</html>


java.lang.IllegalStateException: NoWriterSupplied: No writer supplied for serializer.
	at org.apache.xml.serialize.XMLSerializer.startElement(Unknown Source)
	at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:75)
	at org.apache.tika.sax.xpath.MatchingContentHandler.startElement(MatchingContentHandler.java:62)
	at org.apache.tika.sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:75)
	at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:111)
	at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:115)
	at org.apache.tika.sax.XHTMLContentHandler.lazyStartDocument(XHTMLContentHandler.java:77)
	at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:110)
	at org.apache.tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:115)
	at org.apache.tika.parser.xml.XMLParser.parse(XMLParser.java:51)
	at org.apache.tika.TestParsers.testXML(TestParsers.java:96)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:39)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
	at com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:40)
-----------------------
-----------------------

Does anyone have any insight as to why wrapping an XMLSerializer inside of a
MatchingContentHandler is causing such a problem?

Thanks,
Grant



[1]
Index: src/test/java/org/apache/tika/TestParsers.java
===================================================================
--- src/test/java/org/apache/tika/TestParsers.java      (revision 713397)
+++ src/test/java/org/apache/tika/TestParsers.java      (working copy)
@@ -19,16 +19,28 @@
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.InputStream;
+import java.io.StringBufferInputStream;
+import java.io.StringWriter;
+import java.io.ByteArrayInputStream;
  import java.util.List;
+import java.nio.charset.Charset;

  import junit.framework.TestCase;

  import org.apache.tika.config.TikaConfig;
  import org.apache.tika.metadata.Metadata;
  import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.xml.XMLParser;
  import org.apache.tika.utils.ParseUtils;
  import org.apache.tika.utils.Utils;
+import org.apache.tika.sax.xpath.Matcher;
+import org.apache.tika.sax.xpath.MatchingContentHandler;
+import org.apache.tika.sax.xpath.XPathParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.xml.serialize.XMLSerializer;
+import org.apache.xml.serialize.OutputFormat;
  import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.ContentHandler;

  /**
   * Junit test class for Tika {@link Parser}s.
@@ -62,7 +74,59 @@
          tc = TikaConfig.getDefaultConfig();
      }

-    public void testPDFExtraction() throws Exception {
+  private static final XPathParser PARSER =
+          new XPathParser("xhtml", XHTMLContentHandler.XHTML);
+
+  public void testXML() throws Exception {
+    XMLParser parser = new XMLParser();
+    StringWriter writer = new StringWriter();
+    Metadata metadata = new Metadata();
+    ContentHandler contentHandler = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
+    parser.parse(new ByteArrayInputStream(sampleXML.getBytes(Charset.forName("UTF-8"))), contentHandler, metadata);
+    writer.close();
+    System.out.println("Val: " + writer.toString());
+
+
+
+    metadata = new Metadata();
+    writer = new StringWriter();
+    Matcher matcher = PARSER.parse("/xhtml:html/descendant:node()");
+    contentHandler = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
+    MatchingContentHandler parsingHandler = new MatchingContentHandler(contentHandler, matcher);
+    parser.parse(new ByteArrayInputStream(sampleXML.getBytes(Charset.forName("UTF-8"))), parsingHandler, metadata);
+    //parser.parse(new StringBufferInputStream(sampleXML), parsingHandler, metadata);
+    writer.close();
+    System.out.println("Val: " + writer.toString());
+  }
+
+
+
+  private static String sampleXML = "<document>\n" +
+          "  \n" +
+          "  <header>\n" +
+          "    <title>Solr Version Control System</title>\n" +
+          "  </header>\n" +
+          "  \n" +
+          "  <body>\n" +
+          "  \n" +
+          "    <section>\n" +
+          "      <title>Overview</title>\n" +
+          "      <p>\n" +
+          "        The Solr source code resides in the Apache <a href=\"http://subversion.tigris.org/\">Subversion (SVN)</a> repository.\n" +
+          "        The command-line SVN client can be obtained <a href=\"http://subversion.tigris.org/project_packages.html\">here</a> or as an optional package for <a href=\"http://www.cygwin.com/\">cygwin</a>.\n" +
+          "        The TortoiseSVN GUI client for Windows can be obtained <a href=\"http://tortoisesvn.tigris.org/\">here</a>. There\n" +
+          "        are also SVN plugins available for older versions of <a href=\"http://subclipse.tigris.org/\">Eclipse</a> and \n" +
+          "        <a href=\"http://svnup.tigris.org/\">IntelliJ IDEA</a> that don't have subversion support already included.\n" +
+          "      </p>\n" +
+          "    </section>\n" +
+          "    <p>Here is some more text.  It contains <a href=\"http://lucene.apache.org\">a link</a>. </p>\n" +
+          "    <p>Text Here</p>\n" +
+          "  </body>\n" +
+          "  \n" +
+          "</document>";
+
+
+  public void testPDFExtraction() throws Exception {
          File file = getTestFile("testPDF.pdf");
          String s1 = ParseUtils.getStringContent(file, tc);
          String s2 = ParseUtils.getStringContent(file, tc, "application/pdf");

Mime
View raw message