tika-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Uwe Schindler" <...@thetaphi.de>
Subject RE: [OT????] java.lang.IllegalStateException: NoWriterSupplied: No writer supplied for serializer.
Date Sat, 15 Nov 2008 19:34:48 GMT
Hi Grant,

Here is my first answer to a question for this project :-].

From the call stack and your code in the second test step (where you wrap
the serializer in a MatchingContentHandler with the XPath), I think the
following is the problem:
The MatchingContentHandler documentation states that it does not
pass startDocument/endDocument events to the delegate (maybe because these
events do not match the XPath). The serializer does not like this (because
it initializes the output writer in startDocument()). You can fix this by
manually calling serializer.startDocument() before and
serializer.endDocument() after serializing.

-----
Uwe Schindler
H.-H.-Meier-Allee 63, D-28213 Bremen
http://www.thetaphi.de
eMail: uwe@thetaphi.de

> -----Original Message-----
> From: Grant Ingersoll [mailto:gsingers@apache.org]
> Sent: Saturday, November 15, 2008 7:58 PM
> To: tika-dev@incubator.apache.org
> Subject: [OT????] java.lang.IllegalStateException: NoWriterSupplied: No
> writer supplied for serializer.
> 
> This may just show my lack of understanding of XPath, etc., but when I
> apply [1] to TestParsers, I get the following output:
> ----------------------------
> Val: <?xml version="1.0" encoding="UTF-8"?>
> <html xmlns="http://www.w3.org/1999/xhtml">
>      <head>
>          <title/>
>      </head>
>      <body>
>          <p>
> 
> 
>      Solr Version Control System
> 
> 
> 
> 
> 
>        Overview
> 
>          The Solr source code resides in the Apache Subversion (SVN)
> repository.
>          The command-line SVN client can be obtained here or as an
> optional package for cygwin.
>          The TortoiseSVN GUI client for Windows can be obtained here.
> There
>          are also SVN plugins available for older versions of Eclipse
> and
>          IntelliJ IDEA that don't have subversion support already
> included.
> 
> 
>      Here is some more text.  It contains a link.
>      Text Here
> 
> 
> </p>
>      </body>
> </html>
> 
> 
> java.lang.IllegalStateException: NoWriterSupplied: No writer supplied
> for serializer.
> 	at org.apache.xml.serialize.XMLSerializer.startElement(Unknown
> Source)
> 	at
> org
> .apache
> .tika
> .sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:
> 75)
> 	at
> org
> .apache
> .tika
> .sax
> .xpath.MatchingContentHandler.startElement(MatchingContentHandler.java:
> 62)
> 	at
> org
> .apache
> .tika
> .sax.ContentHandlerDecorator.startElement(ContentHandlerDecorator.java:
> 75)
> 	at
> org
> .apache
> .tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:111)
> 	at
> org
> .apache
> .tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:115)
> 	at
> org
> .apache
> .tika
> .sax.XHTMLContentHandler.lazyStartDocument(XHTMLContentHandler.java:77)
> 	at
> org
> .apache
> .tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:110)
> 	at
> org
> .apache
> .tika.sax.XHTMLContentHandler.startElement(XHTMLContentHandler.java:115)
> 	at org.apache.tika.parser.xml.XMLParser.parse(XMLParser.java:51)
> 	at org.apache.tika.TestParsers.testXML(TestParsers.java:96)
> 	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> 	at
> sun
> .reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:
> 39)
> 	at
> sun
> .reflect
> .DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:
> 25)
> 	at
> com.intellij.rt.execution.junit.JUnitStarter.main(JUnitStarter.java:40)
> -----------------------
> 
> Anyone have any insight as to why wrapping a XMLSerializer inside of a
> MatchingContentHandler is causing such a problem?
> 
> Thanks,
> Grant
> 
> 
> 
> [1]
> Index: src/test/java/org/apache/tika/TestParsers.java
> ===================================================================
> --- src/test/java/org/apache/tika/TestParsers.java      (revision
> 713397)
> +++ src/test/java/org/apache/tika/TestParsers.java      (working copy)
> @@ -19,16 +19,28 @@
>   import java.io.File;
>   import java.io.FileInputStream;
>   import java.io.InputStream;
> +import java.io.StringBufferInputStream;
> +import java.io.StringWriter;
> +import java.io.ByteArrayInputStream;
>   import java.util.List;
> +import java.nio.charset.Charset;
> 
>   import junit.framework.TestCase;
> 
>   import org.apache.tika.config.TikaConfig;
>   import org.apache.tika.metadata.Metadata;
>   import org.apache.tika.parser.Parser;
> +import org.apache.tika.parser.xml.XMLParser;
>   import org.apache.tika.utils.ParseUtils;
>   import org.apache.tika.utils.Utils;
> +import org.apache.tika.sax.xpath.Matcher;
> +import org.apache.tika.sax.xpath.MatchingContentHandler;
> +import org.apache.tika.sax.xpath.XPathParser;
> +import org.apache.tika.sax.XHTMLContentHandler;
> +import org.apache.xml.serialize.XMLSerializer;
> +import org.apache.xml.serialize.OutputFormat;
>   import org.xml.sax.helpers.DefaultHandler;
> +import org.xml.sax.ContentHandler;
> 
>   /**
>    * Junit test class for Tika {@link Parser}s.
> @@ -62,7 +74,59 @@
>           tc = TikaConfig.getDefaultConfig();
>       }
> 
> -    public void testPDFExtraction() throws Exception {
> +  private static final XPathParser PARSER =
> +          new XPathParser("xhtml", XHTMLContentHandler.XHTML);
> +
> +  public void testXML() throws Exception {
> +    XMLParser parser = new XMLParser();
> +    StringWriter writer = new StringWriter();
> +    Metadata metadata = new Metadata();
> +    ContentHandler contentHandler = new XMLSerializer(writer, new
> OutputFormat("XML", "UTF-8", true));
> +    parser.parse(new
> ByteArrayInputStream(sampleXML.getBytes(Charset.forName("UTF-8"))),
> contentHandler, metadata);
> +    writer.close();
> +    System.out.println("Val: " + writer.toString());
> +
> +
> +
> +    metadata = new Metadata();
> +    writer = new StringWriter();
> +    Matcher matcher = PARSER.parse("/xhtml:html/descendant:node()");
> +    contentHandler = new XMLSerializer(writer, new
> OutputFormat("XML", "UTF-8", true));
> +    MatchingContentHandler parsingHandler = new
> MatchingContentHandler(contentHandler, matcher);
> +    parser.parse(new
> ByteArrayInputStream(sampleXML.getBytes(Charset.forName("UTF-8"))),
> parsingHandler, metadata);
> +    //parser.parse(new StringBufferInputStream(sampleXML),
> parsingHandler, metadata);
> +    writer.close();
> +    System.out.println("Val: " + writer.toString());
> +  }
> +
> +
> +
> +  private static String sampleXML = "<document>\n" +
> +          "  \n" +
> +          "  <header>\n" +
> +          "    <title>Solr Version Control System</title>\n" +
> +          "  </header>\n" +
> +          "  \n" +
> +          "  <body>\n" +
> +          "  \n" +
> +          "    <section>\n" +
> +          "      <title>Overview</title>\n" +
> +          "      <p>\n" +
> +          "        The Solr source code resides in the Apache <a href=
> \"http://subversion.tigris.org/\">Subversion (SVN)</a> repository.\n" +
> +          "        The command-line SVN client can be obtained <a
> href=\"http://subversion.tigris.org/project_packages.html\">here</a>
> or as an optional package for <a href=\"http://www.cygwin.com/
> \">cygwin</a>.\n" +
> +          "        The TortoiseSVN GUI client for Windows can be
> obtained <a href=\"http://tortoisesvn.tigris.org/\">here</a>. There\n" +
> +          "        are also SVN plugins available for older versions
> of <a href=\"http://subclipse.tigris.org/\">Eclipse</a> and \n" +
> +          "        <a href=\"http://svnup.tigris.org/\">IntelliJ
> IDEA</a> that don't have subversion support already included.\n" +
> +          "      </p>\n" +
> +          "    </section>\n" +
> +          "    <p>Here is some more text.  It contains <a
> href=\"http://lucene.apache.org
> \">a link</a>. </p>\n" +
> +          "    <p>Text Here</p>\n" +
> +          "  </body>\n" +
> +          "  \n" +
> +          "</document>";
> +
> +
> +  public void testPDFExtraction() throws Exception {
>           File file = getTestFile("testPDF.pdf");
>           String s1 = ParseUtils.getStringContent(file, tc);
>           String s2 = ParseUtils.getStringContent(file, tc,
> "application/pdf");


Mime
View raw message