lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Shalin Shekhar Mangar <shalinman...@gmail.com>
Subject Re: Problems with DIH XPath flatten
Date Sun, 11 Oct 2009 08:48:39 GMT
On Wed, Oct 7, 2009 at 6:54 PM, Adam Foltzer <acfoltzer@gmail.com> wrote:

> Here's a sample:
>
> <?xml version="1.0" encoding="ISO-8859-1"?>
> <!DOCTYPE document [
> <!ENTITY nbsp "&#160;">
> <!ENTITY copy "&#169;">
> <!ENTITY reg "&#174;">
> ]>
> <document>
>  <kbml version="-//Indiana University//DTD KBML 0.9//EN">
>    <kbq>In Mac OS X, how do I enable or disable the firewall?</kbq>
>    <body>
> <p><kbh docid="aghe" access="allowed">Mac OS
> X<domain>all</domain><visibility>visible</visibility></kbh>
> includes
> an easy-to-use <kbh docid="aoru"
>
> access="allowed">firewall<domain>all</domain><visibility>visible</visibility></kbh>
> that
> can prevent potentially harmful incoming connections from other
> computers. To turn it on or off:</p>
>
>
> <h3>Mac OS X 10.6 (Snow Leopard)</h3>
>
> <ol><li>From the Apple menu, select <mi>System Preferences...†</mi>.
> When the <code>System Preferences</code> window appears, from the
> <mi>View</mi> menu, select <mi>Security</mi>.
>
> <br clear="none"/><br clear="none"/>
> </li><li>Click the <mi>Firewall</mi> tab.
>
> ...
>
> </li></ol>
> </body>
>    <xtra>
>      <term weight="0">macos</term>
>      <term weight="0">macintosh</term>
>      <term weight="0">apple</term>
>      <term weight="0">macosx</term>
>
> ...
>
>    </xtra>
>  </kbml>
>  <metadata>
>    <docid>aozg</docid>
>    <owner firstname="" lastname="Macintosh Support">scmac</owner>
>
> ...
>
>  </metadata>
> </document>
>
> The /document/kbml/kbq works fine, but as you can see, it has no
> children. The actual content of the document is within the body
> element, though, which requires some flattening.
>
>
Adam, I'm not able to reproduce your problem. I wrote a test case using your
XML and configuration, and it passes. Diff below:

Index: contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java
===================================================================
--- contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java	(revision 824015)
+++ contrib/dataimporthandler/src/test/java/org/apache/solr/handler/dataimport/TestXPathEntityProcessor.java	(working copy)
@@ -109,6 +109,85 @@
   }

   @Test
+  @SuppressWarnings("unchecked")
+  public void testFlatten() throws Exception {
+    String xml = "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n" +
+            "<!DOCTYPE document [\n" +
+            "<!ENTITY nbsp \"&#160;\">\n" +
+            "<!ENTITY copy \"&#169;\">\n" +
+            "<!ENTITY reg \"&#174;\">\n" +
+            "]>\n" +
+            "<document>\n" +
+            " <kbml version=\"-//Indiana University//DTD KBML 0.9//EN\">\n"
+
+            "   <kbq>In Mac OS X, how do I enable or disable the
firewall?</kbq>\n" +
+            "   <body>\n" +
+            "<p><kbh docid=\"aghe\" access=\"allowed\">Mac OS\n" +
+            "X<domain>all</domain><visibility>visible</visibility></kbh>
includes\n" +
+            "an easy-to-use <kbh docid=\"aoru\"\n" +
+
"access=\"allowed\">firewall<domain>all</domain><visibility>visible</visibility></kbh>\n"
+
+            "that\n" +
+            "can prevent potentially harmful incoming connections from
other\n" +
+            "computers. To turn it on or off:</p>\n" +
+            "\n" +
+            "\n" +
+            "<h3>Mac OS X 10.6 (Snow Leopard)</h3>\n" +
+            "\n" +
+            "<ol><li>From the Apple menu, select <mi>System
Preferences...†</mi>.\n" +
+            "When the <code>System Preferences</code> window appears, from
the\n" +
+            "<mi>View</mi> menu, select <mi>Security</mi>.\n" +
+            "\n" +
+            "<br clear=\"none\"/><br clear=\"none\"/>\n" +
+            "</li><li>Click the <mi>Firewall</mi> tab.\n" +
+            "\n" +
+            "...\n" +
+            "\n" +
+            "</li></ol>\n" +
+            "</body>\n" +
+            "   <xtra>\n" +
+            "     <term weight=\"0\">macos</term>\n" +
+            "     <term weight=\"0\">macintosh</term>\n" +
+            "     <term weight=\"0\">apple</term>\n" +
+            "     <term weight=\"0\">macosx</term>\n" +
+            "\n" +
+            "...\n" +
+            "\n" +
+            "   </xtra>\n" +
+            " </kbml>\n" +
+            " <metadata>\n" +
+            "   <docid>aozg</docid>\n" +
+            "   <owner firstname=\"\" lastname=\"Macintosh
Support\">scmac</owner>\n" +
+            "\n" +
+            "...\n" +
+            "\n" +
+            " </metadata>\n" +
+            "</document>";
+    Map entityAttrs = createMap("name", "kbxml", "url", "testdata.xml",
+            XPathEntityProcessor.FOR_EACH, "/document", "transformer",
"HTMLStripTransformer");
+    List fields = new ArrayList();
+    fields.add(createMap("column", "content", "xpath",
"/document/kbml/body" ,"flatten","true", "stripHTML", "true"));
+    fields.add(createMap("column", "title", "xpath",
"/document/kbml/kbq"));
+    Context c = AbstractDataImportHandlerTest.getContext(null,
+            new VariableResolverImpl(), getDataSource(xml),
Context.FULL_DUMP, fields, entityAttrs);
+    XPathEntityProcessor xPathEntityProcessor = new XPathEntityProcessor();
+    xPathEntityProcessor.init(c);
+    Map<String, Object> result = null;
+    while (true) {
+      Map<String, Object> row = xPathEntityProcessor.nextRow();
+      if (row == null)
+        break;
+      result = row;
+    }
+    System.out.println("result.get(\"content\") = " +
result.get("content"));
+    Assert.assertNotNull(result.get("content"));
+    Assert.assertTrue(result.get("content").toString().trim().length() >
0);
+    HTMLStripTransformer t = new HTMLStripTransformer();
+    t.transformRow(result, c);
+    System.out.println("result.get(\"content\") = " +
result.get("content"));
+    Assert.assertNotNull(result.get("content"));
+    Assert.assertTrue(result.get("content").toString().trim().length() >
0);
+  }
+
+  @Test
   public void withFieldsAndXpathStream() throws Exception {
     Map entityAttrs = createMap("name", "e", "url", "cd.xml",
         XPathEntityProcessor.FOR_EACH, "/catalog/cd", "stream", "true",
"batchSize","1");


-- 
Regards,
Shalin Shekhar Mangar.

Mime
View raw message