lucenenet-commits mailing list archives

From d...@apache.org
Subject [Lucene.Net] svn commit: r1147679 [1/2] - in /incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers: ./ Filters/ Miscellaneous/ Properties/ Shingle/
Date Sun, 17 Jul 2011 16:32:30 GMT
Author: digy
Date: Sun Jul 17 16:32:29 2011
New Revision: 1147679

URL: http://svn.apache.org/viewvc?rev=1147679&view=rev
Log:
[LUCENENET-437] for 2.9.4g

Added:
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/TestShingleMatrixFilter.cs
Removed:
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Filters/
Modified:
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
    incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1147679&r1=1147678&r2=1147679&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Sun Jul 17 16:32:29 2011
@@ -8,7 +8,7 @@
     <ProjectGuid>{67D27628-F1D5-4499-9818-B669731925C8}</ProjectGuid>
     <OutputType>Library</OutputType>
     <AppDesignerFolder>Properties</AppDesignerFolder>
-    <RootNamespace>Lucene.Net.Analyzers</RootNamespace>
+    <RootNamespace>Lucene.Net.Analysis</RootNamespace>
     <AssemblyName>Lucene.Net.Analyzers.Test</AssemblyName>
     <TargetFrameworkVersion>v4.0</TargetFrameworkVersion>
     <FileAlignment>512</FileAlignment>
@@ -36,7 +36,7 @@
     <DebugSymbols>true</DebugSymbols>
     <DebugType>full</DebugType>
     <Optimize>false</Optimize>
-    <OutputPath>..\..\..\bin\contrib\Analyzers\Debug\</OutputPath>
+    <OutputPath>..\..\..\bin\contrib\Analyzers\</OutputPath>
     <DefineConstants>DEBUG;TRACE</DefineConstants>
     <ErrorReport>prompt</ErrorReport>
     <WarningLevel>4</WarningLevel>
@@ -59,12 +59,17 @@
     <Compile Include="AR\TestArabicAnalyzer.cs" />
     <Compile Include="AR\TestArabicNormalizationFilter.cs" />
     <Compile Include="AR\TestArabicStemFilter.cs" />
-    <Compile Include="Filters\ChainedFilterTest.cs" />
+    <Compile Include="Miscellaneous\ChainedFilterTest.cs" />
+    <Compile Include="Miscellaneous\TestPrefixAndSuffixAwareTokenFilter.cs" />
+    <Compile Include="Miscellaneous\TestPrefixAwareTokenFilter.cs" />
     <Compile Include="NGram\TestEdgeNGramTokenFilter.cs" />
     <Compile Include="NGram\TestEdgeNGramTokenizer.cs" />
     <Compile Include="NGram\TestNGramTokenFilter.cs" />
     <Compile Include="NGram\TestNGramTokenizer.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="Shingle\ShingleAnalyzerWrapperTest.cs" />
+    <Compile Include="Shingle\ShingleFilterTest.cs" />
+    <Compile Include="Shingle\TestShingleMatrixFilter.cs" />
   </ItemGroup>
   <ItemGroup>
     <ProjectReference Include="..\..\..\src\contrib\Analyzers\Contrib.Analyzers.csproj">

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/ChainedFilterTest.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Analysis;
+using Lucene.Net.Util;
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    public class ChainedFilterTest : Lucene.Net.TestCase
+    {
+        public static int MAX = 500;
+
+        private RAMDirectory directory;
+        private IndexSearcher searcher;
+        private Query query;
+        // private DateFilter dateFilter;   DateFilter was deprecated and removed
+        private TermRangeFilter dateFilter;
+        private QueryWrapperFilter bobFilter;
+        private QueryWrapperFilter sueFilter;
+
+        [SetUp]
+        public void SetUp()
+        {
+            directory = new RAMDirectory();
+            IndexWriter writer =
+               new IndexWriter(directory, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+
+            DateTime cal = new DateTime(1970, 1, 1).AddMilliseconds(1041397200000L); // 2003 January 01 (Unix epoch millis)
+
+            for (int i = 0; i < MAX; i++)
+            {
+                Document doc = new Document();
+                doc.Add(new Field("key", "" + (i + 1), Field.Store.YES, Field.Index.NOT_ANALYZED));
+                doc.Add(new Field("owner", (i < MAX / 2) ? "bob" : "sue", Field.Store.YES, Field.Index.NOT_ANALYZED));
+                doc.Add(new Field("date", (cal.Ticks / TimeSpan.TicksPerMillisecond).ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
+                writer.AddDocument(doc);
+
+                cal = cal.AddMilliseconds(1); // DateTime is immutable, so keep the returned value
+            }
+
+            writer.Close();
+
+            searcher = new IndexSearcher(directory, true);
+
+            // query for everything to make life easier
+            BooleanQuery bq = new BooleanQuery();
+            bq.Add(new TermQuery(new Term("owner", "bob")), BooleanClause.Occur.SHOULD);
+            bq.Add(new TermQuery(new Term("owner", "sue")), BooleanClause.Occur.SHOULD);
+            query = bq;
+
+            // date filter matches everything too
+            //Date pastTheEnd = parseDate("2099 Jan 1");
+            // dateFilter = DateFilter.Before("date", pastTheEnd);
+            // just treat dates as strings and select the whole range for now...
+            dateFilter = new TermRangeFilter("date", "", "ZZZZ", true, true);
+
+            bobFilter = new QueryWrapperFilter(
+                new TermQuery(new Term("owner", "bob")));
+            sueFilter = new QueryWrapperFilter(
+                new TermQuery(new Term("owner", "sue")));
+        }
+
+        private ChainedFilter GetChainedFilter(Filter[] chain, ChainedFilter.Logic[] logic)
+        {
+            if (logic == null)
+            {
+                return new ChainedFilter(chain);
+            }
+            else
+            {
+                return new ChainedFilter(chain, logic);
+            }
+        }
+
+        private ChainedFilter GetChainedFilter(Filter[] chain, ChainedFilter.Logic logic)
+        {
+            return new ChainedFilter(chain, logic);
+        }
+
+
+        [Test]
+        public void TestSingleFilter()
+        {
+            ChainedFilter chain = GetChainedFilter(new Filter[] { dateFilter }, null);
+
+            int numHits = searcher.Search(query, chain, 1000).TotalHits;
+            Assert.AreEqual(MAX, numHits);
+
+            chain = new ChainedFilter(new Filter[] { bobFilter });
+            numHits = searcher.Search(query, chain, 1000).TotalHits;
+            Assert.AreEqual(MAX / 2, numHits);
+
+            chain = GetChainedFilter(new Filter[] { bobFilter }, new ChainedFilter.Logic[] { ChainedFilter.Logic.AND });
+            TopDocs hits = searcher.Search(query, chain, 1000);
+            numHits = hits.TotalHits;
+            Assert.AreEqual(MAX / 2, numHits);
+            Assert.AreEqual("bob", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+
+            chain = GetChainedFilter(new Filter[] { bobFilter }, new ChainedFilter.Logic[] { ChainedFilter.Logic.ANDNOT });
+            hits = searcher.Search(query, chain, 1000);
+            numHits = hits.TotalHits;
+            Assert.AreEqual(MAX / 2, numHits);
+            Assert.AreEqual("sue", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+        }
+
+        [Test]
+        public void TestOR()
+        {
+            ChainedFilter chain = GetChainedFilter(
+              new Filter[] { sueFilter, bobFilter }, null);
+
+            int numHits = searcher.Search(query, chain, 1000).TotalHits;
+            Assert.AreEqual(MAX, numHits, "OR matches all");
+        }
+
+        [Test]
+        public void TestAND()
+        {
+            ChainedFilter chain = GetChainedFilter(
+              new Filter[] { dateFilter, bobFilter }, ChainedFilter.Logic.AND);
+
+            TopDocs hits = searcher.Search(query, chain, 1000);
+            Assert.AreEqual(MAX / 2, hits.TotalHits, "AND matches just bob");
+            Assert.AreEqual("bob", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+        }
+
+        [Test]
+        public void TestXOR()
+        {
+            ChainedFilter chain = GetChainedFilter(
+              new Filter[] { dateFilter, bobFilter }, ChainedFilter.Logic.XOR);
+
+            TopDocs hits = searcher.Search(query, chain, 1000);
+            Assert.AreEqual(MAX / 2, hits.TotalHits, "XOR matches sue");
+            Assert.AreEqual("sue", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+        }
+
+        [Test]
+        public void TestANDNOT()
+        {
+            ChainedFilter chain = GetChainedFilter(
+              new Filter[] { dateFilter, sueFilter },
+                new ChainedFilter.Logic[] { ChainedFilter.Logic.AND, ChainedFilter.Logic.ANDNOT });
+
+            TopDocs hits = searcher.Search(query, chain, 1000);
+            Assert.AreEqual(MAX / 2, hits.TotalHits, "ANDNOT matches just bob");
+            Assert.AreEqual("bob", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+
+            chain = GetChainedFilter(
+                new Filter[] { bobFilter, bobFilter },
+                  new ChainedFilter.Logic[] { ChainedFilter.Logic.ANDNOT, ChainedFilter.Logic.ANDNOT });
+
+            hits = searcher.Search(query, chain, 1000);
+            Assert.AreEqual(MAX / 2, hits.TotalHits, "ANDNOT bob ANDNOT bob matches all sues");
+            Assert.AreEqual("sue", searcher.Doc(hits.ScoreDocs[0].doc).Get("owner"));
+        }
+
+        /*
+        private Date parseDate(String s) throws ParseException {
+          return new SimpleDateFormat("yyyy MMM dd", Locale.US).parse(s);
+        }
+        */
+
+        [Test]
+        public void TestWithCachingFilter()
+        {
+            Directory dir = new RAMDirectory();
+            Analyzer analyzer = new WhitespaceAnalyzer();
+
+            IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
+            writer.Close();
+
+            Searcher searcher = new IndexSearcher(dir, true);
+
+            Query query = new TermQuery(new Term("none", "none"));
+
+            QueryWrapperFilter queryFilter = new QueryWrapperFilter(query);
+            CachingWrapperFilter cachingFilter = new CachingWrapperFilter(queryFilter);
+
+            searcher.Search(query, cachingFilter, 1);
+
+            CachingWrapperFilter cachingFilter2 = new CachingWrapperFilter(queryFilter);
+            Filter[] chain = new Filter[2];
+            chain[0] = cachingFilter;
+            chain[1] = cachingFilter2;
+            ChainedFilter cf = new ChainedFilter(chain);
+
+            // In the original Java test this threw java.lang.ClassCastException:
+            // org.apache.lucene.util.OpenBitSet cannot be cast to java.util.BitSet
+            searcher.Search(new MatchAllDocsQuery(), cf, 1);
+        }
+
+    }
+}
\ No newline at end of file
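
A note for readers skimming this commit: the contrib ChainedFilter combines an
array of filters left to right, optionally with one ChainedFilter.Logic value
per link; omitting the logic array (as in TestOR above) makes every link an OR.
Below is a minimal sketch using only the types and calls exercised in the test,
assuming an IndexSearcher named searcher like the one built in SetUp
(an illustration, not part of the commit):

    // Keep documents in the date range AND NOT owned by "sue".
    Filter dateFilter = new TermRangeFilter("date", "", "ZZZZ", true, true);
    Filter sueFilter = new QueryWrapperFilter(new TermQuery(new Term("owner", "sue")));

    ChainedFilter chain = new ChainedFilter(
        new Filter[] { dateFilter, sueFilter },
        new ChainedFilter.Logic[] { ChainedFilter.Logic.AND, ChainedFilter.Logic.ANDNOT });

    // The filter does the selection; MatchAllDocsQuery merely enumerates candidates.
    TopDocs hits = searcher.Search(new MatchAllDocsQuery(), chain, 1000);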

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAndSuffixAwareTokenFilter.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    public class TestPrefixAndSuffixAwareTokenFilter : BaseTokenStreamTestCase
+    {
+        [Test]
+        public void TestTokenStreamContents()
+        {
+            var ts = new PrefixAndSuffixAwareTokenFilter(
+                new SingleTokenTokenStream(CreateToken("^", 0, 0)),
+                new WhitespaceTokenizer(new StringReader("hello world")),
+                new SingleTokenTokenStream(CreateToken("$", 0, 0)));
+
+            AssertTokenStreamContents(ts,
+                                      new[] {"^", "hello", "world", "$"}, // expected terms
+                                      new[] {0, 0, 6, 11},                // expected start offsets
+                                      new[] {0, 5, 11, 11});              // expected end offsets
+        }
+
+        // Builds a Token with the given term text; start and offset are the start and end character offsets.
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var token = new Token(start, offset);
+            token.SetTermBuffer(term);
+            return token;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Miscellaneous/TestPrefixAwareTokenFilter.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Miscellaneous
+{
+    public class TestPrefixAwareTokenFilter : BaseTokenStreamTestCase
+    {
+        [Test]
+        public void TestTokenStreamContents()
+        {
+            var ts = new PrefixAwareTokenFilter(
+                new SingleTokenTokenStream(CreateToken("a", 0, 1)),
+                new SingleTokenTokenStream(CreateToken("b", 0, 1)));
+
+            AssertTokenStreamContents(ts,
+                                      new[] {"a", "b"}, // expected terms
+                                      new[] {0, 1},     // expected start offsets
+                                      new[] {1, 2});    // expected end offsets
+
+            // prefix and suffix using 2x prefix
+
+            ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(CreateToken("^", 0, 0)),
+                                            new WhitespaceTokenizer(new StringReader("hello world")));
+            ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(CreateToken("$", 0, 0)));
+
+            AssertTokenStreamContents(ts,
+                                      new[] {"^", "hello", "world", "$"},
+                                      new[] {0, 0, 6, 11},
+                                      new[] {0, 5, 11, 11});
+        }
+
+        // Builds a Token with the given term text; start and offset are the start and end character offsets.
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var token = new Token(start, offset);
+            token.SetTermBuffer(term);
+            return token;
+        }
+    }
+}
\ No newline at end of file

Modified: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs?rev=1147679&r1=1147678&r2=1147679&view=diff
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs (original)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Properties/AssemblyInfo.cs Sun Jul 17 16:32:29 2011
@@ -32,5 +32,5 @@ using System.Runtime.InteropServices;
 // You can specify all the values or you can default the Build and Revision Numbers 
 // by using the '*' as shown below:
 // [assembly: AssemblyVersion("1.0.*")]
-[assembly: AssemblyVersion("2.9.2.1")]
-[assembly: AssemblyFileVersion("2.9.2.1")]
+[assembly: AssemblyVersion("2.9.4.2")]
+[assembly: AssemblyFileVersion("2.9.4.2")]

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleAnalyzerWrapperTest.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,293 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.QueryParsers;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using NUnit.Framework;
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Analysis.Shingle
+{
+    /// <summary>
+    /// A test class for ShingleAnalyzerWrapper as regards queries and scoring.
+    /// </summary>
+    public class ShingleAnalyzerWrapperTest : BaseTokenStreamTestCase
+    {
+        public IndexSearcher Searcher;
+
+        /// <summary>
+        /// Set up a new index in RAM with three test phrases and the supplied Analyzer.
+        /// </summary>
+        /// <param name="analyzer">the analyzer to use</param>
+        /// <returns>an indexSearcher on the test index.</returns>
+        public IndexSearcher SetUpSearcher(Analyzer analyzer)
+        {
+            Directory dir = new RAMDirectory();
+            var writer = new IndexWriter(dir, analyzer, true);
+
+            var doc = new Document();
+            doc.Add(new Field("content", "please divide this sentence into shingles",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            doc = new Document();
+            doc.Add(new Field("content", "just another test sentence",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            doc = new Document();
+            doc.Add(new Field("content", "a sentence which contains no test",
+                              Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+
+            writer.Close();
+
+            return new IndexSearcher(dir);
+        }
+
+        protected Hits QueryParsingTest(Analyzer analyzer, String qs)
+        {
+            Searcher = SetUpSearcher(analyzer);
+
+            var qp = new QueryParser("content", analyzer);
+
+            var q = qp.Parse(qs);
+
+            return Searcher.Search(q);
+        }
+
+        protected void CompareRanks(Hits hits, int[] ranks)
+        {
+            Assert.AreEqual(ranks.Length, hits.Length());
+            for (int i = 0; i < ranks.Length; i++)
+            {
+                Assert.AreEqual(ranks[i], hits.Id(i));
+            }
+        }
+
+        /// <summary>
+        /// Will not work on an index without unigrams, since QueryParser automatically tokenizes on whitespace.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper (new WhitespaceAnalyzer(), 2), "test sentence");
+            var ranks = new[] {1, 2, 0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This one fails with an exception.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQueryParsingFails()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper (new WhitespaceAnalyzer(), 2), "\"this sentence\"");
+            var ranks = new[] {0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This one works, actually.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper
+                                             (new WhitespaceAnalyzer(), 2),
+                                         "\"test sentence\"");
+            var ranks = new[] {1};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// Same as above, but the query is tokenized without using the analyzer.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperRequiredQueryParsing()
+        {
+            var hits = QueryParsingTest(new ShingleAnalyzerWrapper
+                                             (new WhitespaceAnalyzer(), 2),
+                                         "+test +sentence");
+            var ranks = new[] {1, 2};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// This shows how to construct a phrase query containing shingles.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperPhraseQuery()
+        {
+            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            Searcher = SetUpSearcher(analyzer);
+
+            var q = new PhraseQuery();
+
+            var ts = analyzer.TokenStream("content", new StringReader("this sentence"));
+            var j = -1;
+
+            var posIncrAtt = (PositionIncrementAttribute) ts.AddAttribute(typeof (PositionIncrementAttribute));
+            var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+
+            while (ts.IncrementToken())
+            {
+                j += posIncrAtt.GetPositionIncrement();
+                var termText = termAtt.Term();
+                q.Add(new Term("content", termText), j);
+            }
+
+            var hits = Searcher.Search(q);
+            var ranks = new[] {0};
+            CompareRanks(hits, ranks);
+        }
+
+        /// <summary>
+        /// How to construct a boolean query with shingles. A query like this will
+        /// implicitly score documents higher when they contain the query words
+        /// in the right order and adjacent to each other.
+        /// </summary>
+        [Test]
+        public void TestShingleAnalyzerWrapperBooleanQuery()
+        {
+            Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            Searcher = SetUpSearcher(analyzer);
+
+            var q = new BooleanQuery();
+
+            var ts = analyzer.TokenStream("content", new StringReader("test sentence"));
+
+            var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
+
+            while (ts.IncrementToken())
+            {
+                var termText = termAtt.Term();
+                q.Add(new TermQuery(new Term("content", termText)),
+                      BooleanClause.Occur.SHOULD);
+            }
+
+            var hits = Searcher.Search(q);
+            var ranks = new[] {1, 2, 0};
+            CompareRanks(hits, ranks);
+        }
+
+        [Test]
+        public void TestReusableTokenStream()
+        {
+            Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
+            AssertAnalyzesToReuse(a, "please divide into shingles",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles",
+                                          "shingles"
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 27, 27},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "divide me up again",
+                                  new[] {"divide", "divide me", "me", "me up", "up", "up again", "again"},
+                                  new[] {0, 0, 7, 7, 10, 10, 13},
+                                  new[] {6, 9, 9, 12, 12, 18, 18},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+        }
+
+        /// <summary>
+        /// Tests a subclass that acts just like a whitespace analyzer (LUCENE-1678 backwards compatibility).
+        /// </summary>
+        [Test]
+        public void TestLucene1678BwComp()
+        {
+            Analyzer a = new ShingleWrapperSubclassAnalyzer();
+            AssertAnalyzesToReuse(a, "this is a test",
+                                  new[] {"this", "is", "a", "test"},
+                                  new[] {0, 5, 8, 10},
+                                  new[] {4, 7, 9, 14});
+        }
+
+        /// <summary>
+        /// Uses an analyzer that does not support reuse: it is a LetterTokenizer on odd invocations and a WhitespaceTokenizer on even ones.
+        /// </summary>
+        [Test]
+        public void TestWrappedAnalyzerDoesNotReuse()
+        {
+            Analyzer a = new ShingleAnalyzerWrapper(new NonreusableAnalyzer());
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles",
+                                          "shingles"
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 27, 27},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles.",
+                                          "shingles."
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 28, 28},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+            AssertAnalyzesToReuse(a, "please divide into shingles.",
+                                  new[]
+                                      {
+                                          "please", "please divide", "divide", "divide into", "into", "into shingles",
+                                          "shingles"
+                                      },
+                                  new[] {0, 0, 7, 7, 14, 14, 19},
+                                  new[] {6, 13, 13, 18, 18, 27, 27},
+                                  new[] {1, 0, 1, 0, 1, 0, 1});
+        }
+
+        #region Nested type: NonreusableAnalyzer
+
+        private class NonreusableAnalyzer : Analyzer
+        {
+            private int _invocationCount;
+
+            public override TokenStream TokenStream(String fieldName, TextReader reader)
+            {
+                if (++_invocationCount%2 == 0)
+                    return new WhitespaceTokenizer(reader);
+
+                return new LetterTokenizer(reader);
+            }
+        }
+
+        #endregion
+
+        #region Nested type: ShingleWrapperSubclassAnalyzer
+
+        private class ShingleWrapperSubclassAnalyzer : ShingleAnalyzerWrapper
+        {
+            public override TokenStream TokenStream(String fieldName, TextReader reader)
+            {
+                return new WhitespaceTokenizer(reader);
+            }
+        }
+
+        #endregion
+    }
+}
\ No newline at end of file
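
Before the ShingleFilter tests below, it may help to see the shape of a bi-gram
shingle stream directly. Here is a minimal sketch using only the contrib API
already shown in ShingleAnalyzerWrapperTest above (namespaces assumed from the
test files; an illustration, not part of the commit):

    using System;
    using System.IO;
    using Lucene.Net.Analysis;
    using Lucene.Net.Analysis.Shingle;
    using Lucene.Net.Analysis.Tokenattributes;

    class ShingleDemo
    {
        static void Main()
        {
            Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 2);
            TokenStream ts = a.TokenStream("content", new StringReader("please divide into shingles"));

            var termAtt = (TermAttribute) ts.AddAttribute(typeof (TermAttribute));
            var posIncrAtt = (PositionIncrementAttribute) ts.AddAttribute(typeof (PositionIncrementAttribute));

            // Prints: please(1), please divide(0), divide(1), divide into(0),
            //         into(1), into shingles(0), shingles(1).
            // A shingle shares its starting position with the unigram it begins
            // at, hence the position increment of 0.
            while (ts.IncrementToken())
                Console.WriteLine("{0} ({1})", termAtt.Term(), posIncrAtt.GetPositionIncrement());
        }
    }

The expected-token tables in ShingleFilterTest encode exactly this pattern,
together with start/end offsets and token types ("word" vs. "shingle").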

Added: incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs?rev=1147679&view=auto
==============================================================================
--- incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs (added)
+++ incubator/lucene.net/branches/Lucene.Net_2_9_4g/test/contrib/Analyzers/Shingle/ShingleFilterTest.cs Sun Jul 17 16:32:29 2011
@@ -0,0 +1,530 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.Shingle
+{
+    public class ShingleFilterTests : BaseTokenStreamTestCase
+    {
+        public static readonly Token[] TestToken = new[]
+                                                       {
+                                                           CreateToken("please", 0, 6),
+                                                           CreateToken("divide", 7, 13),
+                                                           CreateToken("this", 14, 18),
+                                                           CreateToken("sentence", 19, 27),
+                                                           CreateToken("into", 28, 32),
+                                                           CreateToken("shingles", 33, 39),
+                                                       };
+
+        public static Token[] TestTokenWithHoles;
+
+        public static readonly Token[] BiGramTokens = new[]
+                                                          {
+                                                              CreateToken("please", 0, 6),
+                                                              CreateToken("please divide", 0, 13),
+                                                              CreateToken("divide", 7, 13),
+                                                              CreateToken("divide this", 7, 18),
+                                                              CreateToken("this", 14, 18),
+                                                              CreateToken("this sentence", 14, 27),
+                                                              CreateToken("sentence", 19, 27),
+                                                              CreateToken("sentence into", 19, 32),
+                                                              CreateToken("into", 28, 32),
+                                                              CreateToken("into shingles", 28, 39),
+                                                              CreateToken("shingles", 33, 39),
+                                                          };
+
+        public static readonly int[] BiGramPositionIncrements = new[]
+                                                                    {
+                                                                        1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+                                                                    };
+
+        public static readonly String[] BiGramTypes = new[]
+                                                          {
+                                                              "word", "shingle", "word", "shingle", "word", "shingle",
+                                                              "word",
+                                                              "shingle", "word", "shingle", "word"
+                                                          };
+
+        public static readonly Token[] BiGramTokensWithHoles = new[]
+                                                                   {
+                                                                       CreateToken("please", 0, 6),
+                                                                       CreateToken("please divide", 0, 13),
+                                                                       CreateToken("divide", 7, 13),
+                                                                       CreateToken("divide _", 7, 19),
+                                                                       CreateToken("_", 19, 19),
+                                                                       CreateToken("_ sentence", 19, 27),
+                                                                       CreateToken("sentence", 19, 27),
+                                                                       CreateToken("sentence _", 19, 33),
+                                                                       CreateToken("_", 33, 33),
+                                                                       CreateToken("_ shingles", 33, 39),
+                                                                       CreateToken("shingles", 33, 39),
+                                                                   };
+
+        public static readonly int[] BiGramPositionIncrementsWithHoles = new[]
+                                                                             {
+                                                                                 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+                                                                             };
+
+        public static readonly Token[] BiGramTokensWithoutUnigrams = new[]
+                                                                         {
+                                                                             CreateToken("please divide", 0, 13),
+                                                                             CreateToken("divide this", 7, 18),
+                                                                             CreateToken("this sentence", 14, 27),
+                                                                             CreateToken("sentence into", 19, 32),
+                                                                             CreateToken("into shingles", 28, 39),
+                                                                         };
+
+        public static readonly int[] BiGramPositionIncrementsWithoutUnigrams = new[]
+                                                                                   {
+                                                                                       1, 1, 1, 1, 1
+                                                                                   };
+
+        public static readonly String[] BiGramTypesWithoutUnigrams = new[]
+                                                                         {
+                                                                             "shingle", "shingle", "shingle",
+                                                                             "shingle", "shingle"
+                                                                         };
+
+        public static readonly Token[] BiGramTokensWithHolesWithoutUnigrams = new[]
+                                                                                  {
+                                                                                      CreateToken(
+                                                                                          "please divide", 0, 13),
+                                                                                      CreateToken("divide _", 7,
+                                                                                                  19),
+                                                                                      CreateToken("_ sentence", 19,
+                                                                                                  27),
+                                                                                      CreateToken("sentence _", 19,
+                                                                                                  33),
+                                                                                      CreateToken("_ shingles", 33,
+                                                                                                  39),
+                                                                                  };
+
+        public static readonly int[] BiGramPositionIncrementsWithHolesWithoutUnigrams = new[]
+                                                                                            {
+                                                                                                1, 1, 1, 1, 1, 1
+                                                                                            };
+
+
+        public static readonly Token[] TestSingleToken = new[] { CreateToken("please", 0, 6) };
+
+        public static readonly Token[] SingleToken = new[] { CreateToken("please", 0, 6) };
+
+        public static readonly int[] SingleTokenIncrements = new[] { 1 };
+
+        public static readonly String[] SingleTokenTypes = new[] { "word" };
+
+        public static readonly Token[] EmptyTokenArray = new Token[] { };
+
+        public static readonly int[] EmptyTokenIncrementsArray = new int[] { };
+
+        public static readonly String[] EmptyTokenTypesArray = new String[] { };
+
+        public static readonly Token[] TriGramTokens = new[]
+                                                           {
+                                                               CreateToken("please", 0, 6),
+                                                               CreateToken("please divide", 0, 13),
+                                                               CreateToken("please divide this", 0, 18),
+                                                               CreateToken("divide", 7, 13),
+                                                               CreateToken("divide this", 7, 18),
+                                                               CreateToken("divide this sentence", 7, 27),
+                                                               CreateToken("this", 14, 18),
+                                                               CreateToken("this sentence", 14, 27),
+                                                               CreateToken("this sentence into", 14, 32),
+                                                               CreateToken("sentence", 19, 27),
+                                                               CreateToken("sentence into", 19, 32),
+                                                               CreateToken("sentence into shingles", 19, 39),
+                                                               CreateToken("into", 28, 32),
+                                                               CreateToken("into shingles", 28, 39),
+                                                               CreateToken("shingles", 33, 39)
+                                                           };
+
+        public static readonly int[] TriGramPositionIncrements = new[]
+                                                                     {
+                                                                         1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+                                                                     };
+
+        public static readonly String[] TriGramTypes = new[]
+                                                           {
+                                                               "word", "shingle", "shingle",
+                                                               "word", "shingle", "shingle",
+                                                               "word", "shingle", "shingle",
+                                                               "word", "shingle", "shingle",
+                                                               "word", "shingle",
+                                                               "word"
+                                                           };
+
+        public static readonly Token[] TriGramTokensWithoutUnigrams = new[]
+                                                                          {
+                                                                              CreateToken("please divide", 0, 13),
+                                                                              CreateToken("please divide this", 0,
+                                                                                          18),
+                                                                              CreateToken("divide this", 7, 18),
+                                                                              CreateToken("divide this sentence", 7,
+                                                                                          27),
+                                                                              CreateToken("this sentence", 14, 27),
+                                                                              CreateToken("this sentence into", 14,
+                                                                                          32),
+                                                                              CreateToken("sentence into", 19, 32),
+                                                                              CreateToken("sentence into shingles",
+                                                                                          19, 39),
+                                                                              CreateToken("into shingles", 28, 39),
+                                                                          };
+
+        public static readonly int[] TriGramPositionIncrementsWithoutUnigrams = new[]
+                                                                                    {
+                                                                                        1, 0, 1, 0, 1, 0, 1, 0, 1
+                                                                                    };
+
+        public static readonly String[] TriGramTypesWithoutUnigrams = new[]
+                                                                          {
+                                                                              "shingle", "shingle",
+                                                                              "shingle", "shingle",
+                                                                              "shingle", "shingle",
+                                                                              "shingle", "shingle",
+                                                                              "shingle",
+                                                                          };
+
+        public static readonly Token[] FourGramTokens = new[]
+                                                            {
+                                                                CreateToken("please", 0, 6),
+                                                                CreateToken("please divide", 0, 13),
+                                                                CreateToken("please divide this", 0, 18),
+                                                                CreateToken("please divide this sentence", 0, 27),
+                                                                CreateToken("divide", 7, 13),
+                                                                CreateToken("divide this", 7, 18),
+                                                                CreateToken("divide this sentence", 7, 27),
+                                                                CreateToken("divide this sentence into", 7, 32),
+                                                                CreateToken("this", 14, 18),
+                                                                CreateToken("this sentence", 14, 27),
+                                                                CreateToken("this sentence into", 14, 32),
+                                                                CreateToken("this sentence into shingles", 14, 39),
+                                                                CreateToken("sentence", 19, 27),
+                                                                CreateToken("sentence into", 19, 32),
+                                                                CreateToken("sentence into shingles", 19, 39),
+                                                                CreateToken("into", 28, 32),
+                                                                CreateToken("into shingles", 28, 39),
+                                                                CreateToken("shingles", 33, 39)
+                                                            };
+
+        public static readonly int[] FourGramPositionIncrements = new[]
+                                                                      {
+                                                                          1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0
+                                                                          , 1, 0, 1
+                                                                      };
+
+        public static readonly String[] FourGramTypes = new[]
+                                                            {
+                                                                "word", "shingle", "shingle", "shingle",
+                                                                "word", "shingle", "shingle", "shingle",
+                                                                "word", "shingle", "shingle", "shingle",
+                                                                "word", "shingle", "shingle",
+                                                                "word", "shingle",
+                                                                "word"
+                                                            };
+
+        public static readonly Token[] FourGramTokensWithoutUnigrams = new[]
+                                                                           {
+                                                                               CreateToken("please divide", 0, 13),
+                                                                               CreateToken("please divide this", 0,
+                                                                                           18),
+                                                                               CreateToken(
+                                                                                   "please divide this sentence", 0,
+                                                                                   27),
+                                                                               CreateToken("divide this", 7, 18),
+                                                                               CreateToken("divide this sentence", 7,
+                                                                                           27),
+                                                                               CreateToken(
+                                                                                   "divide this sentence into", 7,
+                                                                                   32),
+                                                                               CreateToken("this sentence", 14, 27),
+                                                                               CreateToken("this sentence into", 14,
+                                                                                           32),
+                                                                               CreateToken(
+                                                                                   "this sentence into shingles", 14,
+                                                                                   39),
+                                                                               CreateToken("sentence into", 19, 32),
+                                                                               CreateToken(
+                                                                                   "sentence into shingles", 19, 39)
+                                                                               ,
+                                                                               CreateToken("into shingles", 28, 39),
+                                                                           };
+
+        public static readonly int[] FourGramPositionIncrementsWithoutUnigrams = new[]
+                                                                                     {
+                                                                                         1, 0, 0, 1, 0, 0, 1, 0, 0,
+                                                                                         1, 0, 1
+                                                                                     };
+
+        public static readonly String[] FourGramTypesWithoutUnigrams = new[]
+                                                                           {
+                                                                               "shingle", "shingle",
+                                                                               "shingle", "shingle",
+                                                                               "shingle", "shingle",
+                                                                               "shingle", "shingle",
+                                                                               "shingle", "shingle",
+                                                                               "shingle", "shingle",
+                                                                           };
+
+        // Builds a Token with the given term text; start and offset are the start and end character offsets.
+        private static Token CreateToken(String term, int start, int offset)
+        {
+            var token = new Token(start, offset);
+            token.SetTermBuffer(term);
+            return token;
+        }
+
+        [SetUp]
+        public override void SetUp()
+        {
+            base.SetUp();
+            TestTokenWithHoles = new[]
+                                     {
+                                         CreateToken("please", 0, 6),
+                                         CreateToken("divide", 7, 13),
+                                         CreateToken("sentence", 19, 27),
+                                         CreateToken("shingles", 33, 39),
+                                     };
+
+            TestTokenWithHoles[2].SetPositionIncrement(2);
+            TestTokenWithHoles[3].SetPositionIncrement(2);
+        }
+
+
+        /// <summary>
+        /// Class under test for void ShingleFilter(TokenStream, int)
+        /// </summary>
+        [Test]
+        public void TestBiGramFilter()
+        {
+            ShingleFilterTest(2, TestToken, BiGramTokens,
+                              BiGramPositionIncrements, BiGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithHoles()
+        {
+            ShingleFilterTest(2, TestTokenWithHoles, BiGramTokensWithHoles,
+                              BiGramPositionIncrements, BiGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithoutUnigrams()
+        {
+            ShingleFilterTest(2, TestToken, BiGramTokensWithoutUnigrams,
+                              BiGramPositionIncrementsWithoutUnigrams, BiGramTypesWithoutUnigrams,
+                              false);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithHolesWithoutUnigrams()
+        {
+            ShingleFilterTest(2, TestTokenWithHoles, BiGramTokensWithHolesWithoutUnigrams,
+                              BiGramPositionIncrementsWithHolesWithoutUnigrams, BiGramTypesWithoutUnigrams,
+                              false);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithSingleToken()
+        {
+            ShingleFilterTest(2, TestSingleToken, SingleToken,
+                              SingleTokenIncrements, SingleTokenTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithSingleTokenWithoutUnigrams()
+        {
+            ShingleFilterTest(2, TestSingleToken, EmptyTokenArray,
+                              EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+                              false);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithEmptyTokenStream()
+        {
+            ShingleFilterTest(2, EmptyTokenArray, EmptyTokenArray,
+                              EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+                              true);
+        }
+
+        [Test]
+        public void TestBiGramFilterWithEmptyTokenStreamWithoutUnigrams()
+        {
+            ShingleFilterTest(2, EmptyTokenArray, EmptyTokenArray,
+                              EmptyTokenIncrementsArray, EmptyTokenTypesArray,
+                              false);
+        }
+
+        [Test]
+        public void TestTriGramFilter()
+        {
+            ShingleFilterTest(3, TestToken, TriGramTokens,
+                              TriGramPositionIncrements, TriGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestTriGramFilterWithoutUnigrams()
+        {
+            ShingleFilterTest(3, TestToken, TriGramTokensWithoutUnigrams,
+                              TriGramPositionIncrementsWithoutUnigrams, TriGramTypesWithoutUnigrams,
+                              false);
+        }
+
+        [Test]
+        public void TestFourGramFilter()
+        {
+            ShingleFilterTest(4, TestToken, FourGramTokens,
+                              FourGramPositionIncrements, FourGramTypes,
+                              true);
+        }
+
+        [Test]
+        public void TestFourGramFilterWithoutUnigrams()
+        {
+            ShingleFilterTest(4, TestToken, FourGramTokensWithoutUnigrams,
+                              FourGramPositionIncrementsWithoutUnigrams,
+                              FourGramTypesWithoutUnigrams, false);
+        }
+
+        [Test]
+        public void TestReset()
+        {
+            Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
+            TokenStream filter = new ShingleFilter(wsTokenizer, 2);
+
+            AssertTokenStreamContents(filter,
+                                      new[]
+                                          {
+                                              "please", "please divide", "divide", "divide this", "this",
+                                              "this sentence",
+                                              "sentence"
+                                          },
+                                      new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
+                                      new[]
+                                          {
+                                              TypeAttributeImpl.DEFAULT_TYPE, "shingle", TypeAttributeImpl.DEFAULT_TYPE,
+                                              "shingle", TypeAttributeImpl.DEFAULT_TYPE, "shingle",
+                                              TypeAttributeImpl.DEFAULT_TYPE
+                                          },
+                                      new[] {1, 0, 1, 0, 1, 0, 1}
+                );
+
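+            // Resetting the upstream tokenizer with a fresh Reader lets the same
+            // filter chain be consumed a second time; the output must be identical.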
+            wsTokenizer.Reset(new StringReader("please divide this sentence"));
+
+            AssertTokenStreamContents(filter,
+                                      new[]
+                                          {
+                                              "please", "please divide", "divide", "divide this", "this",
+                                              "this sentence",
+                                              "sentence"
+                                          },
+                                      new[] {0, 0, 7, 7, 14, 14, 19}, new[] {6, 13, 13, 18, 18, 27, 27},
+                                      new[]
+                                          {
+                                              TypeAttributeImpl.DEFAULT_TYPE, "shingle", TypeAttributeImpl.DEFAULT_TYPE,
+                                              "shingle", TypeAttributeImpl.DEFAULT_TYPE, "shingle",
+                                              TypeAttributeImpl.DEFAULT_TYPE
+                                          },
+                                      new[] {1, 0, 1, 0, 1, 0, 1}
+                );
+        }
+
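+        /// <summary>
+        /// Shared assertion driver: runs tokensToShingle through a ShingleFilter of
+        /// the given maximum shingle size and checks term text, offsets, position
+        /// increments, and types against the expected tokens.
+        /// </summary>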
+        protected void ShingleFilterTest(int maxSize, Token[] tokensToShingle, Token[] tokensToCompare,
+                                         int[] positionIncrements, String[] types, bool outputUnigrams)
+        {
+            var filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+            filter.SetOutputUnigrams(outputUnigrams);
+
+            var termAtt = (TermAttribute) filter.AddAttribute(typeof (TermAttribute));
+            var offsetAtt = (OffsetAttribute) filter.AddAttribute(typeof (OffsetAttribute));
+            var posIncrAtt = (PositionIncrementAttribute) filter.AddAttribute(typeof (PositionIncrementAttribute));
+            var typeAtt = (TypeAttribute) filter.AddAttribute(typeof (TypeAttribute));
+
+            int i = 0;
+            while (filter.IncrementToken())
+            {
+                Assert.IsTrue(i < tokensToCompare.Length, "ShingleFilter output more tokens than expected");
+
+                String termText = termAtt.Term();
+                String goldText = tokensToCompare[i].Term();
+
+                Assert.AreEqual(goldText, termText, "Wrong termText");
+                Assert.AreEqual(tokensToCompare[i].StartOffset(), offsetAtt.StartOffset(),
+                                "Wrong startOffset for token \"" + termText + "\"");
+                Assert.AreEqual(tokensToCompare[i].EndOffset(), offsetAtt.EndOffset(),
+                                "Wrong endOffset for token \"" + termText + "\"");
+                Assert.AreEqual(positionIncrements[i], posIncrAtt.GetPositionIncrement(),
+                                "Wrong positionIncrement for token \"" + termText + "\"");
+                Assert.AreEqual(types[i], typeAtt.Type(), "Wrong type for token \"" + termText + "\"");
+
+                i++;
+            }
+
+            Assert.AreEqual(tokensToCompare.Length, i,
+                            "ShingleFilter output the wrong number of tokens (output = " + i + "; expected = " +
+                            tokensToCompare.Length + ")");
+        }
+
+        #region Nested type: TestTokenStream
+
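+        /// <summary>
+        /// A fixed TokenStream that replays a predefined Token array through the
+        /// attribute API, one token per IncrementToken() call.
+        /// </summary>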
+        public sealed class TestTokenStream : TokenStream
+        {
+            private readonly OffsetAttribute _offsetAtt;
+            private readonly PositionIncrementAttribute _posIncrAtt;
+            private readonly TermAttribute _termAtt;
+            private readonly Token[] _testToken;
+            private readonly TypeAttribute _typeAtt;
+            private int _index;
+
+            public TestTokenStream(Token[] testToken)
+            {
+                _testToken = testToken;
+
+                _termAtt = (TermAttribute) AddAttribute(typeof (TermAttribute));
+                _offsetAtt = (OffsetAttribute) AddAttribute(typeof (OffsetAttribute));
+                _posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof (PositionIncrementAttribute));
+                _typeAtt = (TypeAttribute) AddAttribute(typeof (TypeAttribute));
+            }
+
+            public override bool IncrementToken()
+            {
+                ClearAttributes();
+
+                if (_index >= _testToken.Length)
+                    return false;
+
+                Token t = _testToken[_index++];
+
+                _termAtt.SetTermBuffer(t.TermBuffer(), 0, t.TermLength());
+                _offsetAtt.SetOffset(t.StartOffset(), t.EndOffset());
+                _posIncrAtt.SetPositionIncrement(t.GetPositionIncrement());
+                _typeAtt.SetType(TypeAttributeImpl.DEFAULT_TYPE);
+
+                return true;
+            }
+        }
+
+        #endregion
+    }
+}
\ No newline at end of file


