lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d...@apache.org
Subject [Lucene.Net] svn commit: r1103463 - in /incubator/lucene.net/trunk: src/contrib/Analyzers/ src/contrib/Analyzers/NGram/ test/contrib/Analyzers/ test/contrib/Analyzers/NGram/
Date Sun, 15 May 2011 17:12:25 GMT
Author: digy
Date: Sun May 15 17:12:24 2011
New Revision: 1103463

URL: http://svn.apache.org/viewvc?rev=1103463&view=rev
Log:
[LUCENENET-405] contrib/Analysis.NGram

Added:
    incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/
    incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
    incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
    incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
    incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/
    incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs
    incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs
Modified:
    incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
    incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj

Modified: incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj?rev=1103463&r1=1103462&r2=1103463&view=diff
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj (original)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/Contrib.Analyzers.csproj Sun May 15 17:12:24 2011
@@ -63,6 +63,10 @@
     <Compile Include="Fr\FrenchAnalyzer.cs" />
     <Compile Include="Fr\FrenchStemFilter.cs" />
     <Compile Include="Fr\FrenchStemmer.cs" />
+    <Compile Include="NGram\EdgeNGramTokenFilter.cs" />
+    <Compile Include="NGram\EdgeNGramTokenizer.cs" />
+    <Compile Include="NGram\NGramTokenFilter.cs" />
+    <Compile Include="NGram\NGramTokenizer.cs" />
     <Compile Include="Nl\DutchAnalyzer.cs" />
     <Compile Include="Nl\DutchStemFilter.cs" />
     <Compile Include="Nl\DutchStemmer.cs" />

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,198 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Tokenizes the given token into n-grams of given size(s).
+     * <p>
+     * This {@link TokenFilter} creates n-grams from the beginning edge or ending edge of an input token.
+     * For each token read from the wrapped stream it emits one n-gram per size from minGram
+     * up to maxGram (smallest first); sizes longer than the token are not emitted.
+     * </p>
+     */
+    public class EdgeNGramTokenFilter : TokenFilter
+    {
+        public static Side DEFAULT_SIDE = Side.FRONT;
+        public static int DEFAULT_MAX_GRAM_SIZE = 1;
+        public static int DEFAULT_MIN_GRAM_SIZE = 1;
+
+        // Type-safe enum pattern (private constructor, two shared instances);
+        // a candidate for replacement with a real enum.
+        /** Specifies which side of the input the n-gram should be generated from */
+        public class Side
+        {
+            private readonly string label;
+
+            /** Get the n-gram from the front of the input */
+            public static Side FRONT = new Side("front");
+
+            /** Get the n-gram from the end of the input */
+            public static Side BACK = new Side("back");
+
+            // Private ctor: FRONT and BACK are the only instances
+            private Side(string label) { this.label = label; }
+
+            /** Returns the textual name of this side ("front" or "back") */
+            public string getLabel() { return label; }
+
+            /**
+             * Gets the appropriate Side from a string.
+             *
+             * @param sideName the label to look up
+             * @return FRONT or BACK when the label matches exactly, otherwise null
+             */
+            public static Side getSide(string sideName)
+            {
+                if (FRONT.getLabel().Equals(sideName))
+                {
+                    return FRONT;
+                }
+                else if (BACK.getLabel().Equals(sideName))
+                {
+                    return BACK;
+                }
+                return null;
+            }
+        }
+
+        private int minGram;
+        private int maxGram;
+        private Side side;
+        private char[] curTermBuffer;   // copy of the token currently being split; null when a new token is needed
+        private int curTermLength;
+        private int curGramSize;        // size of the next n-gram to emit for the current token
+        private int tokStart;           // start offset of the current token
+
+        private TermAttribute termAtt;
+        private OffsetAttribute offsetAtt;
+
+
+        /**
+         * Creates EdgeNGramTokenFilter that uses DEFAULT_SIDE, DEFAULT_MIN_GRAM_SIZE and
+         * DEFAULT_MAX_GRAM_SIZE. Chaining to the validating constructor ensures that
+         * side, minGram and maxGram are never left unset.
+         *
+         * @param input {@link TokenStream} holding the input to be tokenized
+         */
+        protected EdgeNGramTokenFilter(TokenStream input)
+            : this(input, DEFAULT_SIDE, DEFAULT_MIN_GRAM_SIZE, DEFAULT_MAX_GRAM_SIZE)
+        {
+        }
+
+        /**
+         * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+         *
+         * @param input {@link TokenStream} holding the input to be tokenized
+         * @param side the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         * @throws System.ArgumentException if side is null, minGram is less than 1,
+         *         or minGram is greater than maxGram
+         */
+        public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram)
+            : base(input)
+        {
+            if (side == null)
+            {
+                throw new System.ArgumentException("sideLabel must be either front or back");
+            }
+
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            this.side = side;
+            this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+            this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+        }
+
+        /**
+         * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+         *
+         * @param input {@link TokenStream} holding the input to be tokenized
+         * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         * @throws System.ArgumentException if sideLabel does not name a Side, or the sizes are invalid
+         */
+        public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram)
+            : this(input, Side.getSide(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /**
+         * Advances to the next edge n-gram.
+         *
+         * @return true if an n-gram was written to the term and offset attributes,
+         *         false once the wrapped stream has no more tokens
+         */
+        public override bool IncrementToken()
+        {
+            while (true)
+            {
+                if (curTermBuffer == null)
+                {
+                    if (!input.IncrementToken())
+                    {
+                        return false;
+                    }
+
+                    // copy the term: termAtt is overwritten below with every emitted n-gram
+                    curTermBuffer = (char[])termAtt.TermBuffer().Clone();
+                    curTermLength = termAtt.TermLength();
+                    curGramSize = minGram;
+                    tokStart = offsetAtt.StartOffset();
+                }
+
+                // emit only while inside the size range and while the token is long enough
+                if (curGramSize <= maxGram && curGramSize <= curTermLength)
+                {
+                    // grab curGramSize chars from front or back
+                    int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
+                    int end = start + curGramSize;
+                    ClearAttributes();
+                    offsetAtt.SetOffset(tokStart + start, tokStart + end);
+                    termAtt.SetTermBuffer(curTermBuffer, start, curGramSize);
+                    curGramSize++;
+                    return true;
+                }
+
+                // current token is exhausted; fetch the next one on the next loop iteration
+                curTermBuffer = null;
+            }
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. Not meant to be overridden further, although this override
+         * is not sealed. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next(Token reusableToken)
+        {
+            return base.Next(reusableToken);
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. Not meant to be overridden further, although this override
+         * is not sealed. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next()
+        {
+            return base.Next();
+        }
+
+        /** Resets the base filter and drops the token currently being split. */
+        public override void Reset()
+        {
+            base.Reset();
+            curTermBuffer = null;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,271 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Tokenizes the input from an edge into n-grams of given size(s).
+     * <p>
+     * This {@link Tokenizer} creates n-grams from the beginning edge or ending edge of the input text.
+     * The whole input is read with ReadToEnd() and trimmed on the first call to IncrementToken().
+     * </p>
+     */
+    public class EdgeNGramTokenizer : Tokenizer
+    {
+        public static Side DEFAULT_SIDE = Side.FRONT;
+        public static int DEFAULT_MAX_GRAM_SIZE = 1;
+        public static int DEFAULT_MIN_GRAM_SIZE = 1;
+
+        private TermAttribute termAtt;
+        private OffsetAttribute offsetAtt;
+
+        // Type-safe enum pattern (private constructor, two shared instances);
+        // a candidate for replacement with a real enum.
+        /** Specifies which side of the input the n-gram should be generated from */
+        public class Side
+        {
+            private readonly string label;
+
+            /** Get the n-gram from the front of the input */
+            public static Side FRONT = new Side("front");
+
+            /** Get the n-gram from the end of the input */
+            public static Side BACK = new Side("back");
+
+            // Private ctor: FRONT and BACK are the only instances
+            private Side(string label) { this.label = label; }
+
+            /** Returns the textual name of this side ("front" or "back") */
+            public string getLabel() { return label; }
+
+            /**
+             * Gets the appropriate Side from a string.
+             *
+             * @param sideName the label to look up
+             * @return FRONT or BACK when the label matches exactly, otherwise null
+             */
+            public static Side getSide(string sideName)
+            {
+                if (FRONT.getLabel().Equals(sideName))
+                {
+                    return FRONT;
+                }
+                else if (BACK.getLabel().Equals(sideName))
+                {
+                    return BACK;
+                }
+                return null;
+            }
+        }
+
+        private int minGram;
+        private int maxGram;
+        private int gramSize;           // size of the next n-gram to emit
+        private Side side;
+        private bool started = false;   // false until the input has been read
+        private int inLen;              // length of the trimmed input
+        private string inStr;           // the trimmed input
+
+
+        /**
+         * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+         *
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param side the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public EdgeNGramTokenizer(TextReader input, Side side, int minGram, int maxGram)
+            : base(input)
+        {
+            init(side, minGram, maxGram);
+        }
+
+        /**
+         * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+         *
+         * @param source {@link AttributeSource} to use
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param side the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public EdgeNGramTokenizer(AttributeSource source, TextReader input, Side side, int minGram, int maxGram)
+            : base(source, input)
+        {
+            init(side, minGram, maxGram);
+        }
+
+        /**
+         * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+         *
+         * @param factory {@link AttributeFactory} to use
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param side the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
+            : base(factory, input)
+        {
+            init(side, minGram, maxGram);
+        }
+
+        /**
+         * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+         *
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public EdgeNGramTokenizer(TextReader input, string sideLabel, int minGram, int maxGram)
+            : this(input, Side.getSide(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /**
+         * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+         *
+         * @param source {@link AttributeSource} to use
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public EdgeNGramTokenizer(AttributeSource source, TextReader input, string sideLabel, int minGram, int maxGram)
+            : this(source, input, Side.getSide(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /**
+         * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+         *
+         * @param factory {@link AttributeFactory} to use
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram) :
+            this(factory, input, Side.getSide(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /**
+         * Shared constructor logic: validates the arguments, stores them and registers
+         * the term and offset attributes.
+         *
+         * @throws System.ArgumentException if side is null, minGram is less than 1,
+         *         or minGram is greater than maxGram
+         */
+        private void init(Side side, int minGram, int maxGram)
+        {
+            if (side == null)
+            {
+                throw new System.ArgumentException("sideLabel must be either front or back");
+            }
+
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            this.side = side;
+
+            this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+            this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+        }
+
+        /**
+         * Advances to the next edge n-gram.
+         *
+         * @return true if an n-gram was written to the term and offset attributes,
+         *         false when the input is shorter than the next size or maxGram has been passed
+         */
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+            // if we are just starting, read the whole input
+            if (!started)
+            {
+                started = true;
+                inStr = input.ReadToEnd().Trim();  // remove any leading or trailing spaces
+                inLen = inStr.Length;
+                gramSize = minGram;
+            }
+
+            // stop when the input is too short for this size,
+            // or when we have hit the end of our n-gram size range
+            if (gramSize > inLen || gramSize > maxGram)
+            {
+                return false;
+            }
+
+            // grab gramSize chars from front or back
+            // NOTE(review): start/end index the trimmed string, so input with leading
+            // whitespace yields offsets shifted by the number of trimmed characters.
+            int start = side == Side.FRONT ? 0 : inLen - gramSize;
+            int end = start + gramSize;
+            termAtt.SetTermBuffer(inStr, start, gramSize);
+            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
+            gramSize++;
+            return true;
+        }
+
+        /**
+         * Sets the final offset to the end of the (trimmed) input, mapped through
+         * CorrectOffset like every token offset produced by IncrementToken().
+         */
+        public override void End()
+        {
+            int finalOffset = CorrectOffset(inLen);
+            this.offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. Not meant to be overridden further, although this override
+         * is not sealed. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next(Token reusableToken)
+        {
+            return base.Next(reusableToken);
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. Not meant to be overridden further, although this override
+         * is not sealed. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next()
+        {
+            return base.Next();
+        }
+
+        /** Switches to a new reader and restarts tokenization. */
+        public override void Reset(TextReader input)
+        {
+            base.Reset(input);
+            Reset();
+        }
+
+        /** Restarts tokenization: the input is read again on the next IncrementToken() call. */
+        public override void Reset()
+        {
+            base.Reset();
+            started = false;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Splits every token of the wrapped stream into n-grams of the given size(s).
+     * For each token, all n-grams of the smallest size are emitted first (left to right),
+     * then all n-grams of the next size, and so on up to the largest size.
+     */
+    public class NGramTokenFilter : TokenFilter
+    {
+        public static int DEFAULT_MIN_NGRAM_SIZE = 1;
+        public static int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+        private int minGram;
+        private int maxGram;
+
+        private char[] curTermBuffer;   // copy of the token being split; null when a new token is needed
+        private int curTermLength;
+        private int curGramSize;        // size of the n-grams currently being emitted
+        private int curPos;             // start of the next n-gram inside the current token
+        private int tokStart;           // start offset of the current token
+
+        private TermAttribute termAtt;
+        private OffsetAttribute offsetAtt;
+
+        /**
+         * Creates NGramTokenFilter with given min and max n-grams.
+         * @param input {@link TokenStream} holding the input to be tokenized
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         * @throws System.ArgumentException if minGram is less than 1 or greater than maxGram
+         */
+        public NGramTokenFilter(TokenStream input, int minGram, int maxGram)
+            : base(input)
+        {
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+            offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+        }
+
+        /**
+         * Creates NGramTokenFilter with default min and max n-grams.
+         * @param input {@link TokenStream} holding the input to be tokenized
+         */
+        public NGramTokenFilter(TokenStream input)
+            : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+        {
+        }
+
+        /**
+         * Advances to the next n-gram.
+         *
+         * @return true if an n-gram was written to the term and offset attributes,
+         *         false once the wrapped stream has no more tokens
+         */
+        public override bool IncrementToken()
+        {
+            for (;;)
+            {
+                if (curTermBuffer == null)
+                {
+                    // need a fresh token from the wrapped stream
+                    if (!input.IncrementToken())
+                    {
+                        return false;
+                    }
+
+                    // copy the term: termAtt is overwritten below with every emitted n-gram
+                    curTermBuffer = (char[])termAtt.TermBuffer().Clone();
+                    curTermLength = termAtt.TermLength();
+                    curGramSize = minGram;
+                    curPos = 0;
+                    tokStart = offsetAtt.StartOffset();
+                }
+
+                // when a size is exhausted, move on to the next size and rewind to the token start
+                for (; curGramSize <= maxGram; curGramSize++, curPos = 0)
+                {
+                    if (curPos + curGramSize <= curTermLength)
+                    {
+                        int gramStart = tokStart + curPos;
+                        ClearAttributes();
+                        termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize);
+                        offsetAtt.SetOffset(gramStart, gramStart + curGramSize);
+                        curPos++;
+                        return true;
+                    }
+                }
+
+                // every size has been emitted for this token
+                curTermBuffer = null;
+            }
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next(Token reusableToken)
+        {
+            return base.Next(reusableToken);
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next()
+        {
+            return base.Next();
+        }
+
+        /** Resets the base filter and drops the token currently being split. */
+        public override void Reset()
+        {
+            base.Reset();
+            curTermBuffer = null;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/src/contrib/Analyzers/NGram/NGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,177 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Tokenizes the input into n-grams of the given size(s).
+     * The whole input is read on the first call to IncrementToken(); all n-grams of the
+     * smallest size are emitted first (left to right), then those of the next size, and so on.
+     */
+    public class NGramTokenizer : Tokenizer
+    {
+        public static int DEFAULT_MIN_NGRAM_SIZE = 1;
+        public static int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+        private int minGram, maxGram;
+        private int gramSize;           // size of the n-grams currently being emitted
+        private int pos = 0;            // start of the next n-gram inside inStr
+        private int inLen;              // length of inStr
+        private string inStr;           // the complete input
+        private bool started = false;   // false until the input has been read
+
+        private TermAttribute termAtt;
+        private OffsetAttribute offsetAtt;
+
+        /**
+         * Creates NGramTokenizer with given min and max n-grams.
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public NGramTokenizer(TextReader input, int minGram, int maxGram)
+            : base(input)
+        {
+            init(minGram, maxGram);
+        }
+
+        /**
+         * Creates NGramTokenizer with given min and max n-grams.
+         * @param source {@link AttributeSource} to use
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public NGramTokenizer(AttributeSource source, TextReader input, int minGram, int maxGram)
+            : base(source, input)
+        {
+            init(minGram, maxGram);
+        }
+
+        /**
+         * Creates NGramTokenizer with given min and max n-grams.
+         * @param factory {@link AttributeFactory} to use
+         * @param input {@link TextReader} holding the input to be tokenized
+         * @param minGram the smallest n-gram to generate
+         * @param maxGram the largest n-gram to generate
+         */
+        public NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
+            : base(factory, input)
+        {
+            init(minGram, maxGram);
+        }
+
+        /**
+         * Creates NGramTokenizer with default min and max n-grams.
+         * @param input {@link TextReader} holding the input to be tokenized
+         */
+        public NGramTokenizer(TextReader input)
+            : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+        {
+        }
+
+        /**
+         * Shared constructor logic: validates the sizes, stores them and registers
+         * the term and offset attributes.
+         *
+         * @throws System.ArgumentException if minGram is less than 1 or greater than maxGram
+         */
+        private void init(int minGram, int maxGram)
+        {
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+
+            this.termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
+            this.offsetAtt = (OffsetAttribute)AddAttribute(typeof(OffsetAttribute));
+        }
+
+        /**
+         * Advances to the next n-gram.
+         *
+         * @return true if an n-gram was written to the term and offset attributes,
+         *         false once every size up to maxGram has been emitted or the input is
+         *         too short; further calls keep returning false until Reset()
+         */
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+            if (!started)
+            {
+                started = true;
+                gramSize = minGram;
+                inStr = input.ReadToEnd();  // the whole input, not trimmed
+                inLen = inStr.Length;
+            }
+
+            if (pos + gramSize > inLen)
+            {
+                // hit the end of the string: rewind and move on to the next n-gram size
+                pos = 0;
+                gramSize++;
+            }
+
+            // Checked on every call, not only after a wrap-around, so that calls made
+            // after the end of the stream cannot emit n-grams larger than maxGram.
+            if (gramSize > maxGram || pos + gramSize > inLen)
+            {
+                return false;
+            }
+
+            int oldPos = pos;
+            pos++;
+            termAtt.SetTermBuffer(inStr, oldPos, gramSize);
+            offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
+            return true;
+        }
+
+        /**
+         * Sets the final offset to the end of the input, mapped through CorrectOffset
+         * like every token offset produced by IncrementToken().
+         */
+        public override void End()
+        {
+            int finalOffset = CorrectOffset(inLen);
+            this.offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. Not meant to be overridden further, although this override
+         * is not sealed. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next(Token reusableToken)
+        {
+            return base.Next(reusableToken);
+        }
+
+        /** @deprecated Will be removed in Lucene 3.0. Delegates to the backwards compatibility
+         * layer in the base class. Not meant to be overridden further, although this override
+         * is not sealed. */
+        [System.Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer.")]
+        public override Token Next()
+        {
+            return base.Next();
+        }
+
+        /** Switches to a new reader and restarts tokenization. */
+        public override void Reset(TextReader input)
+        {
+            base.Reset(input);
+            Reset();
+        }
+
+        /** Restarts tokenization: the input is read again on the next IncrementToken() call. */
+        public override void Reset()
+        {
+            base.Reset();
+            started = false;
+            pos = 0;
+        }
+    }
+}
\ No newline at end of file

Modified: incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj?rev=1103463&r1=1103462&r2=1103463&view=diff
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj (original)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/Contrib.Analyzers.Test.csproj Sun May 15 17:12:24 2011
@@ -60,6 +60,10 @@
     <Compile Include="AR\TestArabicAnalyzer.cs" />
     <Compile Include="AR\TestArabicNormalizationFilter.cs" />
     <Compile Include="AR\TestArabicStemFilter.cs" />
+    <Compile Include="NGram\TestEdgeNGramTokenFilter.cs" />
+    <Compile Include="NGram\TestEdgeNGramTokenizer.cs" />
+    <Compile Include="NGram\TestNGramTokenFilter.cs" />
+    <Compile Include="NGram\TestNGramTokenizer.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
   </ItemGroup>
   <ItemGroup>

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Exercises {@link EdgeNGramTokenFilter}: rejection of invalid size
+     * arguments, front and back edge n-grams with their offsets, and reuse of
+     * the filter after the wrapped tokenizer has been given a new reader.
+     */
+    [TestFixture]
+    public class TestEdgeNGramTokenFilter : BaseTokenStreamTestCase
+    {
+        private TokenStream input;
+
+        // NOTE(review): declared without 'override' or 'new' although base.SetUp()
+        // is callable, so this appears to hide the base member -- confirm intent.
+        [SetUp]
+        public void SetUp()
+        {
+            base.SetUp();
+            input = new WhitespaceTokenizer(new StringReader("abcde"));
+        }
+
+        /** Size arguments (0, 0) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput()
+        {
+            bool rejected;
+            try
+            {
+                new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 0, 0);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Size arguments (2, 1) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput2()
+        {
+            bool rejected;
+            try
+            {
+                new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 2, 1);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Size arguments (-1, 2) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput3()
+        {
+            bool rejected;
+            try
+            {
+                new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, -1, 2);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** FRONT side with sizes (1, 1) over "abcde" is expected to produce just "a". */
+        [Test]
+        public void TestFrontUnigram()
+        {
+            string[] expectedTerms = { "a" };
+            int[] expectedStarts = { 0 };
+            int[] expectedEnds = { 1 };
+
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** BACK side with sizes (1, 1) over "abcde" is expected to produce just "e". */
+        [Test]
+        public void TestBackUnigram()
+        {
+            string[] expectedTerms = { "e" };
+            int[] expectedStarts = { 4 };
+            int[] expectedEnds = { 5 };
+
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** Sizes (6, 6) exceed the 5-character input, so no tokens are expected. */
+        [Test]
+        public void TestOversizedNgrams()
+        {
+            string[] noTerms = new string[0];
+            int[] noStarts = new int[0];
+            int[] noEnds = new int[0];
+
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
+            AssertTokenStreamContents(filter, noTerms, noStarts, noEnds);
+        }
+
+        /** FRONT side with sizes (1, 3) is expected to produce "a", "ab", "abc". */
+        [Test]
+        public void TestFrontRangeOfNgrams()
+        {
+            string[] expectedTerms = { "a", "ab", "abc" };
+            int[] expectedStarts = { 0, 0, 0 };
+            int[] expectedEnds = { 1, 2, 3 };
+
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** BACK side with sizes (1, 3) is expected to produce "e", "de", "cde". */
+        [Test]
+        public void TestBackRangeOfNgrams()
+        {
+            string[] expectedTerms = { "e", "de", "cde" };
+            int[] expectedStarts = { 4, 3, 2 };
+            int[] expectedEnds = { 5, 5, 5 };
+
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** With sizes (3, 3) over "abc de fgh", the two-character token "de" yields nothing. */
+        [Test]
+        public void TestSmallTokenInStream()
+        {
+            string[] expectedTerms = { "abc", "fgh" };
+            int[] expectedStarts = { 0, 7 };
+            int[] expectedEnds = { 3, 10 };
+
+            input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** The same filter must give identical output after the wrapped tokenizer gets a new reader. */
+        [Test]
+        public void TestReset()
+        {
+            string[] expectedTerms = { "a", "ab", "abc" };
+            int[] expectedStarts = { 0, 0, 0 };
+            int[] expectedEnds = { 1, 2, 3 };
+
+            WhitespaceTokenizer source = new WhitespaceTokenizer(new StringReader("abcde"));
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(source, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+
+            // Only the tokenizer is given a new reader; the filter itself is reused as-is.
+            source.Reset(new StringReader("abcde"));
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestEdgeNGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Exercises {@link EdgeNGramTokenizer}: rejection of invalid size
+     * arguments, front and back edge n-grams with their offsets and final
+     * offset, and reuse of the tokenizer after it is given a new reader.
+     */
+    [TestFixture]
+    public class TestEdgeNGramTokenizer : BaseTokenStreamTestCase
+    {
+        private StringReader input;
+
+        // NOTE(review): declared without 'override' or 'new' although base.SetUp()
+        // is callable, so this appears to hide the base member -- confirm intent.
+        [SetUp]
+        public void SetUp()
+        {
+            base.SetUp();
+            input = new StringReader("abcde");
+        }
+
+        /** Size arguments (0, 0) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput()
+        {
+            bool rejected;
+            try
+            {
+                new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 0, 0);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Size arguments (2, 1) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput2()
+        {
+            bool rejected;
+            try
+            {
+                new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 2, 1);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Size arguments (-1, 2) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput3()
+        {
+            bool rejected;
+            try
+            {
+                new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, -1, 2);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** FRONT side with sizes (1, 1) over "abcde" is expected to produce just "a". */
+        [Test]
+        public void TestFrontUnigram()
+        {
+            string[] expectedTerms = { "a" };
+            int[] expectedStarts = { 0 };
+            int[] expectedEnds = { 1 };
+
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+
+        /** BACK side with sizes (1, 1) over "abcde" is expected to produce just "e". */
+        [Test]
+        public void TestBackUnigram()
+        {
+            string[] expectedTerms = { "e" };
+            int[] expectedStarts = { 4 };
+            int[] expectedEnds = { 5 };
+
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+
+        /** Sizes (6, 6) exceed the 5-character input, so no tokens are expected. */
+        [Test]
+        public void TestOversizedNgrams()
+        {
+            string[] noTerms = new string[0];
+            int[] noStarts = new int[0];
+            int[] noEnds = new int[0];
+
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
+            AssertTokenStreamContents(tokenizer, noTerms, noStarts, noEnds, 5 /* abcde */);
+        }
+
+        /** FRONT side with sizes (1, 3) is expected to produce "a", "ab", "abc". */
+        [Test]
+        public void TestFrontRangeOfNgrams()
+        {
+            string[] expectedTerms = { "a", "ab", "abc" };
+            int[] expectedStarts = { 0, 0, 0 };
+            int[] expectedEnds = { 1, 2, 3 };
+
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+
+        /** BACK side with sizes (1, 3) is expected to produce "e", "de", "cde". */
+        [Test]
+        public void TestBackRangeOfNgrams()
+        {
+            string[] expectedTerms = { "e", "de", "cde" };
+            int[] expectedStarts = { 4, 3, 2 };
+            int[] expectedEnds = { 5, 5, 5 };
+
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+
+        /** The tokenizer must give identical output after Reset with a fresh reader. */
+        [Test]
+        public void TestReset()
+        {
+            string[] expectedTerms = { "a", "ab", "abc" };
+            int[] expectedStarts = { 0, 0, 0 };
+            int[] expectedEnds = { 1, 2, 3 };
+
+            EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+
+            tokenizer.Reset(new StringReader("abcde"));
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenFilter.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Exercises {@link NGramTokenFilter}: rejection of invalid size arguments,
+     * n-gram output with offsets for several size ranges, and reuse of the
+     * filter after the wrapped tokenizer has been given a new reader.
+     */
+    [TestFixture]
+    public class TestNGramTokenFilter : BaseTokenStreamTestCase
+    {
+        private TokenStream input;
+
+        // NOTE(review): declared without 'override' or 'new' although base.SetUp()
+        // is callable, so this appears to hide the base member -- confirm intent.
+        [SetUp]
+        public void SetUp()
+        {
+            base.SetUp();
+            input = new WhitespaceTokenizer(new StringReader("abcde"));
+        }
+
+        /** Size arguments (2, 1) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput()
+        {
+            bool rejected;
+            try
+            {
+                new NGramTokenFilter(input, 2, 1);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Size arguments (0, 1) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput2()
+        {
+            bool rejected;
+            try
+            {
+                new NGramTokenFilter(input, 0, 1);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Sizes (1, 1) over "abcde" are expected to produce each single character. */
+        [Test]
+        public void TestUnigrams()
+        {
+            string[] expectedTerms = { "a", "b", "c", "d", "e" };
+            int[] expectedStarts = { 0, 1, 2, 3, 4 };
+            int[] expectedEnds = { 1, 2, 3, 4, 5 };
+
+            NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** Sizes (2, 2) over "abcde" are expected to produce the four adjacent pairs. */
+        [Test]
+        public void TestBigrams()
+        {
+            string[] expectedTerms = { "ab", "bc", "cd", "de" };
+            int[] expectedStarts = { 0, 1, 2, 3 };
+            int[] expectedEnds = { 2, 3, 4, 5 };
+
+            NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** Sizes (1, 3): all 1-grams are expected first, then all 2-grams, then all 3-grams. */
+        [Test]
+        public void TestNgrams()
+        {
+            string[] expectedTerms = { "a", "b", "c", "d", "e", "ab", "bc", "cd", "de", "abc", "bcd", "cde" };
+            int[] expectedStarts = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 0, 1, 2 };
+            int[] expectedEnds = { 1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5 };
+
+            NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** Sizes (6, 7) exceed the 5-character input, so no tokens are expected. */
+        [Test]
+        public void TestOversizedNgrams()
+        {
+            string[] noTerms = new string[0];
+            int[] noStarts = new int[0];
+            int[] noEnds = new int[0];
+
+            NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
+            AssertTokenStreamContents(filter, noTerms, noStarts, noEnds);
+        }
+
+        /** With sizes (3, 3) over "abc de fgh", the two-character token "de" yields nothing. */
+        [Test]
+        public void TestSmallTokenInStream()
+        {
+            string[] expectedTerms = { "abc", "fgh" };
+            int[] expectedStarts = { 0, 7 };
+            int[] expectedEnds = { 3, 10 };
+
+            input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
+            NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+
+        /** The same filter must give identical output after the wrapped tokenizer gets a new reader. */
+        [Test]
+        public void TestReset()
+        {
+            string[] expectedTerms = { "a", "b", "c", "d", "e" };
+            int[] expectedStarts = { 0, 1, 2, 3, 4 };
+            int[] expectedEnds = { 1, 2, 3, 4, 5 };
+
+            WhitespaceTokenizer source = new WhitespaceTokenizer(new StringReader("abcde"));
+            NGramTokenFilter filter = new NGramTokenFilter(source, 1, 1);
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+
+            // Only the tokenizer is given a new reader; the filter itself is reused as-is.
+            source.Reset(new StringReader("abcde"));
+            AssertTokenStreamContents(filter, expectedTerms, expectedStarts, expectedEnds);
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs?rev=1103463&view=auto
==============================================================================
--- incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs (added)
+++ incubator/lucene.net/trunk/test/contrib/Analyzers/NGram/TestNGramTokenizer.cs Sun May 15 17:12:24 2011
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.IO;
+using System.Collections;
+
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using NUnit.Framework;
+
+namespace Lucene.Net.Analysis.NGram
+{
+
+    /**
+     * Exercises {@link NGramTokenizer}: rejection of invalid size arguments,
+     * n-gram output with offsets and final offset for several size ranges, and
+     * reuse of the tokenizer after it is given a new reader.
+     */
+    [TestFixture]
+    public class TestNGramTokenizer : BaseTokenStreamTestCase
+    {
+        private StringReader input;
+
+        // NOTE(review): declared without 'override' or 'new' although base.SetUp()
+        // is callable, so this appears to hide the base member -- confirm intent.
+        [SetUp]
+        public void SetUp()
+        {
+            base.SetUp();
+            input = new StringReader("abcde");
+        }
+
+        /** Size arguments (2, 1) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput()
+        {
+            bool rejected;
+            try
+            {
+                new NGramTokenizer(input, 2, 1);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Size arguments (0, 1) must make the constructor throw an ArgumentException. */
+        [Test]
+        public void TestInvalidInput2()
+        {
+            bool rejected;
+            try
+            {
+                new NGramTokenizer(input, 0, 1);
+                rejected = false;
+            }
+            catch (System.ArgumentException)
+            {
+                rejected = true;
+            }
+            Assert.IsTrue(rejected);
+        }
+
+        /** Sizes (1, 1) over "abcde" are expected to produce each single character. */
+        [Test]
+        public void TestUnigrams()
+        {
+            string[] expectedTerms = { "a", "b", "c", "d", "e" };
+            int[] expectedStarts = { 0, 1, 2, 3, 4 };
+            int[] expectedEnds = { 1, 2, 3, 4, 5 };
+
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+
+        /** Sizes (2, 2) over "abcde" are expected to produce the four adjacent pairs. */
+        [Test]
+        public void TestBigrams()
+        {
+            string[] expectedTerms = { "ab", "bc", "cd", "de" };
+            int[] expectedStarts = { 0, 1, 2, 3 };
+            int[] expectedEnds = { 2, 3, 4, 5 };
+
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+
+        /** Sizes (1, 3): all 1-grams are expected first, then all 2-grams, then all 3-grams. */
+        [Test]
+        public void TestNgrams()
+        {
+            string[] expectedTerms = { "a", "b", "c", "d", "e", "ab", "bc", "cd", "de", "abc", "bcd", "cde" };
+            int[] expectedStarts = { 0, 1, 2, 3, 4, 0, 1, 2, 3, 0, 1, 2 };
+            int[] expectedEnds = { 1, 2, 3, 4, 5, 2, 3, 4, 5, 3, 4, 5 };
+
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+
+        /** Sizes (6, 7) exceed the 5-character input, so no tokens are expected. */
+        [Test]
+        public void TestOversizedNgrams()
+        {
+            string[] noTerms = new string[0];
+            int[] noStarts = new int[0];
+            int[] noEnds = new int[0];
+
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
+            AssertTokenStreamContents(tokenizer, noTerms, noStarts, noEnds, 5 /* abcde */);
+        }
+
+        /** The tokenizer must give identical output after Reset with a fresh reader. */
+        [Test]
+        public void TestReset()
+        {
+            string[] expectedTerms = { "a", "b", "c", "d", "e" };
+            int[] expectedStarts = { 0, 1, 2, 3, 4 };
+            int[] expectedEnds = { 1, 2, 3, 4, 5 };
+
+            NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+
+            tokenizer.Reset(new StringReader("abcde"));
+            AssertTokenStreamContents(tokenizer, expectedTerms, expectedStarts, expectedEnds, 5 /* abcde */);
+        }
+    }
+}
\ No newline at end of file



Mime
View raw message