lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nightowl...@apache.org
Subject [38/39] lucenenet git commit: Lucene.Net.Analysis.Ngram - renamed NGram in Git
Date Sat, 04 Feb 2017 20:32:57 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
deleted file mode 100644
index 8cf8172..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
+++ /dev/null
@@ -1,245 +0,0 @@
-using Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-using System;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-    /// <summary>
-    /// Tokenizes the given token into n-grams of given size(s).
-    /// <para>
-    /// This <see cref="TokenFilter"/> creates n-grams from the beginning edge or ending edge of an input token.
-    /// </para>
-    /// <para>As of Lucene 4.4, this filter does not support
-    /// <see cref="Side.BACK"/> (you can use <see cref="Reverse.ReverseStringFilter"/> up-front and
-    /// afterward to get the same behavior), handles supplementary characters
-    /// correctly and does not update offsets anymore.
-    /// </para>
-    /// </summary>
-    public sealed class EdgeNGramTokenFilter : TokenFilter
-    {
-        public const Side DEFAULT_SIDE = Side.FRONT;
-        public const int DEFAULT_MAX_GRAM_SIZE = 1;
-        public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
-        /// <summary>
-        /// Specifies which side of the input the n-gram should be generated from </summary>
-        public enum Side
-        {
-            /// <summary>
-            /// Get the n-gram from the front of the input </summary>
-            FRONT,
-
-            /// <summary>
-            /// Get the n-gram from the end of the input </summary>
-            [System.Obsolete]
-            BACK,
-        }
-
-        /// <summary>
-        /// Get the appropriate <see cref="Side"/> from a string
-        /// </summary>
-        public static Side GetSide(string sideName)
-        {
-            Side result;
-            if (!Enum.TryParse(sideName, true, out result))
-            {
-                result = Side.FRONT;
-            }
-            return result;
-        }
-
-        private readonly LuceneVersion version;
-        private readonly CharacterUtils charUtils;
-        private readonly int minGram;
-        private readonly int maxGram;
-        private Side side;
-        private char[] curTermBuffer;
-        private int curTermLength;
-        private int curCodePointCount;
-        private int curGramSize;
-        private int tokStart;
-        private int tokEnd; // only used if the length changed before this filter
-        private bool updateOffsets; // never if the length changed before this filter
-        private int savePosIncr;
-        private int savePosLen;
-
-        private readonly ICharTermAttribute termAtt;
-        private readonly IOffsetAttribute offsetAtt;
-        private readonly IPositionIncrementAttribute posIncrAtt;
-        private readonly IPositionLengthAttribute posLenAtt;
-
-        /// <summary>
-        /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
-        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        [Obsolete]
-        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
-              : base(input)
-        {
-
-            //if (version == null)
-            //{
-            //    throw new System.ArgumentException("version must not be null");
-            //}
-
-            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
-            {
-                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
-            }
-
-            if (!Enum.IsDefined(typeof(Side), side))
-            {
-                throw new System.ArgumentException("sideLabel must be either front or back");
-            }
-
-            if (minGram < 1)
-            {
-                throw new System.ArgumentException("minGram must be greater than zero");
-            }
-
-            if (minGram > maxGram)
-            {
-                throw new System.ArgumentException("minGram must not be greater than maxGram");
-            }
-
-            this.version = version;
-            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
-            this.minGram = minGram;
-            this.maxGram = maxGram;
-            this.side = side;
-
-            this.termAtt = AddAttribute<ICharTermAttribute>();
-            this.offsetAtt = AddAttribute<IOffsetAttribute>();
-            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
-            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
-        }
-
-        /// <summary>
-        /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
-        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        [Obsolete]
-        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, string sideLabel, int minGram, int maxGram)
-              : this(version, input, GetSide(sideLabel), minGram, maxGram)
-        {
-        }
-
-        /// <summary>
-        /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
-#pragma warning disable 612, 618
-              : this(version, input, Side.FRONT, minGram, maxGram)
-#pragma warning restore 612, 618
-        {
-        }
-
-        public override sealed bool IncrementToken()
-        {
-            while (true)
-            {
-                if (curTermBuffer == null)
-                {
-                    if (!m_input.IncrementToken())
-                    {
-                        return false;
-                    }
-                    else
-                    {
-                        curTermBuffer = (char[])termAtt.Buffer.Clone();
-                        curTermLength = termAtt.Length;
-                        curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
-                        curGramSize = minGram;
-                        tokStart = offsetAtt.StartOffset;
-                        tokEnd = offsetAtt.EndOffset;
-#pragma warning disable 612, 618
-                        if (version.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
-                        {
-                            // Never update offsets
-                            updateOffsets = false;
-                        }
-                        else
-                        {
-                            // if length by start + end offsets doesn't match the term text then assume
-                            // this is a synonym and don't adjust the offsets.
-                            updateOffsets = (tokStart + curTermLength) == tokEnd;
-                        }
-                        savePosIncr += posIncrAtt.PositionIncrement;
-                        savePosLen = posLenAtt.PositionLength;
-                    }
-                }
-                if (curGramSize <= maxGram) // if we have hit the end of our n-gram size range, quit
-                {
-                    if (curGramSize <= curCodePointCount) // if the remaining input is too short, we can't generate any n-grams
-                    {
-                        // grab gramSize chars from front or back
-                        int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
-                        int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
-                        ClearAttributes();
-                        if (updateOffsets)
-                        {
-                            offsetAtt.SetOffset(tokStart + start, tokStart + end);
-                        }
-                        else
-                        {
-                            offsetAtt.SetOffset(tokStart, tokEnd);
-                        }
-                        // first ngram gets increment, others don't
-                        if (curGramSize == minGram)
-                        {
-                            posIncrAtt.PositionIncrement = savePosIncr;
-                            savePosIncr = 0;
-                        }
-                        else
-                        {
-                            posIncrAtt.PositionIncrement = 0;
-                        }
-                        posLenAtt.PositionLength = savePosLen;
-                        termAtt.CopyBuffer(curTermBuffer, start, end - start);
-                        curGramSize++;
-                        return true;
-                    }
-                }
-                curTermBuffer = null;
-            }
-        }
-
-        public override void Reset()
-        {
-            base.Reset();
-            curTermBuffer = null;
-            savePosIncr = 0;
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
deleted file mode 100644
index ed2cb3d..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
+++ /dev/null
@@ -1,72 +0,0 @@
-using Lucene.Net.Util;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Tokenizes the input from an edge into n-grams of given size(s).
-    /// <para>
-    /// This <see cref="Tokenizer"/> creates n-grams from the beginning edge or ending edge of an input token.
-    /// </para>
-    /// <para>As of Lucene 4.4, this tokenizer
-    /// <list type="bullet">
-    ///     <item>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage</item>
-    ///     <item>doesn't trim the input,</item>
-    ///     <item>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones</item>
-    ///     <item>doesn't support backward n-grams anymore.</item>
-    ///     <item>supports <see cref="Util.CharTokenizer.IsTokenChar(int)"/> pre-tokenization,</item>
-    ///     <item>correctly handles supplementary characters.</item>
-    /// </list>
-    /// </para>
-    /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
-    /// to use the old behavior through <see cref="Lucene43EdgeNGramTokenizer"/>.
-    /// </para>
-    /// </summary>
-    public class EdgeNGramTokenizer : NGramTokenizer
-    {
-        public const int DEFAULT_MAX_GRAM_SIZE = 1;
-        public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
-        /// <summary>
-        /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
-            : base(version, input, minGram, maxGram, true)
-        {
-        }
-
-        /// <summary>
-        /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
-            : base(version, factory, input, minGram, maxGram, true)
-        {
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
deleted file mode 100644
index 00325f5..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
+++ /dev/null
@@ -1,75 +0,0 @@
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-using System;
-using System.Collections.Generic;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-    /// <summary>
-    /// Creates new instances of <see cref="EdgeNGramTokenizer"/>.
-    /// <code>
-    /// &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
-    ///   &lt;analyzer&gt;
-    ///     &lt;tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/&gt;
-    ///   &lt;/analyzer&gt;
-    /// &lt;/fieldType&gt;</code>
-    /// </summary>
-    public class EdgeNGramTokenizerFactory : TokenizerFactory
-    {
-        // Largest and smallest n-gram sizes handed to the tokenizer.
-        private readonly int maxGramSize;
-        private readonly int minGramSize;
-
-        // Raw "side" argument; it is interpreted in Create because whether a
-        // backward side is acceptable depends on the Lucene match version.
-        private readonly string side;
-
-        /// <summary>
-        /// Creates a new <see cref="EdgeNGramTokenizerFactory"/> from the
-        /// "minGramSize", "maxGramSize" and "side" entries of <paramref name="args"/>.
-        /// </summary>
-        /// <param name="args"> factory arguments; missing entries fall back to the
-        /// <see cref="EdgeNGramTokenizer"/> defaults and to the front side </param>
-        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> still
-        /// contains entries after the known ones have been read </exception>
-        public EdgeNGramTokenizerFactory(IDictionary<string, string> args) : base(args)
-        {
-            minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
-            maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
-            side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
-
-            // NOTE(review): presumably Get/GetInt remove the keys they consume, so
-            // anything left over is unrecognized - confirm against TokenizerFactory.
-            if (args.Count > 0)
-            {
-                // Concatenating the dictionary itself would print its .NET type name,
-                // so list the offending keys explicitly.
-                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
-            }
-        }
-
-        /// <summary>
-        /// Creates the tokenizer matching the configured Lucene match version:
-        /// <see cref="EdgeNGramTokenizer"/> for 4.4 and later, otherwise
-        /// <see cref="Lucene43EdgeNGramTokenizer"/>.
-        /// </summary>
-        /// <param name="factory"> attribute factory forwarded to the created tokenizer </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <returns> a new tokenizer reading from <paramref name="input"/> </returns>
-        /// <exception cref="System.ArgumentException"> if the match version is 4.4 or later
-        /// and the configured side is anything other than the front side </exception>
-        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
-        {
-#pragma warning disable 612, 618
-            if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
-            {
-                // Enum.TryParse also succeeds for "back" and for numeric strings, so a
-                // successful parse alone does not prove that the front edge was requested.
-                EdgeNGramTokenFilter.Side sideEnum;
-                if (!Enum.TryParse(this.side, true, out sideEnum) || sideEnum != EdgeNGramTokenFilter.Side.FRONT)
-                {
-                    throw new System.ArgumentException(typeof(EdgeNGramTokenizer).Name + " does not support backward n-grams as of Lucene 4.4");
-                }
-                return new EdgeNGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize);
-            }
-            else
-            {
-#pragma warning disable 612, 618
-                return new Lucene43EdgeNGramTokenizer(m_luceneMatchVersion, factory, input, side, minGramSize, maxGramSize);
-#pragma warning restore 612, 618
-            }
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
deleted file mode 100644
index 4dadbed..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
+++ /dev/null
@@ -1,297 +0,0 @@
-using Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Util;
-using System;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Old version of <see cref="EdgeNGramTokenizer"/> which doesn't handle
-    /// supplementary characters correctly.
-    /// </summary>
-    [Obsolete]
-    public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
-    {
-        public const Side DEFAULT_SIDE = Side.FRONT;
-        public const int DEFAULT_MAX_GRAM_SIZE = 1;
-        public const int DEFAULT_MIN_GRAM_SIZE = 1;
-
-        private ICharTermAttribute termAtt;
-        private IOffsetAttribute offsetAtt;
-        private IPositionIncrementAttribute posIncrAtt;
-
-        /// <summary>
-        /// Specifies which side of the input the n-gram should be generated from </summary>
-        public enum Side
-        {
-            /// <summary>
-            /// Get the n-gram from the front of the input </summary>
-            FRONT,
-
-            /// <summary>
-            /// Get the n-gram from the end of the input </summary>
-            BACK,
-        }
-
-        // Get the appropriate Side from a string
-        public static Side GetSide(string sideName)
-        {
-            Side result;
-            if (!Enum.TryParse(sideName, true, out result))
-            {
-                result = Side.FRONT;
-            }
-            return result;
-        }
-
-        private int minGram;
-        private int maxGram;
-        private int gramSize;
-        private Side side;
-        private bool started;
-        private int inLen; // length of the input AFTER trim()
-        private int charsRead; // length of the input
-        private string inStr;
-
-
-        /// <summary>
-        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        [Obsolete]
-        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
-            : base(input)
-        {
-            Init(version, side, minGram, maxGram);
-        }
-
-        /// <summary>
-        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        [Obsolete]
-        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
-            : base(factory, input)
-        {
-            Init(version, side, minGram, maxGram);
-        }
-
-        /// <summary>
-        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        [Obsolete]
-        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, string sideLabel, int minGram, int maxGram)
-            : this(version, input, GetSide(sideLabel), minGram, maxGram)
-        {
-        }
-
-        /// <summary>
-        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        [Obsolete]
-        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
-            : this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
-        {
-        }
-
-        /// <summary>
-        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
-            : this(version, input, Side.FRONT, minGram, maxGram)
-        {
-        }
-
-        /// <summary>
-        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
-        /// </summary>
-        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
-        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
-            : this(version, factory, input, Side.FRONT, minGram, maxGram)
-        {
-        }
-
-        private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
-        {
-            //if (version == null)
-            //{
-            //    throw new System.ArgumentException("version must not be null");
-            //}
-
-            if (!Enum.IsDefined(typeof(Side), side))
-            {
-                throw new System.ArgumentException("sideLabel must be either front or back");
-            }
-
-            if (minGram < 1)
-            {
-                throw new System.ArgumentException("minGram must be greater than zero");
-            }
-
-            if (minGram > maxGram)
-            {
-                throw new System.ArgumentException("minGram must not be greater than maxGram");
-            }
-
-            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
-            {
-                if (side == Side.BACK)
-                {
-                    throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
-                }
-            }
-            else
-            {
-                maxGram = Math.Min(maxGram, 1024);
-            }
-
-            this.minGram = minGram;
-            this.maxGram = maxGram;
-            this.side = side;
-            this.termAtt = AddAttribute<ICharTermAttribute>();
-            this.offsetAtt = AddAttribute<IOffsetAttribute>();
-            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
-        }
-
-        /// <summary>
-        /// Advances to the next n-gram token; returns <c>false</c> once no further n-gram can be produced. </summary>
-        public override bool IncrementToken()
-        {
-            ClearAttributes();
-            // if we are just starting, read the whole input
-            if (!started)
-            {
-                started = true;
-                gramSize = minGram;
-                int limit = side == Side.FRONT ? maxGram : 1024;
-                char[] chars = new char[Math.Min(1024, limit)];
-                charsRead = 0;
-                // TODO: refactor to a shared readFully somewhere:
-                bool exhausted = false;
-                while (charsRead < limit)
-                {
-                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
-                    if (inc <= 0)
-                    {
-                        exhausted = true;
-                        break;
-                    }
-                    charsRead += inc;
-                    if (charsRead == chars.Length && charsRead < limit)
-                    {
-                        chars = ArrayUtil.Grow(chars);
-                    }
-                }
-
-                inStr = new string(chars, 0, charsRead);
-                inStr = inStr.Trim();
-
-                if (!exhausted)
-                {
-                    // Read extra throwaway chars so that on end() we
-                    // report the correct offset:
-                    var throwaway = new char[1024];
-                    while (true)
-                    {
-                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
-                        if (inc <= 0)
-                        {
-                            break;
-                        }
-                        charsRead += inc;
-                    }
-                }
-
-                inLen = inStr.Length;
-                if (inLen == 0)
-                {
-                    return false;
-                }
-                posIncrAtt.PositionIncrement = 1;
-            }
-            else
-            {
-                posIncrAtt.PositionIncrement = 0;
-            }
-
-            // if the remaining input is too short, we can't generate any n-grams
-            if (gramSize > inLen)
-            {
-                return false;
-            }
-
-            // if we have hit the end of our n-gram size range, quit
-            if (gramSize > maxGram || gramSize > inLen)
-            {
-                return false;
-            }
-
-            // grab gramSize chars from front or back
-            int start = side == Side.FRONT ? 0 : inLen - gramSize;
-            int end = start + gramSize;
-            termAtt.SetEmpty().Append(inStr, start, end);
-            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
-            gramSize++;
-            return true;
-        }
-
-        public override void End()
-        {
-            base.End();
-            // set final offset
-            int finalOffset = CorrectOffset(charsRead);
-            this.offsetAtt.SetOffset(finalOffset, finalOffset);
-        }
-
-        public override void Reset()
-        {
-            base.Reset();
-            started = false;
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
deleted file mode 100644
index b806345..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
+++ /dev/null
@@ -1,173 +0,0 @@
-using Lucene.Net.Analysis.TokenAttributes;
-using System;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Old broken version of <see cref="NGramTokenizer"/>.
-    /// <para>
-    /// Only the first 1024 chars of the input are tokenized: they are trimmed and the
-    /// n-grams of the trimmed text are emitted, smallest gram size first and, within one
-    /// size, by increasing start position. Offsets are computed from positions in the
-    /// trimmed text.
-    /// </para>
-    /// </summary>
-    [Obsolete]
-    public sealed class Lucene43NGramTokenizer : Tokenizer
-    {
-        public const int DEFAULT_MIN_NGRAM_SIZE = 1;
-        public const int DEFAULT_MAX_NGRAM_SIZE = 2;
-
-        private int minGram, maxGram;
-        private int gramSize;
-        private int pos;
-        private int inLen; // length of the input AFTER Trim()
-        private int charsRead; // length of the input
-        private string inStr;
-        private bool started;
-
-        private ICharTermAttribute termAtt;
-        private IOffsetAttribute offsetAtt;
-
-        /// <summary>
-        /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
-            : base(input)
-        {
-            Init(minGram, maxGram);
-        }
-
-        /// <summary>
-        /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
-        /// <param name="factory"> <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
-            : base(factory, input)
-        {
-            Init(minGram, maxGram);
-        }
-
-        /// <summary>
-        /// Creates <see cref="Lucene43NGramTokenizer"/> with default min and max n-grams. </summary>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        public Lucene43NGramTokenizer(TextReader input)
-            : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
-        {
-        }
-
-        /// <summary>
-        /// Validates the gram size range, stores it and registers the term and offset attributes. </summary>
-        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
-        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
-        /// <exception cref="System.ArgumentException"> if the range is invalid </exception>
-        private void Init(int minGram, int maxGram)
-        {
-            if (minGram < 1)
-            {
-                throw new System.ArgumentException("minGram must be greater than zero");
-            }
-            if (minGram > maxGram)
-            {
-                throw new System.ArgumentException("minGram must not be greater than maxGram");
-            }
-            this.minGram = minGram;
-            this.maxGram = maxGram;
-            termAtt = AddAttribute<ICharTermAttribute>();
-            offsetAtt = AddAttribute<IOffsetAttribute>();
-        }
-
-        /// <summary>
-        /// Advances to the next n-gram. On the first call after a reset the input is read
-        /// and trimmed before any gram is produced. </summary>
-        /// <returns> <c>true</c> if a token was produced, <c>false</c> at the end of the stream </returns>
-        public override bool IncrementToken()
-        {
-            ClearAttributes();
-            if (!started)
-            {
-                started = true;
-                gramSize = minGram;
-                char[] chars = new char[1024];
-                charsRead = 0;
-                // TODO: refactor to a shared readFully somewhere:
-                while (charsRead < chars.Length)
-                {
-                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
-                    // .NET readers report end of input with 0, Java-style readers with -1;
-                    // checking only for -1 would loop forever on a reader that returns 0
-                    if (inc <= 0)
-                    {
-                        break;
-                    }
-                    charsRead += inc;
-                }
-                inStr = (new string(chars, 0, charsRead)).Trim(); // strip leading and trailing whitespace
-
-                if (charsRead == chars.Length)
-                {
-                    // Read extra throwaway chars so that on End() we
-                    // report the correct offset:
-                    var throwaway = new char[1024];
-                    while (true)
-                    {
-                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
-                        // same end-of-input convention as above (0 or -1)
-                        if (inc <= 0)
-                        {
-                            break;
-                        }
-                        charsRead += inc;
-                    }
-                }
-
-                inLen = inStr.Length;
-                if (inLen == 0)
-                {
-                    return false;
-                }
-            }
-
-            if (pos + gramSize > inLen) // if we hit the end of the string
-            {
-                pos = 0; // reset to beginning of string
-                gramSize++; // increase n-gram size
-                if (gramSize > maxGram) // we are done
-                {
-                    return false;
-                }
-                if (pos + gramSize > inLen)
-                {
-                    return false;
-                }
-            }
-
-            int oldPos = pos;
-            pos++;
-            termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
-            offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
-            return true;
-        }
-
-        /// <summary>
-        /// Ends the stream and sets both offsets to the corrected number of chars read. </summary>
-        public override void End()
-        {
-            base.End();
-            // set final offset
-            int finalOffset = CorrectOffset(charsRead);
-            this.offsetAtt.SetOffset(finalOffset, finalOffset);
-        }
-
-        /// <summary>
-        /// Resets the tokenizer so the next <see cref="IncrementToken"/> call reads the input again
-        /// and starts from position 0. </summary>
-        public override void Reset()
-        {
-            base.Reset();
-            started = false;
-            pos = 0;
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
deleted file mode 100644
index ca1d0bc..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
+++ /dev/null
@@ -1,56 +0,0 @@
-using Lucene.Net.Analysis.Util;
-using System.Collections.Generic;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Factory for <see cref="NGramTokenFilter"/>.
-    /// <code>
-    /// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
-    ///   &lt;analyzer&gt;
-    ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
-    ///     &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/&gt;
-    ///   &lt;/analyzer&gt;
-    /// &lt;/fieldType&gt;</code>
-    /// </summary>
-    public class NGramFilterFactory : TokenFilterFactory
-    {
-        private readonly int maxGramSize;
-        private readonly int minGramSize;
-
-        /// <summary>
-        /// Creates a new <see cref="NGramFilterFactory"/> </summary>
-        /// <param name="args"> configuration entries; "minGramSize" and "maxGramSize" are looked up
-        /// here and fall back to the <see cref="NGramTokenFilter"/> defaults </param>
-        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty after
-        /// the base constructor and both lookups have run (presumably each lookup consumes the
-        /// key it recognizes; confirm against <c>GetInt</c>) </exception>
-        public NGramFilterFactory(IDictionary<string, string> args)
-            : base(args)
-        {
-            minGramSize = GetInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
-            maxGramSize = GetInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
-            if (args.Count > 0)
-            {
-                // Concatenating the dictionary itself would print its type name for a standard
-                // Dictionary, so list the leftover keys explicitly.
-                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
-            }
-        }
-
-        /// <summary>
-        /// Wraps <paramref name="input"/> in a <see cref="NGramTokenFilter"/> configured with the
-        /// gram sizes read in the constructor. </summary>
-        public override TokenStream Create(TokenStream input)
-        {
-            return new NGramTokenFilter(m_luceneMatchVersion, input, minGramSize, maxGramSize);
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
deleted file mode 100644
index f1c82c5..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
+++ /dev/null
@@ -1,252 +0,0 @@
-using Lucene.Net.Analysis.Miscellaneous;
-using Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Tokenizes the input into n-grams of the given size(s).
-    /// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when
-    /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filter:
-    /// <list type="bullet">
-    ///     <item>handles supplementary characters correctly,</item>
-    ///     <item>emits all n-grams for the same token at the same position,</item>
-    ///     <item>does not modify offsets,</item>
-    ///     <item>sorts n-grams by their offset in the original token first, then by
-    ///         increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
-    ///         "c").</item>
-    /// </list>
-    /// </para>
-    /// <para>You can make this filter use the old behavior by providing a version &lt;
-    /// <see cref="LuceneVersion.LUCENE_44"/> in the constructor, but this is not recommended as
-    /// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting
-    /// bugs.
-    /// </para>
-    /// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
-    /// this won't work anymore since this filter doesn't update offsets. You should
-    /// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
-    /// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
-    /// </para>
-    /// </summary>
-    public sealed class NGramTokenFilter : TokenFilter
-    {
-        public const int DEFAULT_MIN_NGRAM_SIZE = 1;
-        public const int DEFAULT_MAX_NGRAM_SIZE = 2;
-
-        private readonly int minGram, maxGram;
-
-        // State captured from the input token that is currently being split into grams.
-        private char[] curTermBuffer;
-        private int curTermLength;
-        private int curCodePointCount;
-        private int curGramSize;
-        private int curPos;
-        private int curPosInc, curPosLen;
-        private int tokStart;
-        private int tokEnd;
-        private bool hasIllegalOffsets; // only if the length changed before this filter
-
-        private readonly LuceneVersion version;
-        private readonly CharacterUtils charUtils;
-        private readonly ICharTermAttribute termAtt;
-        private readonly IPositionIncrementAttribute posIncAtt;
-        private readonly IPositionLengthAttribute posLenAtt;
-        private readonly IOffsetAttribute offsetAtt;
-
-        /// <summary>
-        /// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams. </summary>
-        /// <param name="version"> Lucene version to enable correct position increments.
-        ///                See <see cref="NGramTokenFilter"/> for details. </param>
-        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
-        public NGramTokenFilter(LuceneVersion version, TokenStream input)
-            : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
-        {
-        }
-
-        /// <summary>
-        /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary>
-        /// <param name="version"> Lucene version to enable correct position increments.
-        ///                See <see cref="NGramTokenFilter"/> for details. </param>
-        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
-            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
-        {
-            this.version = version;
-#pragma warning disable 612, 618
-            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);
-#pragma warning restore 612, 618
-            if (isLucene44OrLater)
-            {
-                this.charUtils = CharacterUtils.GetInstance(version);
-            }
-            else
-            {
-                this.charUtils = CharacterUtils.Java4Instance;
-            }
-
-            if (minGram < 1)
-            {
-                throw new System.ArgumentException("minGram must be greater than zero");
-            }
-            if (minGram > maxGram)
-            {
-                throw new System.ArgumentException("minGram must not be greater than maxGram");
-            }
-            this.minGram = minGram;
-            this.maxGram = maxGram;
-
-            if (isLucene44OrLater)
-            {
-                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
-                posLenAtt = AddAttribute<IPositionLengthAttribute>();
-            }
-            else
-            {
-                // Old behavior: stand-alone stubs that are not registered on the stream;
-                // they ignore writes and always read back 0.
-                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
-                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
-            }
-            termAtt = AddAttribute<ICharTermAttribute>();
-            offsetAtt = AddAttribute<IOffsetAttribute>();
-        }
-
-        /// <summary>
-        /// Position increment stub: the setter discards the value, the getter returns 0. </summary>
-        private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute
-        {
-            private readonly NGramTokenFilter outerInstance;
-
-            public PositionIncrementAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
-            {
-                this.outerInstance = outerInstance;
-            }
-
-            public override int PositionIncrement
-            {
-                get
-                {
-                    return 0;
-                }
-                set
-                {
-                    // intentionally ignored
-                }
-            }
-        }
-
-        /// <summary>
-        /// Position length stub: the setter discards the value, the getter returns 0. </summary>
-        private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute
-        {
-            private readonly NGramTokenFilter outerInstance;
-
-            public PositionLengthAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
-            {
-                this.outerInstance = outerInstance;
-            }
-
-            public override int PositionLength
-            {
-                get
-                {
-                    return 0;
-                }
-                set
-                {
-                    // intentionally ignored
-                }
-            }
-        }
-
-        /// <summary>
-        /// Advances to the next n-gram, pulling a new token from the input whenever the
-        /// current one has no grams left.
-        /// </summary>
-        /// <returns> <c>true</c> if a gram was emitted, <c>false</c> once the input is exhausted </returns>
-        public override sealed bool IncrementToken()
-        {
-            for (;;)
-            {
-                if (curTermBuffer == null)
-                {
-                    if (!m_input.IncrementToken())
-                    {
-                        return false;
-                    }
-                    CaptureInputToken();
-                }
-
-#pragma warning disable 612, 618
-                bool codePointMode = version.OnOrAfter(LuceneVersion.LUCENE_44);
-#pragma warning restore 612, 618
-                if (codePointMode)
-                {
-                    // Grams are ordered by start position first, then by length.
-                    bool gramFits = curGramSize <= maxGram && (curPos + curGramSize) <= curCodePointCount;
-                    if (!gramFits)
-                    {
-                        ++curPos;
-                        curGramSize = minGram;
-                    }
-                    if ((curPos + curGramSize) <= curCodePointCount)
-                    {
-                        ClearAttributes();
-                        int gramStart = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
-                        int gramEnd = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, gramStart, curGramSize);
-                        termAtt.CopyBuffer(curTermBuffer, gramStart, gramEnd - gramStart);
-                        // only the first gram of a token carries the token's position increment
-                        posIncAtt.PositionIncrement = curPosInc;
-                        curPosInc = 0;
-                        posLenAtt.PositionLength = curPosLen;
-                        // offsets of the whole input token are kept as they are
-                        offsetAtt.SetOffset(tokStart, tokEnd);
-                        curGramSize++;
-                        return true;
-                    }
-                }
-                else
-                {
-                    // Old ordering: all grams of one size, then the next size.
-                    while (curGramSize <= maxGram)
-                    {
-                        if (curPos + curGramSize <= curTermLength) // there is input left for this size
-                        {
-                            ClearAttributes();
-                            termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
-                            int gramStartOffset = hasIllegalOffsets ? tokStart : tokStart + curPos;
-                            int gramEndOffset = hasIllegalOffsets ? tokEnd : tokStart + curPos + curGramSize;
-                            offsetAtt.SetOffset(gramStartOffset, gramEndOffset);
-                            curPos++;
-                            return true;
-                        }
-                        curGramSize++; // increase n-gram size
-                        curPos = 0;
-                    }
-                }
-
-                // current token is used up; fetch the next one on the following iteration
-                curTermBuffer = null;
-            }
-        }
-
-        /// <summary>
-        /// Copies the term, position and offset data of the token just produced by the
-        /// input into the <c>cur*</c>/<c>tok*</c> fields and restarts gram generation. </summary>
-        private void CaptureInputToken()
-        {
-            curTermBuffer = (char[])termAtt.Buffer.Clone();
-            curTermLength = termAtt.Length;
-            curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
-            curGramSize = minGram;
-            curPos = 0;
-            curPosInc = posIncAtt.PositionIncrement;
-            curPosLen = posLenAtt.PositionLength;
-            tokStart = offsetAtt.StartOffset;
-            tokEnd = offsetAtt.EndOffset;
-            // if length by start + end offsets doesn't match the term text then assume
-            // this is a synonym and don't adjust the offsets.
-            hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
-        }
-
-        /// <summary>
-        /// Resets the base stream and drops the token that was being split. </summary>
-        public override void Reset()
-        {
-            base.Reset();
-            curTermBuffer = null;
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
deleted file mode 100644
index b1845c8..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
+++ /dev/null
@@ -1,319 +0,0 @@
-using Lucene.Net.Analysis.TokenAttributes;
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Support;
-using Lucene.Net.Util;
-using System;
-using System.Diagnostics;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-	 * Licensed to the Apache Software Foundation (ASF) under one or more
-	 * contributor license agreements.  See the NOTICE file distributed with
-	 * this work for additional information regarding copyright ownership.
-	 * The ASF licenses this file to You under the Apache License, Version 2.0
-	 * (the "License"); you may not use this file except in compliance with
-	 * the License.  You may obtain a copy of the License at
-	 *
-	 *     http://www.apache.org/licenses/LICENSE-2.0
-	 *
-	 * Unless required by applicable law or agreed to in writing, software
-	 * distributed under the License is distributed on an "AS IS" BASIS,
-	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	 * See the License for the specific language governing permissions and
-	 * limitations under the License.
-	 */
-
-    /// <summary>
-    /// Tokenizes the input into n-grams of the given size(s).
-    /// <para>In contrast to <see cref="NGramTokenFilter"/>, this class sets offsets so
-    /// that characters between startOffset and endOffset in the original stream are
-    /// the same as the term chars.
-    /// </para>
-    /// <para>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
-    /// <list type="table">
-    ///     <listheader>
-    ///         <term>Term</term>
-    ///         <term>Position increment</term>
-    ///         <term>Position length</term>
-    ///         <term>Offsets</term>
-    ///     </listheader>
-    ///     <item>
-    ///         <term>ab</term>
-    ///         <term>1</term>
-    ///         <term>1</term>
-    ///         <term>[0,2[</term>
-    ///     </item>
-    ///     <item>
-    ///         <term>abc</term>
-    ///         <term>1</term>
-    ///         <term>1</term>
-    ///         <term>[0,3[</term>
-    ///     </item>
-    ///     <item>
-    ///         <term>bc</term>
-    ///         <term>1</term>
-    ///         <term>1</term>
-    ///         <term>[1,3[</term>
-    ///     </item>
-    ///     <item>
-    ///         <term>bcd</term>
-    ///         <term>1</term>
-    ///         <term>1</term>
-    ///         <term>[1,4[</term>
-    ///     </item>
-    ///     <item>
-    ///         <term>cd</term>
-    ///         <term>1</term>
-    ///         <term>1</term>
-    ///         <term>[2,4[</term>
-    ///     </item>
-    ///     <item>
-    ///         <term>cde</term>
-    ///         <term>1</term>
-    ///         <term>1</term>
-    ///         <term>[2,5[</term>
-    ///     </item>
-    ///     <item>
-    ///         <term>de</term>
-    ///         <term>1</term>
-    ///         <term>1</term>
-    ///         <term>[3,5[</term>
-    ///     </item>
-    /// </list>
-    /// </para>
-    /// <para>This tokenizer changed a lot in Lucene 4.4 in order to:
-    /// <list type="bullet">
-    ///     <item>tokenize in a streaming fashion to support streams which are larger
-    ///         than 1024 chars (limit of the previous version),</item>
-    ///     <item>count grams based on unicode code points instead of java chars (and
-    ///         never split in the middle of surrogate pairs),</item>
-    ///     <item>give the ability to pre-tokenize the stream (<see cref="IsTokenChar(int)"/>)
-    ///         before computing n-grams.</item>
-    /// </list>
-    /// </para>
-    /// <para>Additionally, this class doesn't trim trailing whitespace and emits
-    /// tokens in a different order: tokens are now emitted by increasing start
-    /// offsets, whereas they used to be emitted by increasing lengths (which prevented
-    /// support for large input streams).
-    /// </para>
-    /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
-    /// to use the old behavior through <see cref="Lucene43NGramTokenizer"/>.
-    /// </para>
-    /// </summary>
-    // Left unsealed so subclasses can override IsTokenChar; all other overridable members are sealed.
-    public class NGramTokenizer : Tokenizer
-    {
-        public const int DEFAULT_MIN_NGRAM_SIZE = 1;
-        public const int DEFAULT_MAX_NGRAM_SIZE = 2;
-
-        private CharacterUtils charUtils;
-        private CharacterUtils.CharacterBuffer charBuffer;
-        private int[] buffer; // like charBuffer, but converted to code points
-        private int bufferStart, bufferEnd; // remaining slice in buffer
-        private int offset; // advanced by the char count of every consumed code point
-        private int gramSize;
-        private int minGram, maxGram;
-        private bool exhausted;
-        private int lastCheckedChar; // last offset in the buffer that we checked
-        private int lastNonTokenChar; // last offset that we found to not be a token char
-        private bool edgesOnly; // leading edges n-grams only
-
-        private ICharTermAttribute termAtt;
-        private IPositionIncrementAttribute posIncAtt;
-        private IPositionLengthAttribute posLenAtt;
-        private IOffsetAttribute offsetAtt;
-
-        internal NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram, bool edgesOnly)
-            : base(input)
-        {
-            Init(version, minGram, maxGram, edgesOnly);
-        }
-
-        /// <summary>
-        /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
-        /// <param name="version"> the lucene compatibility version </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
-            : this(version, input, minGram, maxGram, false)
-        {
-        }
-
-        internal NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram, bool edgesOnly)
-            : base(factory, input)
-        {
-            Init(version, minGram, maxGram, edgesOnly);
-        }
-
-        /// <summary>
-        /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
-        /// <param name="version"> the lucene compatibility version </param>
-        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        /// <param name="minGram"> the smallest n-gram to generate </param>
-        /// <param name="maxGram"> the largest n-gram to generate </param>
-        public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
-            : this(version, factory, input, minGram, maxGram, false)
-        {
-        }
-
-        /// <summary>
-        /// Creates <see cref="NGramTokenizer"/> with default min and max n-grams. </summary>
-        /// <param name="version"> the lucene compatibility version </param>
-        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
-        public NGramTokenizer(LuceneVersion version, TextReader input)
-            : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
-        {
-        }
-
-        /// <summary>
-        /// Shared constructor logic: rejects versions before 4.4 and invalid gram ranges,
-        /// registers the attributes and allocates the buffers. </summary>
-        /// <exception cref="System.ArgumentException"> if <paramref name="version"/> is older than
-        /// <see cref="LuceneVersion.LUCENE_44"/>, <paramref name="minGram"/> is below 1, or
-        /// <paramref name="minGram"/> exceeds <paramref name="maxGram"/> </exception>
-        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly)
-        {
-#pragma warning disable 612, 618
-            bool isSupportedVersion = version.OnOrAfter(LuceneVersion.LUCENE_44);
-#pragma warning restore 612, 618
-            if (!isSupportedVersion)
-            {
-                throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
-            }
-            charUtils = isSupportedVersion
-                ? CharacterUtils.GetInstance(version)
-                : CharacterUtils.Java4Instance;
-
-            if (minGram < 1)
-            {
-                throw new System.ArgumentException("minGram must be greater than zero");
-            }
-            if (minGram > maxGram)
-            {
-                throw new System.ArgumentException("minGram must not be greater than maxGram");
-            }
-
-            termAtt = AddAttribute<ICharTermAttribute>();
-            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
-            posLenAtt = AddAttribute<IPositionLengthAttribute>();
-            offsetAtt = AddAttribute<IOffsetAttribute>();
-
-            this.minGram = minGram;
-            this.maxGram = maxGram;
-            this.edgesOnly = edgesOnly;
-
-            // 2 * maxGram in case all code points require 2 chars
-            int maxGramChars = 2 * maxGram;
-            // + 1024 for buffering to not keep polling the Reader
-            charBuffer = CharacterUtils.NewCharacterBuffer(maxGramChars + 1024);
-            buffer = new int[charBuffer.Buffer.Length];
-
-            // Make the term att large enough
-            termAtt.ResizeBuffer(maxGramChars);
-        }
-
-        /// <summary>
-        /// Advances to the next n-gram. </summary>
-        /// <returns> <c>true</c> if a gram was emitted, <c>false</c> when no further gram can start </returns>
-        public override sealed bool IncrementToken()
-        {
-            ClearAttributes();
-
-            // termination of this loop is guaranteed by the fact that every iteration
-            // either advances the buffer (calls Consume()) or increases gramSize
-            for (;;)
-            {
-                // compact
-                bool nearBufferEnd = bufferStart >= bufferEnd - maxGram - 1;
-                if (nearBufferEnd && !exhausted)
-                {
-                    CompactAndRefill();
-                }
-
-                // should we go to the next offset?
-                bool gramOutOfRange = gramSize > maxGram || (bufferStart + gramSize) > bufferEnd;
-                if (gramOutOfRange)
-                {
-                    bool noRoomForNextStart = bufferStart + 1 + minGram > bufferEnd;
-                    if (noRoomForNextStart)
-                    {
-                        Debug.Assert(exhausted);
-                        return false;
-                    }
-                    MoveToNextStart();
-                }
-
-                UpdateLastNonTokenChar();
-
-                // emit only if the gram consists of token chars exclusively and, in
-                // edges-only mode, starts right after a non-token char
-                bool gramCoversNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
-                bool notAtLeadingEdge = edgesOnly && lastNonTokenChar != bufferStart - 1;
-                if (!gramCoversNonTokenChar && !notAtLeadingEdge)
-                {
-                    int charCount = charUtils.ToChars(buffer, bufferStart, gramSize, termAtt.Buffer, 0);
-                    termAtt.Length = charCount;
-                    posIncAtt.PositionIncrement = 1;
-                    posLenAtt.PositionLength = 1;
-                    offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + charCount));
-                    ++gramSize;
-                    return true;
-                }
-
-                // otherwise retry from the next start position
-                MoveToNextStart();
-            }
-        }
-
-        /// <summary>
-        /// Moves the unconsumed slice to the front of <c>buffer</c>, shifts the bookkeeping
-        /// indexes accordingly and fills the freed space from the input. </summary>
-        private void CompactAndRefill()
-        {
-            int remaining = bufferEnd - bufferStart;
-            Array.Copy(buffer, bufferStart, buffer, 0, remaining);
-            bufferEnd = remaining;
-            lastCheckedChar -= bufferStart;
-            lastNonTokenChar -= bufferStart;
-            bufferStart = 0;
-
-            // fill in remaining space
-            exhausted = !charUtils.Fill(charBuffer, m_input, buffer.Length - bufferEnd);
-            // convert to code points
-            int converted = charUtils.ToCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
-            bufferEnd += converted;
-        }
-
-        /// <summary>
-        /// Consumes one code point and restarts gram generation at the minimum size. </summary>
-        private void MoveToNextStart()
-        {
-            Consume();
-            gramSize = minGram;
-        }
-
-        /// <summary>
-        /// Scans the not yet checked part of the current gram from its end backwards and
-        /// records the first non-token char found. </summary>
-        private void UpdateLastNonTokenChar()
-        {
-            int termEnd = bufferStart + gramSize - 1;
-            if (termEnd <= lastCheckedChar)
-            {
-                return; // nothing new to inspect
-            }
-
-            int index = termEnd;
-            while (index > lastCheckedChar && IsTokenChar(buffer[index]))
-            {
-                --index;
-            }
-            if (index > lastCheckedChar)
-            {
-                lastNonTokenChar = index;
-            }
-            lastCheckedChar = termEnd;
-        }
-
-        /// <summary>
-        /// Consume one code point. </summary>
-        private void Consume()
-        {
-            int consumedIndex = bufferStart++;
-            offset += Character.CharCount(buffer[consumedIndex]);
-        }
-
-        /// <summary>
-        /// Only collect characters which satisfy this condition. This implementation
-        /// accepts every code point. </summary>
-        protected virtual bool IsTokenChar(int chr)
-        {
-            return true;
-        }
-
-        /// <summary>
-        /// Ends the stream and sets both offsets to the corrected sum of <c>offset</c> and
-        /// the char count of everything still left in the buffer. </summary>
-        public override sealed void End()
-        {
-            base.End();
-            Debug.Assert(bufferStart <= bufferEnd);
-
-            int rawEndOffset = offset;
-            int index = bufferStart;
-            while (index < bufferEnd)
-            {
-                rawEndOffset += Character.CharCount(buffer[index]);
-                ++index;
-            }
-
-            // set final offset
-            int finalOffset = CorrectOffset(rawEndOffset);
-            offsetAtt.SetOffset(finalOffset, finalOffset);
-        }
-
-        /// <summary>
-        /// Resets the base stream and marks the buffer as empty so that the next
-        /// <see cref="IncrementToken"/> call refills it. </summary>
-        public override sealed void Reset()
-        {
-            base.Reset();
-
-            int emptyMark = buffer.Length;
-            bufferEnd = emptyMark;
-            bufferStart = emptyMark;
-
-            int beforeStart = bufferStart - 1;
-            lastCheckedChar = beforeStart;
-            lastNonTokenChar = beforeStart;
-
-            offset = 0;
-            gramSize = minGram;
-            exhausted = false;
-            charBuffer.Reset();
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
deleted file mode 100644
index cf25b65..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
+++ /dev/null
@@ -1,70 +0,0 @@
-using Lucene.Net.Analysis.Util;
-using Lucene.Net.Util;
-using System.Collections.Generic;
-using System.IO;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Factory for <see cref="NGramTokenizer"/>.
-    /// <code>
-    /// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
-    ///   &lt;analyzer&gt;
-    ///     &lt;tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/&gt;
-    ///   &lt;/analyzer&gt;
-    /// &lt;/fieldType&gt;</code>
-    /// </summary>
-    public class NGramTokenizerFactory : TokenizerFactory
-    {
-        private readonly int maxGramSize;
-        private readonly int minGramSize;
-
-        /// <summary>
-        /// Creates a new <see cref="NGramTokenizerFactory"/> </summary>
-        public NGramTokenizerFactory(IDictionary<string, string> args)
-            : base(args)
-        {
-            minGramSize = GetInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
-            maxGramSize = GetInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
-            if (args.Count > 0)
-            {
-                throw new System.ArgumentException("Unknown parameters: " + args);
-            }
-        }
-
-        /// <summary>
-        /// Creates the <see cref="TokenStream"/> of n-grams from the given <see cref="TextReader"/> and <see cref="AttributeSource.AttributeFactory"/>. </summary>
-        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
-        {
-#pragma warning disable 612, 618
-            if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
-#pragma warning restore 612, 618
-            {
-                return new NGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize);
-            }
-            else
-            {
-#pragma warning disable 612, 618
-                return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
-#pragma warning restore 612, 618
-            }
-        }
-    }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs
new file mode 100644
index 0000000..ea6fbd7
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs
@@ -0,0 +1,390 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.Shingle;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+    /// <summary>
+    /// Tests <see cref="EdgeNGramTokenFilter"/> for correctness.
+    /// </summary>
+    public class EdgeNGramTokenFilterTest : BaseTokenStreamTestCase
+    {
+        private TokenStream input;
+
+        public override void SetUp() // rebuilds the shared fixture stream that most tests below consume
+        {
+            base.SetUp();
+            input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false); // default input text is "abcde"
+        }
+
+        [Test]
+        public virtual void TestInvalidInput() // gram sizes (0, 0) must be rejected by the constructor
+        {
+            bool gotException = false;
+            try
+            {
+#pragma warning disable 612, 618
+                new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 0, 0); // result is discarded; only the throw matters
+#pragma warning restore 612, 618
+            }
+            catch (System.ArgumentException) // the only exception type this test accepts
+            {
+                gotException = true;
+            }
+            assertTrue(gotException); // fails if construction completed without throwing
+        }
+
+        [Test]
+        public virtual void TestInvalidInput2() // gram sizes (2, 1), i.e. first larger than second, must be rejected
+        {
+            bool gotException = false;
+            try
+            {
+#pragma warning disable 612, 618
+                new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 2, 1); // result is discarded; only the throw matters
+#pragma warning restore 612, 618
+            }
+            catch (System.ArgumentException) // the only exception type this test accepts
+            {
+                gotException = true;
+            }
+            assertTrue(gotException); // fails if construction completed without throwing
+        }
+
+        [Test]
+        public virtual void TestInvalidInput3() // a negative gram size (-1, 2) must be rejected
+        {
+            bool gotException = false;
+            try
+            {
+#pragma warning disable 612, 618
+                new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, -1, 2); // result is discarded; only the throw matters
+#pragma warning restore 612, 618
+            }
+            catch (System.ArgumentException) // the only exception type this test accepts
+            {
+                gotException = true;
+            }
+            assertTrue(gotException); // fails if construction completed without throwing
+        }
+
+        [Test]
+        public virtual void TestFrontUnigram() // Side.FRONT with gram sizes (1, 1) over the fixture text "abcde"
+        {
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 5 }); // expects only "a"; the int arrays are presumably start/end offsets - confirm in the base class
+        }
+
+        [Test]
+        public virtual void TestBackUnigram() // Side.BACK with gram sizes (1, 1); note this test pins LuceneVersion.LUCENE_43 rather than TEST_VERSION_CURRENT
+        {
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 }); // expects only "e"; the int arrays are presumably start/end offsets - confirm in the base class
+        }
+
+        [Test]
+        public virtual void TestOversizedNgrams() // gram sizes (6, 6) exceed the 5-character fixture text "abcde"
+        {
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0]); // all expectation arrays are empty: no token may be emitted
+        }
+
+        [Test]
+        public virtual void TestFrontRangeOfNgrams() // Side.FRONT with gram sizes (1, 3) over the fixture text "abcde"
+        {
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 }); // one prefix per size 1..3; every gram is expected with the same (0, 5) pair
+        }
+
+        [Test]
+        public virtual void TestBackRangeOfNgrams() // Side.BACK with gram sizes (1, 3); pinned to LuceneVersion.LUCENE_43
+        {
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(tokenizer, new string[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, null, null, null, null, false); // one suffix per size 1..3; meaning of the trailing nulls/false is defined by the base class (not shown here)
+        }
+
+        [Test]
+        public virtual void TestFilterPositions() // two whitespace-separated words, front grams of sizes (1, 3) for each
+        {
+            TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false); // local stream; the shared fixture "input" is not used here
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "v", "vw", "vwx" }, new int[] { 0, 0, 0, 6, 6, 6 }, new int[] { 5, 5, 5, 11, 11, 11 }, null, new int[] { 1, 0, 0, 1, 0, 0 }, null, null, false); // { 1, 0, 0, 1, 0, 0 } is presumably the expected position increments - confirm in the base class
+        }
+
+        private class PositionFilter : TokenFilter // test helper: leaves the first token untouched and sets PositionIncrement to 0 on every later token
+        {
+
+            internal readonly IPositionIncrementAttribute posIncrAtt; // attribute overwritten for every token after the first
+            internal bool started; // true once the first token of the current stream has passed through
+
+            internal PositionFilter(TokenStream input) : base(input)
+            {
+                posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+            }
+
+            public override sealed bool IncrementToken() // returns whatever the wrapped stream returned
+            {
+                if (m_input.IncrementToken())
+                {
+                    if (started)
+                    {
+                        posIncrAtt.PositionIncrement = 0; // second and later tokens: force the increment to 0
+                    }
+                    else
+                    {
+                        started = true; // first token: keep its increment as produced upstream
+                    }
+                    return true;
+                }
+                else
+                {
+                    return false; // wrapped stream is exhausted
+                }
+            }
+
+            public override void Reset()
+            {
+                base.Reset();
+                started = false; // the next token seen after a reset counts as "first" again
+            }
+        }
+
+        [Test]
+        public virtual void TestFirstTokenPositionIncrement() // the first token the filter emits must carry increment 1 even when its source token had 0
+        {
+            TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
+            ts = new PositionFilter(ts); // All but first token will get 0 position increment
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
+#pragma warning restore 612, 618
+            // The first token "a" will not be output, since it is shorter than the minimum gram size of 2.
+            // The second token on input to EdgeNGramTokenFilter will have a position increment of 0,
+            // which should be increased to 1, since this is the first output token in the stream.
+            AssertTokenStreamContents(filter, new string[] { "ab", "abc" }, new int[] { 2, 2 }, new int[] { 5, 5 }, new int[] { 1, 0 });
+        }
+
+        [Test]
+        public virtual void TestSmallTokenInStream() // a token shorter than the gram size sits between two that are long enough
+        {
+            input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false); // replaces the fixture stream created in SetUp
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(tokenizer, new string[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 }); // the 2-character token "de" is expected to yield nothing
+        }
+
+        [Test]
+        public virtual void TestReset() // the same filter instance must give identical output after its tokenizer is handed a new reader
+        {
+            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde")); // uses WhitespaceTokenizer here, not the MockTokenizer fixture
+#pragma warning disable 612, 618
+            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
+#pragma warning restore 612, 618
+            AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 }); // first pass
+            tokenizer.SetReader(new StringReader("abcde")); // feed the same text again through the existing chain
+            AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 }); // second pass must match the first
+        }
+
+        // LUCENE-3642
+        // EdgeNGram blindly adds the term length to the offset, but this can take things out of bounds
+        // with respect to the original text if a previous filter increases the length of the word (in this case æ -> ae),
+        // so in this case we behave like WDF and preserve any modified offsets.
+        [Test]
+        public virtual void TestInvalidOffsets()
+        {
+            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this); // ASCII folding followed by front grams of sizes (2, 15)
+            AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 }); // every gram keeps the (0, 11) pair of the 11-character input, even the 12-character folded term
+        }
+
+        private class AnalyzerAnonymousInnerClassHelper : Analyzer // analyzer used by TestInvalidOffsets
+        {
+            private readonly EdgeNGramTokenFilterTest outerInstance; // stored but never read inside this class
+
+            public AnalyzerAnonymousInnerClassHelper(EdgeNGramTokenFilterTest outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) // fieldName is not used
+            {
+                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+                TokenFilter filters = new ASCIIFoldingFilter(tokenizer); // runs before the n-gram filter, so grams are built from the folded term
+#pragma warning disable 612, 618
+                filters = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
+#pragma warning restore 612, 618
+                return new TokenStreamComponents(tokenizer, filters);
+            }
+        }
+
+        /// <summary>
+        /// Blasts random strings through analyzers built with random gram sizes and through a LUCENE_43 back-edge analyzer. </summary>
+        [Test]
+        public virtual void TestRandomStrings()
+        {
+            for (int i = 0; i < 10; i++) // ten rounds, each with freshly drawn gram sizes
+            {
+                int min = TestUtil.NextInt(Random(), 2, 10);
+                int max = TestUtil.NextInt(Random(), min, 20); // lower bound is min, so max is never drawn below it
+
+                Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, min, max);
+                CheckRandomData(Random(), a, 100 * RANDOM_MULTIPLIER);
+            }
+
+            Analyzer b = new AnalyzerAnonymousInnerClassHelper3(this); // Side.BACK, gram sizes (2, 4), LuceneVersion.LUCENE_43
+            CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER, 20, false, false); // meaning of the extra 20/false/false arguments is defined by the base class (not shown here)
+        }
+
+        private class AnalyzerAnonymousInnerClassHelper2 : Analyzer // analyzer used by the random-size loop in TestRandomStrings
+        {
+            private readonly EdgeNGramTokenFilterTest outerInstance; // stored but never read inside this class
+
+            private int min; // first gram-size argument passed to the filter
+            private int max; // second gram-size argument passed to the filter
+
+            public AnalyzerAnonymousInnerClassHelper2(EdgeNGramTokenFilterTest outerInstance, int min, int max)
+            {
+                this.outerInstance = outerInstance;
+                this.min = min;
+                this.max = max;
+            }
+
+            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) // fieldName is not used
+            {
+                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max)); // constructor overload without a Side argument
+            }
+        }
+
+        private class AnalyzerAnonymousInnerClassHelper3 : Analyzer // analyzer used for the final check in TestRandomStrings
+        {
+            private readonly EdgeNGramTokenFilterTest outerInstance; // stored but never read inside this class
+
+            public AnalyzerAnonymousInnerClassHelper3(EdgeNGramTokenFilterTest outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) // fieldName is not used
+            {
+                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+#pragma warning disable 612, 618
+                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 4)); // back-edge grams of sizes (2, 4), pinned to LUCENE_43
+#pragma warning restore 612, 618 
+            }
+        }
+
+        [Test]
+        public virtual void TestEmptyTerm() // feeds the empty string to a front-edge and a back-edge analyzer
+        {
+            Random random = Random();
+            Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this); // KeywordTokenizer + Side.FRONT, gram sizes (2, 15)
+            CheckAnalysisConsistency(random, a, random.nextBoolean(), ""); // third argument is a random flag whose meaning is defined by the base class (not shown here)
+
+            Analyzer b = new AnalyzerAnonymousInnerClassHelper5(this); // KeywordTokenizer + Side.BACK, gram sizes (2, 15), LUCENE_43
+            CheckAnalysisConsistency(random, b, random.nextBoolean(), "");
+        }
+
+        private class AnalyzerAnonymousInnerClassHelper4 : Analyzer // front-edge analyzer used by TestEmptyTerm
+        {
+            private readonly EdgeNGramTokenFilterTest outerInstance; // stored but never read inside this class
+
+            public AnalyzerAnonymousInnerClassHelper4(EdgeNGramTokenFilterTest outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) // fieldName is not used
+            {
+                Tokenizer tokenizer = new KeywordTokenizer(reader);
+#pragma warning disable 612, 618
+                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15)); // front-edge grams of sizes (2, 15)
+#pragma warning restore 612, 618
+            }
+        }
+
+        private class AnalyzerAnonymousInnerClassHelper5 : Analyzer // back-edge analyzer used by TestEmptyTerm
+        {
+            private readonly EdgeNGramTokenFilterTest outerInstance; // stored but never read inside this class
+
+            public AnalyzerAnonymousInnerClassHelper5(EdgeNGramTokenFilterTest outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) // fieldName is not used
+            {
+                Tokenizer tokenizer = new KeywordTokenizer(reader);
+#pragma warning disable 612, 618
+                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15)); // back-edge grams of sizes (2, 15), pinned to LUCENE_43
+#pragma warning restore 612, 618
+            }
+        }
+
+        [Test]
+        public virtual void TestGraphs() // runs the filter over the output of a ShingleFilter instead of a plain tokenizer
+        {
+            TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
+            tk = new ShingleFilter(tk); // default-configured shingles; the expected terms below contain two-word shingles
+            tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10); // gram sizes (7, 10); constructor overload without a Side argument
+            AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23); // 23 equals the length of the input text; roles of the int arrays are defined by the base class (not shown here)
+        }
+
+        [Test]
+        public virtual void TestSupplementaryCharacters() // gram length is checked in code points, not in UTF-16 chars
+        {
+            string s = TestUtil.RandomUnicodeString(Random(), 10);
+            int codePointCount = s.CodePointCount(0, s.Length); // may be smaller than s.Length when s contains surrogate pairs
+            int minGram = TestUtil.NextInt(Random(), 1, 3);
+            int maxGram = TestUtil.NextInt(Random(), minGram, 10);
+            TokenStream tk = new KeywordTokenizer(new StringReader(s));
+            tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
+            ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
+            IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
+            tk.Reset();
+            for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i) // one expected token per gram size that fits in s
+            {
+                assertTrue(tk.IncrementToken());
+                assertEquals(0, offsetAtt.StartOffset);
+                assertEquals(s.Length, offsetAtt.EndOffset); // offsets cover the whole input, not just the gram
+                int end = Character.OffsetByCodePoints(s, 0, i); // char index just past the first i code points
+                assertEquals(s.Substring(0, end), termAtt.ToString()); // term must be exactly the first i code points of s
+            }
+            assertFalse(tk.IncrementToken()); // no further tokens once all fitting gram sizes are consumed
+        }
+    }
+}
\ No newline at end of file


Mime
View raw message