lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nightowl...@apache.org
Subject [39/39] lucenenet git commit: Lucene.Net.Analysis.Ngram - renamed NGram in Git
Date Sat, 04 Feb 2017 20:32:58 GMT
Lucene.Net.Analysis.Ngram - renamed NGram in Git


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/ab81d913
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/ab81d913
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/ab81d913

Branch: refs/heads/api-work
Commit: ab81d91313149500e6c88b4ceabd6ff5aa4e0d63
Parents: 3201465
Author: Shad Storhaug <shad@shadstorhaug.com>
Authored: Sun Feb 5 03:17:39 2017 +0700
Committer: Shad Storhaug <shad@shadstorhaug.com>
Committed: Sun Feb 5 03:29:11 2017 +0700

----------------------------------------------------------------------
 .../Analysis/NGram/EdgeNGramFilterFactory.cs    |  60 +++
 .../Analysis/NGram/EdgeNGramTokenFilter.cs      | 245 ++++++++++++
 .../Analysis/NGram/EdgeNGramTokenizer.cs        |  72 ++++
 .../Analysis/NGram/EdgeNGramTokenizerFactory.cs |  75 ++++
 .../NGram/Lucene43EdgeNGramTokenizer.cs         | 297 ++++++++++++++
 .../Analysis/NGram/Lucene43NGramTokenizer.cs    | 173 ++++++++
 .../Analysis/NGram/NGramFilterFactory.cs        |  56 +++
 .../Analysis/NGram/NGramTokenFilter.cs          | 252 ++++++++++++
 .../Analysis/NGram/NGramTokenizer.cs            | 319 +++++++++++++++
 .../Analysis/NGram/NGramTokenizerFactory.cs     |  70 ++++
 .../Analysis/Ngram/EdgeNGramFilterFactory.cs    |  60 ---
 .../Analysis/Ngram/EdgeNGramTokenFilter.cs      | 245 ------------
 .../Analysis/Ngram/EdgeNGramTokenizer.cs        |  72 ----
 .../Analysis/Ngram/EdgeNGramTokenizerFactory.cs |  75 ----
 .../Ngram/Lucene43EdgeNGramTokenizer.cs         | 297 --------------
 .../Analysis/Ngram/Lucene43NGramTokenizer.cs    | 173 --------
 .../Analysis/Ngram/NGramFilterFactory.cs        |  56 ---
 .../Analysis/Ngram/NGramTokenFilter.cs          | 252 ------------
 .../Analysis/Ngram/NGramTokenizer.cs            | 319 ---------------
 .../Analysis/Ngram/NGramTokenizerFactory.cs     |  70 ----
 .../Analysis/NGram/EdgeNGramTokenFilterTest.cs  | 390 +++++++++++++++++++
 .../Analysis/NGram/EdgeNGramTokenizerTest.cs    | 278 +++++++++++++
 .../Analysis/NGram/NGramTokenFilterTest.cs      | 249 ++++++++++++
 .../Analysis/NGram/NGramTokenizerTest.cs        | 303 ++++++++++++++
 .../Analysis/NGram/TestNGramFilters.cs          | 196 ++++++++++
 .../Analysis/Ngram/EdgeNGramTokenFilterTest.cs  | 390 -------------------
 .../Analysis/Ngram/EdgeNGramTokenizerTest.cs    | 278 -------------
 .../Analysis/Ngram/NGramTokenFilterTest.cs      | 249 ------------
 .../Analysis/Ngram/NGramTokenizerTest.cs        | 303 --------------
 .../Analysis/Ngram/TestNGramFilters.cs          | 196 ----------
 30 files changed, 3035 insertions(+), 3035 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs
new file mode 100644
index 0000000..70b44d3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramFilterFactory.cs
@@ -0,0 +1,60 @@
+using Lucene.Net.Analysis.Util;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="EdgeNGramTokenFilter"/>.
+    /// <code>
+    /// &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</code>
+    /// </summary>
+    public class EdgeNGramFilterFactory : TokenFilterFactory
+    {
+        private readonly int maxGramSize;
+        private readonly int minGramSize;
+        private readonly string side;
+
+        /// <summary>
+        /// Creates a new <see cref="EdgeNGramFilterFactory"/>.
+        /// </summary>
+        /// <param name="args">
+        /// Configuration arguments. The keys <c>minGramSize</c>, <c>maxGramSize</c> and <c>side</c>
+        /// are read here, falling back to <see cref="EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE"/>,
+        /// <see cref="EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE"/> and <c>FRONT</c> respectively.
+        /// </param>
+        /// <exception cref="System.ArgumentException">
+        /// <paramref name="args"/> still contains entries after the known keys have been read.
+        /// </exception>
+        public EdgeNGramFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
+            maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+            side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
+
+            // NOTE(review): this check presumes the base constructor and the Get/GetInt helpers
+            // remove every key they recognize, so anything left over is unsupported - confirm.
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name in .NET,
+                // so list the leftover entries explicitly (each one renders as "[key, value]").
+                throw new System.ArgumentException("Unknown parameters: " + string.Join<KeyValuePair<string, string>>(", ", args));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in an <see cref="EdgeNGramTokenFilter"/> configured with
+        /// this factory's match version, side and gram-size range.
+        /// </summary>
+        /// <param name="input"> the <see cref="TokenStream"/> to wrap </param>
+        /// <returns> the new <see cref="EdgeNGramTokenFilter"/> </returns>
+        public override TokenStream Create(TokenStream input)
+        {
+            // The string-side constructor is marked [Obsolete]; silence the obsolete warnings for this call only.
+#pragma warning disable 612, 618
+            return new EdgeNGramTokenFilter(m_luceneMatchVersion, input, side, minGramSize, maxGramSize);
+#pragma warning restore 612, 618
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs
new file mode 100644
index 0000000..8cf8172
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilter.cs
@@ -0,0 +1,245 @@
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+    /// <summary>
+    /// Tokenizes the given token into n-grams of given size(s).
+    /// <para>
+    /// This <see cref="TokenFilter"/> creates n-grams from the beginning edge or ending edge of an input token.
+    /// </para>
+    /// <para>As of Lucene 4.4, this filter does not support
+    /// <see cref="Side.BACK"/> (you can use <see cref="Reverse.ReverseStringFilter"/> up-front and
+    /// afterward to get the same behavior), handles supplementary characters
+    /// correctly and does not update offsets anymore.
+    /// </para>
+    /// </summary>
+    public sealed class EdgeNGramTokenFilter : TokenFilter
+    {
+        /// <summary>Side used when none is specified.</summary>
+        public const Side DEFAULT_SIDE = Side.FRONT;
+        /// <summary>Default largest n-gram size.</summary>
+        public const int DEFAULT_MAX_GRAM_SIZE = 1;
+        /// <summary>Default smallest n-gram size.</summary>
+        public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+        /// <summary>
+        /// Specifies which side of the input the n-gram should be generated from </summary>
+        public enum Side
+        {
+            /// <summary>
+            /// Get the n-gram from the front of the input </summary>
+            FRONT,
+
+            /// <summary>
+            /// Get the n-gram from the end of the input </summary>
+            [System.Obsolete]
+            BACK,
+        }
+
+        /// <summary>
+        /// Gets the <see cref="Side"/> that matches a string.
+        /// </summary>
+        /// <param name="sideName"> name of the side; matched case-insensitively </param>
+        /// <returns>
+        /// the parsed <see cref="Side"/>, or <see cref="Side.FRONT"/> when
+        /// <paramref name="sideName"/> cannot be parsed
+        /// </returns>
+        public static Side GetSide(string sideName)
+        {
+            Side result;
+            if (!Enum.TryParse(sideName, true, out result))
+            {
+                // Unparseable names silently fall back to the front edge.
+                result = Side.FRONT;
+            }
+            return result;
+        }
+
+        private readonly LuceneVersion version;
+        private readonly CharacterUtils charUtils;
+        private readonly int minGram;
+        private readonly int maxGram;
+        private Side side;
+        // Snapshot of the input token currently being expanded; null means "fetch the next input token".
+        private char[] curTermBuffer;
+        private int curTermLength;
+        private int curCodePointCount;
+        // Size (in code points) of the next gram to emit for the current input token.
+        private int curGramSize;
+        // Start and end offsets of the current input token.
+        private int tokStart;
+        private int tokEnd; // emitted as the end offset whenever updateOffsets is false
+        // True only for match versions before 4.4 when (start offset + term length) equals the end offset.
+        private bool updateOffsets;
+        // Position increment still owed to the next emitted gram; accumulates across input tokens that yield no gram.
+        private int savePosIncr;
+        private int savePosLen;
+
+        private readonly ICharTermAttribute termAtt;
+        private readonly IOffsetAttribute offsetAtt;
+        private readonly IPositionIncrementAttribute posIncrAtt;
+        private readonly IPositionLengthAttribute posLenAtt;
+
+        /// <summary>
+        /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
+        /// <param name="minGram"> the smallest n-gram to generate </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        /// <exception cref="System.ArgumentException">
+        /// <paramref name="side"/> is <see cref="Side.BACK"/> while <paramref name="version"/> is on or after
+        /// <see cref="LuceneVersion.LUCENE_44"/>, <paramref name="side"/> is not a defined <see cref="Side"/> value,
+        /// <paramref name="minGram"/> is less than 1, or <paramref name="minGram"/> is greater than <paramref name="maxGram"/>.
+        /// </exception>
+        [Obsolete]
+        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
+              : base(input)
+        {
+
+            // NOTE(review): null check left disabled by the port - presumably LuceneVersion cannot be null here; confirm.
+            //if (version == null)
+            //{
+            //    throw new System.ArgumentException("version must not be null");
+            //}
+
+            if (version.OnOrAfter(LuceneVersion.LUCENE_44) && side == Side.BACK)
+            {
+                throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
+            }
+
+            // Rejects numeric values that do not correspond to a declared Side member.
+            if (!Enum.IsDefined(typeof(Side), side))
+            {
+                throw new System.ArgumentException("sideLabel must be either front or back");
+            }
+
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+
+            this.version = version;
+            // Match versions before 4.4 use the Java4Instance character utilities.
+            this.charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            this.side = side;
+
+            this.termAtt = AddAttribute<ICharTermAttribute>();
+            this.offsetAtt = AddAttribute<IOffsetAttribute>();
+            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
+        }
+
+        /// <summary>
+        /// Creates <see cref="EdgeNGramTokenFilter"/> that can generate n-grams in the sizes of the given range
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram;
+        /// resolved through <see cref="GetSide(string)"/> </param>
+        /// <param name="minGram"> the smallest n-gram to generate </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        [Obsolete]
+        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, string sideLabel, int minGram, int maxGram)
+              : this(version, input, GetSide(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /// <summary>
+        /// Creates <see cref="EdgeNGramTokenFilter"/> that generates n-grams in the sizes of the given range
+        /// from the front edge of each token.
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
+#pragma warning disable 612, 618
+              : this(version, input, Side.FRONT, minGram, maxGram)
+#pragma warning restore 612, 618
+        {
+        }
+
+        /// <summary>
+        /// Emits the next edge n-gram. Grams for one input token are produced in increasing size from
+        /// <c>minGram</c> up to <c>maxGram</c> (or the token's code point count, whichever is smaller),
+        /// then the next input token is fetched.
+        /// </summary>
+        /// <returns> <c>true</c> if a gram was emitted; <c>false</c> once the input stream is exhausted </returns>
+        public override sealed bool IncrementToken()
+        {
+            while (true)
+            {
+                if (curTermBuffer == null)
+                {
+                    // No token in progress: pull the next one from the wrapped stream.
+                    if (!m_input.IncrementToken())
+                    {
+                        return false;
+                    }
+                    else
+                    {
+                        // Snapshot the token state, because the attributes are overwritten for every emitted gram.
+                        curTermBuffer = (char[])termAtt.Buffer.Clone();
+                        curTermLength = termAtt.Length;
+                        curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
+                        curGramSize = minGram;
+                        tokStart = offsetAtt.StartOffset;
+                        tokEnd = offsetAtt.EndOffset;
+#pragma warning disable 612, 618
+                        if (version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+                        {
+                            // Never update offsets
+                            updateOffsets = false;
+                        }
+                        else
+                        {
+                            // if length by start + end offsets doesn't match the term text then assume
+                            // this is a synonym and don't adjust the offsets.
+                            updateOffsets = (tokStart + curTermLength) == tokEnd;
+                        }
+                        // Accumulate (+=) so increments of tokens that produced no gram carry over to the next emitted gram.
+                        savePosIncr += posIncrAtt.PositionIncrement;
+                        savePosLen = posLenAtt.PositionLength;
+                    }
+                }
+                if (curGramSize <= maxGram) // if we have hit the end of our n-gram size range, quit
+                {
+                    if (curGramSize <= curCodePointCount) // if the remaining input is too short, we can't generate any n-grams
+                    {
+                        // grab gramSize chars from front or back
+                        int start = side == Side.FRONT ? 0 : charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
+                        int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+                        ClearAttributes();
+                        if (updateOffsets)
+                        {
+                            // Offsets narrowed to the gram itself.
+                            offsetAtt.SetOffset(tokStart + start, tokStart + end);
+                        }
+                        else
+                        {
+                            // Gram keeps the offsets of the whole input token.
+                            offsetAtt.SetOffset(tokStart, tokEnd);
+                        }
+                        // first ngram gets increment, others don't
+                        if (curGramSize == minGram)
+                        {
+                            posIncrAtt.PositionIncrement = savePosIncr;
+                            savePosIncr = 0;
+                        }
+                        else
+                        {
+                            posIncrAtt.PositionIncrement = 0;
+                        }
+                        posLenAtt.PositionLength = savePosLen;
+                        termAtt.CopyBuffer(curTermBuffer, start, end - start);
+                        curGramSize++;
+                        return true;
+                    }
+                }
+                // Current token is exhausted (or too short): drop it and loop to fetch the next one.
+                curTermBuffer = null;
+            }
+        }
+
+        /// <summary>
+        /// Resets the base stream state, discards any token in progress and clears the carried-over position increment.
+        /// </summary>
+        public override void Reset()
+        {
+            base.Reset();
+            curTermBuffer = null;
+            savePosIncr = 0;
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs
new file mode 100644
index 0000000..ed2cb3d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizer.cs
@@ -0,0 +1,72 @@
+using Lucene.Net.Util;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tokenizes the input from an edge into n-grams of given size(s).
+    /// <para>
+    /// This <see cref="Tokenizer"/> creates n-grams from the beginning edge or ending edge of an input token.
+    /// </para>
+    /// <para>As of Lucene 4.4, this tokenizer
+    /// <list type="bullet">
+    ///     <item>can handle <c>maxGram</c> larger than 1024 chars, but beware that this will result in increased memory usage,</item>
+    ///     <item>doesn't trim the input,</item>
+    ///     <item>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones,</item>
+    ///     <item>doesn't support backward n-grams anymore,</item>
+    ///     <item>supports <see cref="Util.CharTokenizer.IsTokenChar(int)"/> pre-tokenization,</item>
+    ///     <item>correctly handles supplementary characters.</item>
+    /// </list>
+    /// </para>
+    /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
+    /// to use the old behavior through <see cref="Lucene43EdgeNGramTokenizer"/>.
+    /// </para>
+    /// </summary>
+    public class EdgeNGramTokenizer : NGramTokenizer
+    {
+        /// <summary>Default largest n-gram size.</summary>
+        public const int DEFAULT_MAX_GRAM_SIZE = 1;
+        /// <summary>Default smallest n-gram size.</summary>
+        public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+        /// <summary>
+        /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
+            // The trailing 'true' is forwarded to the NGramTokenizer base constructor;
+            // NOTE(review): presumably its "edges only" switch - confirm against NGramTokenizer.
+            : base(version, input, minGram, maxGram, true)
+        {
+        }
+
+        /// <summary>
+        /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
+            // Same as the overload without a factory, with 'factory' passed through to the base constructor.
+            : base(version, factory, input, minGram, maxGram, true)
+        {
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs
new file mode 100644
index 0000000..00325f5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/EdgeNGramTokenizerFactory.cs
@@ -0,0 +1,75 @@
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+    /// <summary>
+    /// Factory for <see cref="EdgeNGramTokenizer"/>.
+    /// <code>
+    /// &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</code>
+    /// </summary>
+    public class EdgeNGramTokenizerFactory : TokenizerFactory
+    {
+        private readonly int maxGramSize;
+        private readonly int minGramSize;
+        private readonly string side;
+
+        /// <summary>
+        /// Creates a new <see cref="EdgeNGramTokenizerFactory"/>.
+        /// </summary>
+        /// <param name="args">
+        /// Configuration arguments. The keys <c>minGramSize</c>, <c>maxGramSize</c> and <c>side</c>
+        /// are read here, falling back to <see cref="EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE"/>,
+        /// <see cref="EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE"/> and <c>FRONT</c> respectively.
+        /// </param>
+        /// <exception cref="System.ArgumentException">
+        /// <paramref name="args"/> still contains entries after the known keys have been read.
+        /// </exception>
+        public EdgeNGramTokenizerFactory(IDictionary<string, string> args) : base(args)
+        {
+            minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
+            maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+            side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
+
+            // NOTE(review): this check presumes the base constructor and the Get/GetInt helpers
+            // remove every key they recognize, so anything left over is unsupported - confirm.
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name in .NET,
+                // so list the leftover entries explicitly (each one renders as "[key, value]").
+                throw new System.ArgumentException("Unknown parameters: " + string.Join<KeyValuePair<string, string>>(", ", args));
+            }
+        }
+
+        /// <summary>
+        /// Creates the tokenizer that matches the configured Lucene match version:
+        /// <see cref="EdgeNGramTokenizer"/> for 4.4 and later, otherwise the legacy
+        /// <see cref="Lucene43EdgeNGramTokenizer"/>.
+        /// </summary>
+        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <returns> the new <see cref="Tokenizer"/> </returns>
+        /// <exception cref="System.ArgumentException">
+        /// The match version is 4.4 or later and the configured side is anything other than the front edge.
+        /// </exception>
+        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+        {
+#pragma warning disable 612, 618
+            if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+            {
+                // Only the front edge is supported from 4.4 on. A successful parse is not enough:
+                // "BACK" parses fine, so the parsed value itself has to be checked as well.
+                EdgeNGramTokenFilter.Side sideEnum;
+                if (!Enum.TryParse(this.side, true, out sideEnum) || sideEnum != EdgeNGramTokenFilter.Side.FRONT)
+                {
+                    throw new System.ArgumentException(typeof(EdgeNGramTokenizer).Name + " does not support backward n-grams as of Lucene 4.4");
+                }
+                // Honor the caller-supplied attribute factory instead of silently dropping it.
+                return new EdgeNGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize);
+            }
+            else
+            {
+                // NOTE(review): the legacy tokenizer is still built without 'factory'; whether it offers
+                // a factory-accepting constructor is not visible from here - confirm and forward it if so.
+#pragma warning disable 612, 618
+                return new Lucene43EdgeNGramTokenizer(m_luceneMatchVersion, input, side, minGramSize, maxGramSize);
+#pragma warning restore 612, 618
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs
new file mode 100644
index 0000000..4dadbed
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43EdgeNGramTokenizer.cs
@@ -0,0 +1,297 @@
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Util;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Old version of <see cref="EdgeNGramTokenizer"/> which doesn't handle
+    /// supplementary characters correctly.
+    /// </summary>
+    [Obsolete]
+    public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
+    {
+        public const Side DEFAULT_SIDE = Side.FRONT;
+        public const int DEFAULT_MAX_GRAM_SIZE = 1;
+        public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+        private ICharTermAttribute termAtt;
+        private IOffsetAttribute offsetAtt;
+        private IPositionIncrementAttribute posIncrAtt;
+
+        /// <summary>
+        /// Specifies which side of the input the n-gram should be generated from </summary>
+        public enum Side
+        {
+            /// <summary>
+            /// Get the n-gram from the front of the input </summary>
+            FRONT,
+
+            /// <summary>
+            /// Get the n-gram from the end of the input </summary>
+            BACK,
+        }
+
+        // Parses sideName (ignoring case) into a Side; when parsing fails the result falls back to Side.FRONT
+        public static Side GetSide(string sideName)
+        {
+            Side result;
+            if (!Enum.TryParse(sideName, true, out result)) // second argument 'true' = ignore case
+            {
+                result = Side.FRONT; // unrecognized name: default to the front side
+            }
+            return result;
+        }
+
+        private int minGram;
+        private int maxGram;
+        private int gramSize;
+        private Side side;
+        private bool started;
+        private int inLen; // length of the input AFTER trim()
+        private int charsRead; // length of the input
+        private string inStr;
+
+
+        /// <summary>
+        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that generates n-grams of each size in the given range from the given <see cref="Side"/> of the input
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/>; from 4.4 on, <see cref="Side.BACK"/> is rejected </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        [Obsolete]
+        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
+            : base(input)
+        {
+            Init(version, side, minGram, maxGram); // validates the arguments, stores them and registers the attributes
+        }
+
+        /// <summary>
+        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that generates n-grams of each size in the given range from the given <see cref="Side"/> of the input, using a caller-supplied attribute factory
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/>; from 4.4 on, <see cref="Side.BACK"/> is rejected </param>
+        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        [Obsolete]
+        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
+            : base(factory, input)
+        {
+            Init(version, side, minGram, maxGram); // validates the arguments, stores them and registers the attributes
+        }
+
+        /// <summary>
+        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that generates n-grams of each size in the given range; the side is given by name and resolved with <see cref="GetSide(string)"/>
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram (matched ignoring case) </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        [Obsolete]
+        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, string sideLabel, int minGram, int maxGram)
+            : this(version, input, GetSide(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /// <summary>
+        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that generates n-grams of each size in the given range; the side is given by name and resolved with <see cref="GetSide(string)"/>, and a caller-supplied attribute factory is used
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram (matched ignoring case) </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        [Obsolete]
+        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
+            : this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
+        {
+        }
+
+        /// <summary>
+        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that generates n-grams of each size in the given range from the front (<see cref="Side.FRONT"/>) of the input
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
+            : this(version, input, Side.FRONT, minGram, maxGram)
+        {
+        }
+
+        /// <summary>
+        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that generates n-grams of each size in the given range from the front (<see cref="Side.FRONT"/>) of the input, using a caller-supplied attribute factory
+        /// </summary>
+        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
+        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
+            : this(version, factory, input, Side.FRONT, minGram, maxGram)
+        {
+        }
+
+        private void Init(LuceneVersion version, Side side, int minGram, int maxGram) // shared constructor logic: validate arguments, store settings, register attributes
+        {
+            //if (version == null)
+            //{
+            //    throw new System.ArgumentException("version must not be null");
+            //}
+
+            if (!Enum.IsDefined(typeof(Side), side)) // rejects values that are not a named member of Side
+            {
+                throw new System.ArgumentException("sideLabel must be either front or back");
+            }
+
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+
+            if (version.OnOrAfter(LuceneVersion.LUCENE_44)) // 4.4 and later: backward n-grams are refused
+            {
+                if (side == Side.BACK)
+                {
+                    throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
+                }
+            }
+            else
+            {
+                maxGram = Math.Min(maxGram, 1024); // earlier match versions cap maxGram at 1024
+            }
+
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            this.side = side;
+            this.termAtt = AddAttribute<ICharTermAttribute>();
+            this.offsetAtt = AddAttribute<IOffsetAttribute>();
+            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+        }
+
+        /// <summary>
+        /// Advances to the next n-gram. Returns <c>true</c> if one was produced, <c>false</c> when there are no more. </summary>
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+            // if we are just starting, buffer the input (up to 'limit' chars)
+            if (!started)
+            {
+                started = true;
+                gramSize = minGram;
+                int limit = side == Side.FRONT ? maxGram : 1024; // front grams need at most maxGram chars; for the back side up to 1024 chars are buffered
+                char[] chars = new char[Math.Min(1024, limit)];
+                charsRead = 0;
+                // TODO: refactor to a shared readFully somewhere:
+                bool exhausted = false;
+                while (charsRead < limit)
+                {
+                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
+                    if (inc <= 0) // nothing more to read
+                    {
+                        exhausted = true;
+                        break;
+                    }
+                    charsRead += inc;
+                    if (charsRead == chars.Length && charsRead < limit) // buffer full but limit not reached yet: enlarge it
+                    {
+                        chars = ArrayUtil.Grow(chars);
+                    }
+                }
+
+                inStr = new string(chars, 0, charsRead);
+                inStr = inStr.Trim(); // n-grams are taken from the trimmed text
+
+                if (!exhausted)
+                {
+                    // Read extra throwaway chars so that on end() we
+                    // report the correct offset:
+                    var throwaway = new char[1024];
+                    while (true)
+                    {
+                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
+                        if (inc <= 0)
+                        {
+                            break;
+                        }
+                        charsRead += inc; // counted for End(), but the characters themselves are discarded
+                    }
+                }
+
+                inLen = inStr.Length;
+                if (inLen == 0)
+                {
+                    return false;
+                }
+                posIncrAtt.PositionIncrement = 1; // the first gram advances the position
+            }
+            else
+            {
+                posIncrAtt.PositionIncrement = 0; // every further gram is stacked on the same position
+            }
+
+            // if the remaining input is too short, we can't generate any n-grams
+            if (gramSize > inLen)
+            {
+                return false;
+            }
+
+            // if we have hit the end of our n-gram size range, quit
+            if (gramSize > maxGram || gramSize > inLen) // the inLen test repeats the check just above
+            {
+                return false;
+            }
+
+            // grab gramSize chars from front or back
+            int start = side == Side.FRONT ? 0 : inLen - gramSize;
+            int end = start + gramSize;
+            termAtt.SetEmpty().Append(inStr, start, end);
+            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
+            gramSize++; // the next call emits a gram one character longer
+            return true;
+        }
+
+        public override void End() // called at end of stream: publishes the final offset
+        {
+            base.End();
+            // set final offset; charsRead counts every char consumed, including those read and thrown away
+            int finalOffset = CorrectOffset(charsRead);
+            this.offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        public override void Reset() // prepares the tokenizer for a new pass over the input
+        {
+            base.Reset();
+            started = false; // makes the next IncrementToken() call buffer the input again
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs
new file mode 100644
index 0000000..b806345
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/Lucene43NGramTokenizer.cs
@@ -0,0 +1,173 @@
+using Lucene.Net.Analysis.TokenAttributes;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Old broken version of <see cref="NGramTokenizer"/>.
+    /// </summary>
+    [Obsolete]
+    public sealed class Lucene43NGramTokenizer : Tokenizer
+    {
+        public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+        public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+        private int minGram, maxGram;
+        private int gramSize;
+        private int pos;
+        private int inLen; // length of the input AFTER trim()
+        private int charsRead; // length of the input
+        private string inStr;
+        private bool started;
+
+        private ICharTermAttribute termAtt;
+        private IOffsetAttribute offsetAtt;
+
+        /// <summary>
+        /// Creates <see cref="Lucene43NGramTokenizer"/> that generates n-grams of each size from <paramref name="minGram"/> to <paramref name="maxGram"/>. </summary>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
+            : base(input)
+        {
+            Init(minGram, maxGram); // validates the range and registers the attributes
+        }
+
+        /// <summary>
+        /// Creates <see cref="Lucene43NGramTokenizer"/> that generates n-grams of each size from <paramref name="minGram"/> to <paramref name="maxGram"/>, using a caller-supplied attribute factory. </summary>
+        /// <param name="factory"> <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
+            : base(factory, input)
+        {
+            Init(minGram, maxGram); // validates the range and registers the attributes
+        }
+
+        /// <summary>
+        /// Creates <see cref="Lucene43NGramTokenizer"/> with <see cref="DEFAULT_MIN_NGRAM_SIZE"/> and <see cref="DEFAULT_MAX_NGRAM_SIZE"/> as the n-gram range. </summary>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        public Lucene43NGramTokenizer(TextReader input)
+            : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+        {
+        }
+
+        private void Init(int minGram, int maxGram) // shared constructor logic: validate the range, store it, register attributes
+        {
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            termAtt = AddAttribute<ICharTermAttribute>(); // receives the text of each gram
+            offsetAtt = AddAttribute<IOffsetAttribute>(); // receives the start/end offsets of each gram
+        }
+
+        /// <summary>
+        /// Advances to the next n-gram. Returns <c>true</c> if one was produced, <c>false</c> when there are no more. </summary>
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+            if (!started) // first call after Reset(): buffer the input
+            {
+                started = true;
+                gramSize = minGram;
+                char[] chars = new char[1024]; // only the first 1024 chars are kept for tokenizing
+                charsRead = 0;
+                // TODO: refactor to a shared readFully somewhere:
+                while (charsRead < chars.Length)
+                {
+                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
+                    if (inc <= 0) // TextReader.Read returns 0 (not -1) at end of input; testing only for -1 would loop forever
+                    {
+                        break;
+                    }
+                    charsRead += inc;
+                }
+                inStr = (new string(chars, 0, charsRead)).Trim(); // strip leading and trailing whitespace
+
+                if (charsRead == chars.Length)
+                {
+                    // Read extra throwaway chars so that on end() we
+                    // report the correct offset:
+                    var throwaway = new char[1024];
+                    while (true)
+                    {
+                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
+                        if (inc <= 0) // same end-of-input test as above
+                        {
+                            break;
+                        }
+                        charsRead += inc; // counted for End(), but the characters themselves are discarded
+                    }
+                }
+
+                inLen = inStr.Length;
+                if (inLen == 0)
+                {
+                    return false;
+                }
+            }
+
+            if (pos + gramSize > inLen) // if we hit the end of the string
+            {
+                pos = 0; // reset to beginning of string
+                gramSize++; // increase n-gram size
+                if (gramSize > maxGram) // we are done
+                {
+                    return false;
+                }
+                if (pos + gramSize > inLen) // the text is shorter than the new gram size
+                {
+                    return false;
+                }
+            }
+
+            int oldPos = pos;
+            pos++; // the next call starts one character further right
+            termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
+            offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
+            return true;
+        }
+
+        public override void End() // called at end of stream: publishes the final offset
+        {
+            base.End();
+            // set final offset; charsRead counts every char consumed, including those read and thrown away
+            int finalOffset = CorrectOffset(charsRead);
+            this.offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        public override void Reset() // prepares the tokenizer for a new pass over the input
+        {
+            base.Reset();
+            started = false; // makes the next IncrementToken() call buffer the input again
+            pos = 0; // start again at the first character
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs
new file mode 100644
index 0000000..ca1d0bc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramFilterFactory.cs
@@ -0,0 +1,56 @@
+using Lucene.Net.Analysis.Util;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="NGramTokenFilter"/>. Accepts the optional <c>minGramSize</c> and <c>maxGramSize</c> arguments.
+    /// <code>
+    /// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</code>
+    /// </summary>
+    public class NGramFilterFactory : TokenFilterFactory
+    {
+        private readonly int maxGramSize;
+        private readonly int minGramSize;
+
+        /// <summary>
+        /// Creates a new <see cref="NGramFilterFactory"/>; throws <see cref="System.ArgumentException"/> if unrecognized arguments remain </summary>
+        public NGramFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            minGramSize = GetInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
+            maxGramSize = GetInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
+            if (args.Count > 0) // NOTE(review): presumably GetInt removes the keys it consumes, so anything left is unrecognized - confirm in the base class
+            {
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys)); // name the leftover keys; concatenating the dictionary itself typically prints only its type name
+            }
+        }
+
+        public override TokenStream Create(TokenStream input) // wraps input in an NGramTokenFilter configured with the stored sizes
+        {
+            return new NGramTokenFilter(m_luceneMatchVersion, input, minGramSize, maxGramSize);
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs
new file mode 100644
index 0000000..f1c82c5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenFilter.cs
@@ -0,0 +1,252 @@
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tokenizes the input into n-grams of the given size(s).
+    /// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when
+    /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filter:
+    /// <list type="bullet">
+    ///     <item>handles supplementary characters correctly,</item>
+    ///     <item>emits all n-grams for the same token at the same position,</item>
+    ///     <item>does not modify offsets,</item>
+    ///     <item>sorts n-grams by their offset in the original token first, then
+    ///         increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
+    ///         "c").</item>
+    /// </list>
+    /// </para>
+    /// <para>You can make this filter use the old behavior by providing a version &lt;
+    /// <see cref="LuceneVersion.LUCENE_44"/> in the constructor but this is not recommended as
+    /// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting
+    /// bugs.
+    /// </para>
+    /// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting,
+    /// this won't work anymore since this filter doesn't update offsets. You should
+    /// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially
+    /// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization.
+    /// </para>
+    /// </summary>
+    public sealed class NGramTokenFilter : TokenFilter
+    {
+        public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+        public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+        private readonly int minGram, maxGram;
+
+        private char[] curTermBuffer;
+        private int curTermLength;
+        private int curCodePointCount;
+        private int curGramSize;
+        private int curPos;
+        private int curPosInc, curPosLen;
+        private int tokStart;
+        private int tokEnd;
+        private bool hasIllegalOffsets; // only if the length changed before this filter
+
+        private readonly LuceneVersion version;
+        private readonly CharacterUtils charUtils;
+        private readonly ICharTermAttribute termAtt;
+        private readonly IPositionIncrementAttribute posIncAtt;
+        private readonly IPositionLengthAttribute posLenAtt;
+        private readonly IOffsetAttribute offsetAtt;
+
+        /// <summary>
+        /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. The input is first wrapped in a <see cref="CodepointCountFilter"/> with <paramref name="minGram"/> as its lower bound. </summary>
+        /// <param name="version"> Lucene version to enable correct position increments.
+        ///                See <see cref="NGramTokenFilter"/> for details. </param>
+        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 </param>
+        /// <param name="maxGram"> the largest n-gram to generate; must not be smaller than <paramref name="minGram"/> </param>
+        public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
+            : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
+        {
+            this.version = version;
+            this.charUtils = version.OnOrAfter(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_44) ?
+#pragma warning restore 612, 618
+                CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance;
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+#pragma warning disable 612, 618
+            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+            {
+                posIncAtt = AddAttribute<IPositionIncrementAttribute>();
+                posLenAtt = AddAttribute<IPositionLengthAttribute>();
+            }
+            else
+            {
+                posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this); // stand-alone stub (not obtained via AddAttribute): ignores writes, always reads 0
+                posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this); // stand-alone stub (not obtained via AddAttribute): ignores writes, always reads 0
+            }
+            termAtt = AddAttribute<ICharTermAttribute>();
+            offsetAtt = AddAttribute<IOffsetAttribute>();
+        }
+
+        private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute // inert position-increment attribute used for match versions before 4.4
+        {
+            private readonly NGramTokenFilter outerInstance; // stored by the constructor; not read anywhere in this class
+
+            public PositionIncrementAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            public override int PositionIncrement
+            {
+                set
+                {
+                } // assignments are deliberately discarded
+                get
+                {
+                    return 0; // always reports 0
+                }
+            }
+        }
+
+        private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute // inert position-length attribute used for match versions before 4.4
+        {
+            private readonly NGramTokenFilter outerInstance; // stored by the constructor; not read anywhere in this class
+
+            public PositionLengthAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            public override int PositionLength
+            {
+                set
+                {
+                } // assignments are deliberately discarded
+                get
+                {
+                    return 0; // always reports 0
+                }
+            }
+        }
+
+        /// <summary>
+        /// Creates <see cref="NGramTokenFilter"/> with <see cref="DEFAULT_MIN_NGRAM_SIZE"/> and <see cref="DEFAULT_MAX_NGRAM_SIZE"/> as the n-gram range. </summary>
+        /// <param name="version"> Lucene version to enable correct position increments.
+        ///                See <see cref="NGramTokenFilter"/> for details. </param>
+        /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param>
+        public NGramTokenFilter(LuceneVersion version, TokenStream input)
+            : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+        {
+        }
+
+        /// <summary>
+        /// Advances to the next n-gram. Returns <c>true</c> if one was produced, <c>false</c> once the input stream has no more tokens.
+        /// </summary>
+        public override sealed bool IncrementToken()
+        {
+            while (true)
+            {
+                if (curTermBuffer == null) // no token in progress: pull the next one from the input
+                {
+                    if (!m_input.IncrementToken())
+                    {
+                        return false;
+                    }
+                    else
+                    {
+                        curTermBuffer = (char[])termAtt.Buffer.Clone(); // private copy, since termAtt is overwritten for every gram emitted
+                        curTermLength = termAtt.Length;
+                        curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
+                        curGramSize = minGram;
+                        curPos = 0;
+                        curPosInc = posIncAtt.PositionIncrement;
+                        curPosLen = posLenAtt.PositionLength;
+                        tokStart = offsetAtt.StartOffset;
+                        tokEnd = offsetAtt.EndOffset;
+                        // if length by start + end offsets doesn't match the term text then assume
+                        // this is a synonym and don't adjust the offsets.
+                        hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
+                    }
+                }
+#pragma warning disable 612, 618
+                if (version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+                {
+                    if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) // sizes exhausted at this start position: move one code point right
+                    {
+                        ++curPos;
+                        curGramSize = minGram;
+                    }
+                    if ((curPos + curGramSize) <= curCodePointCount)
+                    {
+                        ClearAttributes();
+                        int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+                        int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+                        termAtt.CopyBuffer(curTermBuffer, start, end - start);
+                        posIncAtt.PositionIncrement = curPosInc;
+                        curPosInc = 0; // only the first gram of a token carries its position increment
+                        posLenAtt.PositionLength = curPosLen;
+                        offsetAtt.SetOffset(tokStart, tokEnd); // every gram keeps the offsets of the original token
+                        curGramSize++;
+                        return true;
+                    }
+                }
+                else
+                {
+                    while (curGramSize <= maxGram) // older ordering: all grams of one size, then the next size
+                    {
+                        while (curPos + curGramSize <= curTermLength) // while there is input
+                        {
+                            ClearAttributes();
+                            termAtt.CopyBuffer(curTermBuffer, curPos, curGramSize);
+                            if (hasIllegalOffsets)
+                            {
+                                offsetAtt.SetOffset(tokStart, tokEnd);
+                            }
+                            else
+                            {
+                                offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize); // offsets narrowed to the gram itself
+                            }
+                            curPos++;
+                            return true;
+                        }
+                        curGramSize++; // increase n-gram size
+                        curPos = 0;
+                    }
+                }
+                curTermBuffer = null; // current token exhausted: loop round and fetch the next one
+            }
+        }
+
+        public override void Reset() // prepares the filter for a new pass over the input
+        {
+            base.Reset();
+            curTermBuffer = null; // forces the next IncrementToken() call to pull a fresh token from the input
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs
new file mode 100644
index 0000000..b1845c8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizer.cs
@@ -0,0 +1,319 @@
+using Lucene.Net.Analysis.TokenAttributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Diagnostics;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+    /// <summary>
+    /// Tokenizes the input into n-grams of the given size(s).
+    /// <para>Unlike <see cref="NGramTokenFilter"/>, this class sets offsets so
+    /// that characters between startOffset and endOffset in the original stream are
+    /// the same as the term chars.
+    /// </para>
+    /// <para>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
+    /// <list type="table">
+    ///     <listheader>
+    ///         <term>Term</term>
+    ///         <term>Position increment</term>
+    ///         <term>Position length</term>
+    ///         <term>Offsets</term>
+    ///     </listheader>
+    ///     <item>
+    ///         <term>ab</term>
+    ///         <term>1</term>
+    ///         <term>1</term>
+    ///         <term>[0,2[</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>abc</term>
+    ///         <term>1</term>
+    ///         <term>1</term>
+    ///         <term>[0,3[</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>bc</term>
+    ///         <term>1</term>
+    ///         <term>1</term>
+    ///         <term>[1,3[</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>bcd</term>
+    ///         <term>1</term>
+    ///         <term>1</term>
+    ///         <term>[1,4[</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>cd</term>
+    ///         <term>1</term>
+    ///         <term>1</term>
+    ///         <term>[2,4[</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>cde</term>
+    ///         <term>1</term>
+    ///         <term>1</term>
+    ///         <term>[2,5[</term>
+    ///     </item>
+    ///     <item>
+    ///         <term>de</term>
+    ///         <term>1</term>
+    ///         <term>1</term>
+    ///         <term>[3,5[</term>
+    ///     </item>
+    /// </list>
+    /// </para>
+    /// <para>This tokenizer changed a lot in Lucene 4.4 in order to:
+    /// <list type="bullet">
+    ///     <item>tokenize in a streaming fashion to support streams which are larger
+    ///         than 1024 chars (limit of the previous version),</item>
+    ///     <item>count grams based on Unicode code points instead of UTF-16 chars (and
+    ///         never split in the middle of surrogate pairs),</item>
+    ///     <item>give the ability to pre-tokenize the stream (<see cref="IsTokenChar(int)"/>)
+    ///         before computing n-grams.</item>
+    /// </list>
+    /// </para>
+    /// <para>Additionally, this class doesn't trim trailing whitespace and emits
+    /// tokens in a different order: tokens are now emitted by increasing start
+    /// offsets, whereas they used to be emitted by increasing lengths (which prevented
+    /// the tokenizer from supporting large input streams).
+    /// </para>
+    /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
+    /// to use the old behavior through <see cref="Lucene43NGramTokenizer"/>.
+    /// </para>
+    /// </summary>
+    // non-sealed to allow for overriding IsTokenChar, but all other methods should be sealed
+    public class NGramTokenizer : Tokenizer
+    {
+        public const int DEFAULT_MIN_NGRAM_SIZE = 1; // minGram used by the (version, input) constructor
+        public const int DEFAULT_MAX_NGRAM_SIZE = 2; // maxGram used by the (version, input) constructor
+
+        private CharacterUtils charUtils; // chosen in Init() from the compatibility version
+        private CharacterUtils.CharacterBuffer charBuffer; // chars read from m_input by charUtils.Fill()
+        private int[] buffer; // like charBuffer, but converted to code points
+        private int bufferStart, bufferEnd; // remaining slice in buffer
+        private int offset; // sum of the char counts of all code points consumed since Reset()
+        private int gramSize; // size, in code points, of the next gram to try at bufferStart
+        private int minGram, maxGram; // validated in Init(): 1 <= minGram <= maxGram
+        private bool exhausted; // negated result of the last charUtils.Fill() call
+        private int lastCheckedChar; // last offset in the buffer that we checked
+        private int lastNonTokenChar; // last offset that we found to not be a token char
+        private bool edgesOnly; // leading edges n-grams only
+
+        private ICharTermAttribute termAtt;
+        private IPositionIncrementAttribute posIncAtt;
+        private IPositionLengthAttribute posLenAtt;
+        private IOffsetAttribute offsetAtt;
+
+        internal NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram, bool edgesOnly) // the public overloads in this class always pass edgesOnly = false
+              : base(input)
+        {
+            Init(version, minGram, maxGram, edgesOnly);
+        }
+
+        /// <summary>
+        /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
+        /// <param name="version"> the lucene compatibility version; must be 4.4 or later, otherwise an <see cref="System.ArgumentException"/> is thrown </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 and not greater than <paramref name="maxGram"/> </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
+              : this(version, input, minGram, maxGram, false)
+        {
+        }
+
+        internal NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram, bool edgesOnly) // the public overloads in this class always pass edgesOnly = false
+              : base(factory, input)
+        {
+            Init(version, minGram, maxGram, edgesOnly);
+        }
+
+        /// <summary>
+        /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary>
+        /// <param name="version"> the lucene compatibility version; must be 4.4 or later, otherwise an <see cref="System.ArgumentException"/> is thrown </param>
+        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        /// <param name="minGram"> the smallest n-gram to generate; must be at least 1 and not greater than <paramref name="maxGram"/> </param>
+        /// <param name="maxGram"> the largest n-gram to generate </param>
+        public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
+              : this(version, factory, input, minGram, maxGram, false)
+        {
+        }
+
+        /// <summary>
+        /// Creates <see cref="NGramTokenizer"/> with default min and max n-grams (<see cref="DEFAULT_MIN_NGRAM_SIZE"/> and <see cref="DEFAULT_MAX_NGRAM_SIZE"/>). </summary>
+        /// <param name="version"> the lucene compatibility version; must be 4.4 or later, otherwise an <see cref="System.ArgumentException"/> is thrown </param>
+        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
+        public NGramTokenizer(LuceneVersion version, TextReader input)
+              : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+        {
+        }
+
+        private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly) // shared by both internal constructors: validates arguments, registers attributes, allocates buffers
+        {
+#pragma warning disable 612, 618
+            if (!version.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+            {
+                throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
+            }
+#pragma warning disable 612, 618
+            charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ?
+#pragma warning restore 612, 618
+                CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance; // the Java4Instance branch cannot be reached: versions before 4.4 already threw above
+            if (minGram < 1)
+            {
+                throw new System.ArgumentException("minGram must be greater than zero");
+            }
+            if (minGram > maxGram)
+            {
+                throw new System.ArgumentException("minGram must not be greater than maxGram");
+            }
+            termAtt = AddAttribute<ICharTermAttribute>();
+            posIncAtt = AddAttribute<IPositionIncrementAttribute>();
+            posLenAtt = AddAttribute<IPositionLengthAttribute>();
+            offsetAtt = AddAttribute<IOffsetAttribute>();
+            this.minGram = minGram;
+            this.maxGram = maxGram;
+            this.edgesOnly = edgesOnly;
+            charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
+            buffer = new int[charBuffer.Buffer.Length]; // one int slot per buffered char, an upper bound on the number of code points
+
+            // Make the term att large enough (a gram of maxGram code points needs at most 2 * maxGram chars)
+            termAtt.ResizeBuffer(2 * maxGram);
+        }
+
+        public override sealed bool IncrementToken() // emits the next n-gram into the attributes; returns false once no further gram can be produced
+        {
+            ClearAttributes();
+
+            // termination of this loop is guaranteed by the fact that every iteration
+            // either advances the buffer (calls Consume()) or increases gramSize
+            while (true)
+            {
+                // compact
+                if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) // at most maxGram + 1 code points remain buffered and the last Fill() did not report the end
+                {
+                    Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart); // move the unread slice to the front of buffer
+                    bufferEnd -= bufferStart;
+                    lastCheckedChar -= bufferStart; // keep both markers relative to the shifted slice
+                    lastNonTokenChar -= bufferStart;
+                    bufferStart = 0;
+
+                    // fill in remaining space
+                    exhausted = !charUtils.Fill(charBuffer, m_input, buffer.Length - bufferEnd);
+                    // convert to code points
+                    bufferEnd += charUtils.ToCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
+                }
+
+                // should we go to the next offset?
+                if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) // every size was tried at this start, or the gram would run past the buffered data
+                {
+                    if (bufferStart + 1 + minGram > bufferEnd) // not even a minGram-sized gram fits at the next start position
+                    {
+                        Debug.Assert(exhausted);
+                        return false;
+                    }
+                    Consume();
+                    gramSize = minGram;
+                }
+
+                UpdateLastNonTokenChar();
+
+                // retry if the token to be emitted was going to not only contain token chars
+                bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
+                bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1; // in edge mode a gram may only start right after a non-token char
+                if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
+                {
+                    Consume();
+                    gramSize = minGram;
+                    continue;
+                }
+
+                int length = charUtils.ToChars(buffer, bufferStart, gramSize, termAtt.Buffer, 0); // writes the gram's code points into the term buffer as chars
+                termAtt.Length = length;
+                posIncAtt.PositionIncrement = 1;
+                posLenAtt.PositionLength = 1;
+                offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + length));
+                ++gramSize; // the next call tries the next larger gram at the same start
+                return true;
+            }
+        }
+
+        private void UpdateLastNonTokenChar() // extends the checked range up to the last code point of the candidate gram
+        {
+            int termEnd = bufferStart + gramSize - 1;
+            if (termEnd > lastCheckedChar)
+            {
+                for (int i = termEnd; i > lastCheckedChar; --i) // scan backwards so the first hit is the right-most non-token char among the unchecked positions
+                {
+                    if (!IsTokenChar(buffer[i]))
+                    {
+                        lastNonTokenChar = i;
+                        break;
+                    }
+                }
+                lastCheckedChar = termEnd;
+            }
+        }
+
+        /// <summary>
+        /// Consume one code point: advances bufferStart past it and adds its char count to offset. </summary>
+        private void Consume()
+        {
+            offset += Character.CharCount(buffer[bufferStart++]);
+        }
+
+        /// <summary>
+        /// Only collect characters which satisfy this condition. This base implementation returns true for every code point. </summary>
+        protected virtual bool IsTokenChar(int chr)
+        {
+            return true;
+        }
+
+        public override sealed void End() // sets the final offset after the last token
+        {
+            base.End();
+            Debug.Assert(bufferStart <= bufferEnd);
+            int endOffset = offset;
+            for (int i = bufferStart; i < bufferEnd; ++i) // add the char count of every code point that is buffered but was never consumed
+            {
+                endOffset += Character.CharCount(buffer[i]);
+            }
+            endOffset = CorrectOffset(endOffset);
+            // set final offset
+            offsetAtt.SetOffset(endOffset, endOffset);
+        }
+
+        public override sealed void Reset() // restores the state expected before the first IncrementToken() call
+        {
+            base.Reset();
+            bufferStart = bufferEnd = buffer.Length; // empty slice at the end of buffer; with exhausted == false the first IncrementToken() takes the compact/fill branch
+            lastNonTokenChar = lastCheckedChar = bufferStart - 1; // both markers sit just before the (empty) slice
+            offset = 0;
+            gramSize = minGram;
+            exhausted = false;
+            charBuffer.Reset();
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs
new file mode 100644
index 0000000..cf25b65
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/NGram/NGramTokenizerFactory.cs
@@ -0,0 +1,70 @@
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.NGram
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for <see cref="NGramTokenizer"/>.
+    /// <code>
+    /// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</code>
+    /// </summary>
+    public class NGramTokenizerFactory : TokenizerFactory
+    {
+        private readonly int maxGramSize; // "maxGramSize" argument, defaulting to NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE
+        private readonly int minGramSize; // "minGramSize" argument, defaulting to NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE
+
+        /// <summary>
+        /// Creates a new <see cref="NGramTokenizerFactory"/> from the optional "minGramSize" and "maxGramSize" arguments; throws <see cref="System.ArgumentException"/> if any argument is left over. </summary>
+        public NGramTokenizerFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            minGramSize = GetInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
+            maxGramSize = GetInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+            if (args.Count > 0) // anything still present was not recognized (presumably the base class and GetInt remove the keys they read -- defined outside this file, confirm there)
+            {
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys)); // name the offending keys; appending the dictionary itself would only print its type name
+            }
+        }
+
+        /// <summary>
+        /// Creates the <see cref="TokenStream"/> of n-grams from the given <see cref="TextReader"/> and <see cref="AttributeSource.AttributeFactory"/>. </summary>
+        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+        {
+#pragma warning disable 612, 618
+            if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
+#pragma warning restore 612, 618
+            {
+                return new NGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize); // match version 4.4 or later: current tokenizer
+            }
+            else
+            {
+#pragma warning disable 612, 618
+                return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
+#pragma warning restore 612, 618
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
deleted file mode 100644
index 70b44d3..0000000
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
+++ /dev/null
@@ -1,60 +0,0 @@
-using Lucene.Net.Analysis.Util;
-using System.Collections.Generic;
-
-namespace Lucene.Net.Analysis.NGram
-{
-    /*
-     * Licensed to the Apache Software Foundation (ASF) under one or more
-     * contributor license agreements.  See the NOTICE file distributed with
-     * this work for additional information regarding copyright ownership.
-     * The ASF licenses this file to You under the Apache License, Version 2.0
-     * (the "License"); you may not use this file except in compliance with
-     * the License.  You may obtain a copy of the License at
-     *
-     *     http://www.apache.org/licenses/LICENSE-2.0
-     *
-     * Unless required by applicable law or agreed to in writing, software
-     * distributed under the License is distributed on an "AS IS" BASIS,
-     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-     * See the License for the specific language governing permissions and
-     * limitations under the License.
-     */
-
-    /// <summary>
-    /// Creates new instances of <see cref="EdgeNGramTokenFilter"/>.
-    /// <code>
-    /// &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
-    ///   &lt;analyzer&gt;
-    ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
-    ///     &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/&gt;
-    ///   &lt;/analyzer&gt;
-    /// &lt;/fieldType&gt;</code>
-    /// </summary>
-    public class EdgeNGramFilterFactory : TokenFilterFactory
-    {
-        private readonly int maxGramSize;
-        private readonly int minGramSize;
-        private readonly string side;
-
-        /// <summary>
-        /// Creates a new <see cref="EdgeNGramFilterFactory"/> </summary>
-        public EdgeNGramFilterFactory(IDictionary<string, string> args)
-            : base(args)
-        {
-            minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
-            maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
-            side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
-            if (args.Count > 0)
-            {
-                throw new System.ArgumentException("Unknown parameters: " + args);
-            }
-        }
-
-        public override TokenStream Create(TokenStream input)
-        {
-#pragma warning disable 612, 618
-            return new EdgeNGramTokenFilter(m_luceneMatchVersion, input, side, minGramSize, maxGramSize);
-#pragma warning restore 612, 618
-        }
-    }
-}
\ No newline at end of file


Mime
View raw message