lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [13/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:17 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
new file mode 100644
index 0000000..0b6dc5a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
@@ -0,0 +1,266 @@
+using System;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using ReverseStringFilter = org.apache.lucene.analysis.reverse.ReverseStringFilter;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+	using CharacterUtils = org.apache.lucene.analysis.util.CharacterUtils;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Tokenizes the given token into n-grams of given size(s).
+	/// <para>
+	/// This <seealso cref="TokenFilter"/> create n-grams from the beginning edge or ending edge of a input token.
+	/// </para>
+	/// <para><a name="version"/>As of Lucene 4.4, this filter does not support
+	/// <seealso cref="Side#BACK"/> (you can use <seealso cref="ReverseStringFilter"/> up-front and
+	/// afterward to get the same behavior), handles supplementary characters
+	/// correctly and does not update offsets anymore.
+	/// </para>
+	/// </summary>
+	public sealed class EdgeNGramTokenFilter : TokenFilter
+	{
+	  public const Side DEFAULT_SIDE = Side.FRONT;
+	  public const int DEFAULT_MAX_GRAM_SIZE = 1;
+	  public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+	  /// <summary>
+	  /// Specifies which side of the input the n-gram should be generated from </summary>
+	  public enum Side
+	  {
+
+		/// <summary>
+		/// Get the n-gram from the front of the input </summary>
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		FRONT
+		{
+		  public String getLabel() { return "front"
+		}
+	  },
+
+		/// <summary>
+		/// Get the n-gram from the end of the input </summary>
+		[System.Obsolete]
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		@Deprecated BACK
+		{
+			public String getLabel()
+			{
+				return "back";
+			}
+		}
+
+		public = 
+
+		// Get the appropriate Side from a string
+		public static Side getSide(String sideName)
+		{
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		  if (FRONT.getLabel().equals(sideName))
+		  {
+			return FRONT;
+		  }
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		  if (BACK.getLabel().equals(sideName))
+		  {
+			return BACK;
+		  }
+		  return null;
+		}
+	}
+
+	  private readonly Version version;
+	  private readonly CharacterUtils charUtils;
+	  private readonly int minGram;
+	  private readonly int maxGram;
+	  private Side side;
+	  private char[] curTermBuffer;
+	  private int curTermLength;
+	  private int curCodePointCount;
+	  private int curGramSize;
+	  private int tokStart;
+	  private int tokEnd; // only used if the length changed before this filter
+	  private bool updateOffsets; // never if the length changed before this filter
+	  private int savePosIncr;
+	  private int savePosLen;
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+	  private readonly PositionIncrementAttribute posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
+	  private readonly PositionLengthAttribute posLenAtt = addAttribute(typeof(PositionLengthAttribute));
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
+	  /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  [Obsolete]
+	  public EdgeNGramTokenFilter(Version version, TokenStream input, Side side, int minGram, int maxGram) : base(input)
+	  {
+
+		if (version == null)
+		{
+		  throw new System.ArgumentException("version must not be null");
+		}
+
+		if (version.onOrAfter(Version.LUCENE_44) && side == Side.BACK)
+		{
+		  throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
+		}
+
+		if (side == null)
+		{
+		  throw new System.ArgumentException("sideLabel must be either front or back");
+		}
+
+		if (minGram < 1)
+		{
+		  throw new System.ArgumentException("minGram must be greater than zero");
+		}
+
+		if (minGram > maxGram)
+		{
+		  throw new System.ArgumentException("minGram must not be greater than maxGram");
+		}
+
+		this.version = version;
+		this.charUtils = version.onOrAfter(Version.LUCENE_44) ? CharacterUtils.getInstance(version) : CharacterUtils.Java4Instance;
+		this.minGram = minGram;
+		this.maxGram = maxGram;
+		this.side = side;
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
+	  /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  [Obsolete]
+	  public EdgeNGramTokenFilter(Version version, TokenStream input, string sideLabel, int minGram, int maxGram) : this(version, input, Side.getSide(sideLabel), minGram, maxGram)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public EdgeNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) : this(version, input, Side.FRONT, minGram, maxGram)
+	  {
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		while (true)
+		{
+		  if (curTermBuffer == null)
+		  {
+			if (!input.incrementToken())
+			{
+			  return false;
+			}
+			else
+			{
+			  curTermBuffer = termAtt.buffer().clone();
+			  curTermLength = termAtt.length();
+			  curCodePointCount = charUtils.codePointCount(termAtt);
+			  curGramSize = minGram;
+			  tokStart = offsetAtt.startOffset();
+			  tokEnd = offsetAtt.endOffset();
+			  if (version.onOrAfter(Version.LUCENE_44))
+			  {
+				// Never update offsets
+				updateOffsets = false;
+			  }
+			  else
+			  {
+				// if length by start + end offsets doesn't match the term text then assume
+				// this is a synonym and don't adjust the offsets.
+				updateOffsets = (tokStart + curTermLength) == tokEnd;
+			  }
+			  savePosIncr += posIncrAtt.PositionIncrement;
+			  savePosLen = posLenAtt.PositionLength;
+			}
+		  }
+		  if (curGramSize <= maxGram) // if we have hit the end of our n-gram size range, quit
+		  {
+			if (curGramSize <= curCodePointCount) // if the remaining input is too short, we can't generate any n-grams
+			{
+			  // grab gramSize chars from front or back
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int start = side == Side.FRONT ? 0 : charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
+			  int start = side == Side.FRONT ? 0 : charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+			  int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+			  clearAttributes();
+			  if (updateOffsets)
+			  {
+				offsetAtt.setOffset(tokStart + start, tokStart + end);
+			  }
+			  else
+			  {
+				offsetAtt.setOffset(tokStart, tokEnd);
+			  }
+			  // first ngram gets increment, others don't
+			  if (curGramSize == minGram)
+			  {
+				posIncrAtt.PositionIncrement = savePosIncr;
+				savePosIncr = 0;
+			  }
+			  else
+			  {
+				posIncrAtt.PositionIncrement = 0;
+			  }
+			  posLenAtt.PositionLength = savePosLen;
+			  termAtt.copyBuffer(curTermBuffer, start, end - start);
+			  curGramSize++;
+			  return true;
+			}
+		  }
+		  curTermBuffer = null;
+		}
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		curTermBuffer = null;
+		savePosIncr = 0;
+	  }
+}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
new file mode 100644
index 0000000..c8d36f6
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
@@ -0,0 +1,71 @@
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Tokenizes the input from an edge into n-grams of given size(s).
+	/// <para>
+	/// This <seealso cref="Tokenizer"/> create n-grams from the beginning edge or ending edge of a input token.
+	/// </para>
+	/// <para><a name="version" /> As of Lucene 4.4, this tokenizer<ul>
+	/// <li>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage
+	/// <li>doesn't trim the input,
+	/// <li>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones
+	/// <li>doesn't support backward n-grams anymore.
+	/// <li>supports <seealso cref="#isTokenChar(int) pre-tokenization"/>,
+	/// <li>correctly handles supplementary characters.
+	/// </ul>
+	/// </para>
+	/// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
+	/// to use the old behavior through <seealso cref="Lucene43EdgeNGramTokenizer"/>.
+	/// </para>
+	/// </summary>
+	public class EdgeNGramTokenizer : NGramTokenizer
+	{
+	  public const int DEFAULT_MAX_GRAM_SIZE = 1;
+	  public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) : base(version, input, minGram, maxGram, true)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) : base(version, factory, input, minGram, maxGram, true)
+	  {
+	  }
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
new file mode 100644
index 0000000..195a6e1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
@@ -0,0 +1,74 @@
+using System.Collections.Generic;
+using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using TokenizerFactory = TokenizerFactory;
+	using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Creates new instances of <seealso cref="EdgeNGramTokenizer"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class EdgeNGramTokenizerFactory : TokenizerFactory
+	{
+	  private readonly int maxGramSize;
+	  private readonly int minGramSize;
+	  private readonly string side;
+
+	  /// <summary>
+	  /// Creates a new EdgeNGramTokenizerFactory </summary>
+	  public EdgeNGramTokenizerFactory(IDictionary<string, string> args) : base(args)
+	  {
+		minGramSize = getInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
+		maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
+		side = get(args, "side", EdgeNGramTokenFilter.Side.FRONT.Label);
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  public override Tokenizer create(AttributeFactory factory, Reader input)
+	  {
+		if (luceneMatchVersion.onOrAfter(Version.LUCENE_44))
+		{
+		  if (!EdgeNGramTokenFilter.Side.FRONT.Label.Equals(side))
+		  {
+			throw new System.ArgumentException(typeof(EdgeNGramTokenizer).SimpleName + " does not support backward n-grams as of Lucene 4.4");
+		  }
+		  return new EdgeNGramTokenizer(luceneMatchVersion, input, minGramSize, maxGramSize);
+		}
+		else
+		{
+		  return new Lucene43EdgeNGramTokenizer(luceneMatchVersion, input, side, minGramSize, maxGramSize);
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
new file mode 100644
index 0000000..9809ccf
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
@@ -0,0 +1,328 @@
+using System;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Old version of <seealso cref="EdgeNGramTokenizer"/> which doesn't handle correctly
+	/// supplementary characters.
+	/// </summary>
+	[Obsolete]
+	public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
+	{
+	  public const Side DEFAULT_SIDE = Side.FRONT;
+	  public const int DEFAULT_MAX_GRAM_SIZE = 1;
+	  public const int DEFAULT_MIN_GRAM_SIZE = 1;
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+	  private readonly PositionIncrementAttribute posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
+
+	  /// <summary>
+	  /// Specifies which side of the input the n-gram should be generated from </summary>
+	  public enum Side
+	  {
+
+		/// <summary>
+		/// Get the n-gram from the front of the input </summary>
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		FRONT
+		{
+		  public String getLabel() { return "front"
+		}
+	  },
+
+		/// <summary>
+		/// Get the n-gram from the end of the input </summary>
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		BACK
+		{
+		  public String getLabel()
+		  {
+			  return "back";
+		  }
+		}
+
+		public = 
+
+		// Get the appropriate Side from a string
+		public static Side getSide(String sideName)
+		{
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		  if (FRONT.getLabel().equals(sideName))
+		  {
+			return FRONT;
+		  }
+//JAVA TO C# CONVERTER TODO TASK: The following line could not be converted:
+		  if (BACK.getLabel().equals(sideName))
+		  {
+			return BACK;
+		  }
+		  return null;
+		}
+	}
+
+	  private int minGram;
+	  private int maxGram;
+	  private int gramSize;
+	  private Side side;
+	  private bool started;
+	  private int inLen; // length of the input AFTER trim()
+	  private int charsRead; // length of the input
+	  private string inStr;
+
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  [Obsolete]
+	  public Lucene43EdgeNGramTokenizer(Version version, Reader input, Side side, int minGram, int maxGram) : base(input)
+	  {
+		init(version, side, minGram, maxGram);
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="side"> the <seealso cref="Side"/> from which to chop off an n-gram </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  [Obsolete]
+	  public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, Side side, int minGram, int maxGram) : base(factory, input)
+	  {
+		init(version, side, minGram, maxGram);
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  [Obsolete]
+	  public Lucene43EdgeNGramTokenizer(Version version, Reader input, string sideLabel, int minGram, int maxGram) : this(version, input, Side.getSide(sideLabel), minGram, maxGram)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="sideLabel"> the name of the <seealso cref="Side"/> from which to chop off an n-gram </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  [Obsolete]
+	  public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, string sideLabel, int minGram, int maxGram) : this(version, factory, input, Side.getSide(sideLabel), minGram, maxGram)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public Lucene43EdgeNGramTokenizer(Version version, Reader input, int minGram, int maxGram) : this(version, input, Side.FRONT, minGram, maxGram)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
+	  /// </summary>
+	  /// <param name="version"> the <a href="#version">Lucene match version</a> </param>
+	  /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public Lucene43EdgeNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) : this(version, factory, input, Side.FRONT, minGram, maxGram)
+	  {
+	  }
+
+	  private void init(Version version, Side side, int minGram, int maxGram)
+	  {
+		if (version == null)
+		{
+		  throw new System.ArgumentException("version must not be null");
+		}
+
+		if (side == null)
+		{
+		  throw new System.ArgumentException("sideLabel must be either front or back");
+		}
+
+		if (minGram < 1)
+		{
+		  throw new System.ArgumentException("minGram must be greater than zero");
+		}
+
+		if (minGram > maxGram)
+		{
+		  throw new System.ArgumentException("minGram must not be greater than maxGram");
+		}
+
+		if (version.onOrAfter(Version.LUCENE_44))
+		{
+		  if (side == Side.BACK)
+		  {
+			throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
+		  }
+		}
+		else
+		{
+		  maxGram = Math.Min(maxGram, 1024);
+		}
+
+		this.minGram = minGram;
+		this.maxGram = maxGram;
+		this.side = side;
+	  }
+
+	  /// <summary>
+	  /// Returns the next token in the stream, or null at EOS. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		clearAttributes();
+		// if we are just starting, read the whole input
+		if (!started)
+		{
+		  started = true;
+		  gramSize = minGram;
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int limit = side == Side.FRONT ? maxGram : 1024;
+		  int limit = side == Side.FRONT ? maxGram : 1024;
+		  char[] chars = new char[Math.Min(1024, limit)];
+		  charsRead = 0;
+		  // TODO: refactor to a shared readFully somewhere:
+		  bool exhausted = false;
+		  while (charsRead < limit)
+		  {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int inc = input.read(chars, charsRead, chars.length-charsRead);
+			int inc = input.read(chars, charsRead, chars.Length - charsRead);
+			if (inc == -1)
+			{
+			  exhausted = true;
+			  break;
+			}
+			charsRead += inc;
+			if (charsRead == chars.Length && charsRead < limit)
+			{
+			  chars = ArrayUtil.grow(chars);
+			}
+		  }
+
+		  inStr = new string(chars, 0, charsRead);
+		  inStr = inStr.Trim();
+
+		  if (!exhausted)
+		  {
+			// Read extra throwaway chars so that on end() we
+			// report the correct offset:
+			char[] throwaway = new char[1024];
+			while (true)
+			{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int inc = input.read(throwaway, 0, throwaway.length);
+			  int inc = input.read(throwaway, 0, throwaway.Length);
+			  if (inc == -1)
+			  {
+				break;
+			  }
+			  charsRead += inc;
+			}
+		  }
+
+		  inLen = inStr.length();
+		  if (inLen == 0)
+		  {
+			return false;
+		  }
+		  posIncrAtt.PositionIncrement = 1;
+		}
+		else
+		{
+		  posIncrAtt.PositionIncrement = 0;
+		}
+
+		// if the remaining input is too short, we can't generate any n-grams
+		if (gramSize > inLen)
+		{
+		  return false;
+		}
+
+		// if we have hit the end of our n-gram size range, quit
+		if (gramSize > maxGram || gramSize > inLen)
+		{
+		  return false;
+		}
+
+		// grab gramSize chars from front or back
+		int start = side == Side.FRONT ? 0 : inLen - gramSize;
+		int end = start + gramSize;
+		termAtt.setEmpty().append(inStr, start, end);
+		offsetAtt.setOffset(correctOffset(start), correctOffset(end));
+		gramSize++;
+		return true;
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void end() throws java.io.IOException
+	  public override void end()
+	  {
+		base.end();
+		// set final offset
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int finalOffset = correctOffset(charsRead);
+		int finalOffset = correctOffset(charsRead);
+		this.offsetAtt.setOffset(finalOffset, finalOffset);
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		started = false;
+	  }
+}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
new file mode 100644
index 0000000..b0756a6
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
@@ -0,0 +1,182 @@
+using System;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+	/// <summary>
+	/// Old broken version of <seealso cref="NGramTokenizer"/>.
+	/// Buffers at most the first 1024 chars of the input, trims the result, and
+	/// then emits every gram of each size (minGram..maxGram) by increasing start
+	/// position before growing the gram size — the legacy pre-4.4 behaviour.
+	/// </summary>
+	[Obsolete]
+	public sealed class Lucene43NGramTokenizer : Tokenizer
+	{
+	  public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+	  public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+	  private int minGram, maxGram;
+	  private int gramSize; // size of the grams currently being emitted
+	  private int pos; // start position (within inStr) of the next gram
+	  private int inLen; // length of the input AFTER trim()
+	  private int charsRead; // length of the input
+	  private string inStr;
+	  private bool started; // true once the input has been buffered into inStr
+
+	  // NOTE(review): C# field initializers cannot reference the instance method
+	  // addAttribute (compiler error CS0236); this raw-port artifact needs to be
+	  // moved into a constructor.
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+
+	  /// <summary>
+	  /// Creates NGramTokenizer with given min and max n-grams. </summary>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  // NOTE(review): Reader is the Java type; presumably this becomes
+	  // System.IO.TextReader in the finished port — confirm.
+	  public Lucene43NGramTokenizer(Reader input, int minGram, int maxGram) : base(input)
+	  {
+		init(minGram, maxGram);
+	  }
+
+	  /// <summary>
+	  /// Creates NGramTokenizer with given min and max n-grams. </summary>
+	  /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public Lucene43NGramTokenizer(AttributeFactory factory, Reader input, int minGram, int maxGram) : base(factory, input)
+	  {
+		init(minGram, maxGram);
+	  }
+
+	  /// <summary>
+	  /// Creates NGramTokenizer with default min and max n-grams. </summary>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  public Lucene43NGramTokenizer(Reader input) : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Validates and stores the gram-size range shared by all constructors. </summary>
+	  /// <exception cref="System.ArgumentException"> if minGram &lt; 1 or minGram &gt; maxGram </exception>
+	  private void init(int minGram, int maxGram)
+	  {
+		if (minGram < 1)
+		{
+		  throw new System.ArgumentException("minGram must be greater than zero");
+		}
+		if (minGram > maxGram)
+		{
+		  throw new System.ArgumentException("minGram must not be greater than maxGram");
+		}
+		this.minGram = minGram;
+		this.maxGram = maxGram;
+	  }
+
+	  /// <summary>
+	  /// Returns the next token in the stream, or null at EOS. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		clearAttributes();
+		if (!started)
+		{
+		  // First call for this input: buffer (at most) 1024 chars, then trim.
+		  started = true;
+		  gramSize = minGram;
+		  char[] chars = new char[1024];
+		  charsRead = 0;
+		  // TODO: refactor to a shared readFully somewhere:
+		  while (charsRead < chars.Length)
+		  {
+			// NOTE(review): Java Reader.read returns -1 at EOF, but .NET
+			// TextReader.Read returns 0 — if input becomes a TextReader this
+			// loop never terminates at end of stream; confirm during the port.
+			int inc = input.read(chars, charsRead, chars.Length - charsRead);
+			if (inc == -1)
+			{
+			  break;
+			}
+			charsRead += inc;
+		  }
+		  inStr = (new string(chars, 0, charsRead)).Trim(); // remove any trailing empty strings
+
+		  if (charsRead == chars.Length)
+		  {
+			// Read extra throwaway chars so that on end() we
+			// report the correct offset:
+			char[] throwaway = new char[1024];
+			while (true)
+			{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int inc = input.read(throwaway, 0, throwaway.length);
+			  int inc = input.read(throwaway, 0, throwaway.Length);
+			  if (inc == -1)
+			  {
+				break;
+			  }
+			  charsRead += inc;
+			}
+		  }
+
+		  inLen = inStr.Length;
+		  if (inLen == 0)
+		  {
+			return false;
+		  }
+		}
+
+		if (pos + gramSize > inLen) // if we hit the end of the string
+		{
+		  pos = 0; // reset to beginning of string
+		  gramSize++; // increase n-gram size
+		  if (gramSize > maxGram) // we are done
+		  {
+			return false;
+		  }
+		  if (pos + gramSize > inLen)
+		  {
+			return false;
+		  }
+		}
+
+		// Emit the gram starting at pos and advance one char for the next call.
+		int oldPos = pos;
+		pos++;
+		// NOTE(review): Java append(CharSequence, start, end) takes an exclusive
+		// end index, not a count — confirm the converted C# overload matches.
+		termAtt.setEmpty().append(inStr, oldPos, oldPos + gramSize);
+		offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos + gramSize));
+		return true;
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void end() throws java.io.IOException
+	  public override void end()
+	  {
+		base.end();
+		// set final offset: charsRead covers everything consumed from the
+		// reader, including throwaway chars read past the 1024-char buffer.
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int finalOffset = correctOffset(charsRead);
+		int finalOffset = correctOffset(charsRead);
+		this.offsetAtt.setOffset(finalOffset, finalOffset);
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		// Forces incrementToken() to re-buffer from the (new) reader.
+		started = false;
+		pos = 0;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
new file mode 100644
index 0000000..132f3bd
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
@@ -0,0 +1,59 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="NGramTokenFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class NGramFilterFactory : TokenFilterFactory
+	{
+	  private readonly int maxGramSize;
+	  private readonly int minGramSize;
+
+	  /// <summary>
+	  /// Creates a new NGramFilterFactory </summary>
+	  /// <param name="args"> factory parameters; "minGramSize" and "maxGramSize"
+	  ///   are extracted here and any entry still remaining afterwards is
+	  ///   rejected as unknown </param>
+	  public NGramFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		minGramSize = getInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
+		maxGramSize = getInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
+		if (args.Count > 0)
+		{
+		  // NOTE(review): unlike Java's Map.toString, concatenating an
+		  // IDictionary in C# prints its type name, not its entries — the
+		  // message will not show the offending parameters; fix in the port.
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds an <seealso cref="NGramTokenFilter"/> over the given stream using the
+	  /// configured min/max gram sizes and the factory's luceneMatchVersion. </summary>
+	  public override NGramTokenFilter create(TokenStream input)
+	  {
+		return new NGramTokenFilter(luceneMatchVersion, input, minGramSize, maxGramSize);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
new file mode 100644
index 0000000..3e7012c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
@@ -0,0 +1,251 @@
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CodepointCountFilter = org.apache.lucene.analysis.miscellaneous.CodepointCountFilter;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+	using CharacterUtils = org.apache.lucene.analysis.util.CharacterUtils;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Tokenizes the input into n-grams of the given size(s).
+	/// <a name="version"/>
+	/// <para>You must specify the required <seealso cref="Version"/> compatibility when
+	/// creating a <seealso cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filters:<ul>
+	/// <li>handles supplementary characters correctly,</li>
+	/// <li>emits all n-grams for the same token at the same position,</li>
+	/// <li>does not modify offsets,</li>
+	/// <li>sorts n-grams by their offset in the original token first, then
+	/// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
+	/// "c").</li></ul>
+	/// </para>
+	/// <para>You can make this filter use the old behavior by providing a version &lt;
+	/// <seealso cref="Version#LUCENE_44"/> in the constructor but this is not recommended as
+	/// it will lead to broken <seealso cref="TokenStream"/>s that will cause highlighting
+	/// bugs.
+	/// </para>
+	/// <para>If you were using this <seealso cref="TokenFilter"/> to perform partial highlighting,
+	/// this won't work anymore since this filter doesn't update offsets. You should
+	/// modify your analysis chain to use <seealso cref="NGramTokenizer"/>, and potentially
+	/// override <seealso cref="NGramTokenizer#isTokenChar(int)"/> to perform pre-tokenization.
+	/// </para>
+	/// </summary>
+	public sealed class NGramTokenFilter : TokenFilter
+	{
+	  public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+	  public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+	  private readonly int minGram, maxGram;
+
+	  // State of the input token currently being n-grammed; curTermBuffer ==
+	  // null means the next token must be pulled from the input stream.
+	  private char[] curTermBuffer;
+	  private int curTermLength;
+	  private int curCodePointCount; // code points in curTermBuffer (differs from curTermLength when surrogate pairs are present)
+	  private int curGramSize;
+	  private int curPos;
+	  private int curPosInc, curPosLen;
+	  private int tokStart;
+	  private int tokEnd;
+	  private bool hasIllegalOffsets; // only if the length changed before this filter
+
+	  private readonly Version version;
+	  private readonly CharacterUtils charUtils;
+	  // NOTE(review): C# field initializers cannot reference the instance method
+	  // addAttribute (compiler error CS0236); raw-port artifact — move into the
+	  // constructor.
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly PositionIncrementAttribute posIncAtt;
+	  private readonly PositionLengthAttribute posLenAtt;
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+
+	  /// <summary>
+	  /// Creates NGramTokenFilter with given min and max n-grams. </summary>
+	  /// <param name="version"> Lucene version to enable correct position increments.
+	  ///                See <a href="#version">above</a> for details. </param>
+	  /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  /// <exception cref="System.ArgumentException"> if minGram &lt; 1 or minGram &gt; maxGram </exception>
+	  public NGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) : base(new CodepointCountFilter(version, input, minGram, int.MaxValue))
+	  {
+		// The base call above wraps the input so that tokens shorter than
+		// minGram (counted in code points) are dropped before n-gramming.
+		this.version = version;
+		this.charUtils = version.onOrAfter(Version.LUCENE_44) ? CharacterUtils.getInstance(version) : CharacterUtils.Java4Instance;
+		if (minGram < 1)
+		{
+		  throw new System.ArgumentException("minGram must be greater than zero");
+		}
+		if (minGram > maxGram)
+		{
+		  throw new System.ArgumentException("minGram must not be greater than maxGram");
+		}
+		this.minGram = minGram;
+		this.maxGram = maxGram;
+		if (version.onOrAfter(Version.LUCENE_44))
+		{
+		  posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
+		  posLenAtt = addAttribute(typeof(PositionLengthAttribute));
+		}
+		else
+		{
+		  // Pre-4.4 streams get no-op attributes so the position writes in
+		  // incrementToken() have no effect (legacy behaviour).
+		  posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this);
+		  posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this);
+		}
+	  }
+
+	  /// <summary>
+	  /// No-op position increment attribute: the setter ignores its value and the
+	  /// getter always returns 0. Emulates the pre-4.4 behaviour. </summary>
+	  private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute
+	  {
+		  private readonly NGramTokenFilter outerInstance;
+
+		  public PositionIncrementAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
+		  {
+			  this.outerInstance = outerInstance;
+		  }
+
+		  public override int PositionIncrement
+		  {
+			  set
+			  {
+			  }
+			  get
+			  {
+				return 0;
+			  }
+		  }
+	  }
+
+	  /// <summary>
+	  /// No-op position length attribute: the setter ignores its value and the
+	  /// getter always returns 0. Emulates the pre-4.4 behaviour. </summary>
+	  private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute
+	  {
+		  private readonly NGramTokenFilter outerInstance;
+
+		  public PositionLengthAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance)
+		  {
+			  this.outerInstance = outerInstance;
+		  }
+
+		  public override int PositionLength
+		  {
+			  set
+			  {
+			  }
+			  get
+			  {
+				return 0;
+			  }
+		  }
+	  }
+
+	  /// <summary>
+	  /// Creates NGramTokenFilter with default min and max n-grams. </summary>
+	  /// <param name="version"> Lucene version to enable correct position increments.
+	  ///                See <a href="#version">above</a> for details. </param>
+	  /// <param name="input"> <seealso cref="TokenStream"/> holding the input to be tokenized </param>
+	  public NGramTokenFilter(Version version, TokenStream input) : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Returns the next token in the stream, or null at EOS. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		while (true)
+		{
+		  if (curTermBuffer == null)
+		  {
+			// No token in progress: pull the next one and snapshot its state.
+			if (!input.incrementToken())
+			{
+			  return false;
+			}
+			else
+			{
+			  // NOTE(review): lowercase .clone() is a Java-ism; C# arrays
+			  // expose Clone() returning object and need a cast — raw-port
+			  // artifact to fix.
+			  curTermBuffer = termAtt.buffer().clone();
+			  curTermLength = termAtt.length();
+			  curCodePointCount = charUtils.codePointCount(termAtt);
+			  curGramSize = minGram;
+			  curPos = 0;
+			  curPosInc = posIncAtt.PositionIncrement;
+			  curPosLen = posLenAtt.PositionLength;
+			  tokStart = offsetAtt.startOffset();
+			  tokEnd = offsetAtt.endOffset();
+			  // if length by start + end offsets doesn't match the term text then assume
+			  // this is a synonym and don't adjust the offsets.
+			  hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
+			}
+		  }
+		  if (version.onOrAfter(Version.LUCENE_44))
+		  {
+			// 4.4+ behaviour: for each start position (in code points) emit
+			// gram sizes minGram..maxGram; offsets stay those of the whole
+			// original token, and only the first gram carries the increment.
+			if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount)
+			{
+			  ++curPos;
+			  curGramSize = minGram;
+			}
+			if ((curPos + curGramSize) <= curCodePointCount)
+			{
+			  clearAttributes();
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+			  int start = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos);
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+			  int end = charUtils.offsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);
+			  termAtt.copyBuffer(curTermBuffer, start, end - start);
+			  posIncAtt.PositionIncrement = curPosInc;
+			  curPosInc = 0;
+			  posLenAtt.PositionLength = curPosLen;
+			  offsetAtt.setOffset(tokStart, tokEnd);
+			  curGramSize++;
+			  return true;
+			}
+		  }
+		  else
+		  {
+			// Legacy behaviour: emit all grams of one size (by position)
+			// before growing the size; offsets track each gram unless the
+			// token's offsets were already inconsistent with its length.
+			while (curGramSize <= maxGram)
+			{
+			  while (curPos + curGramSize <= curTermLength) // while there is input
+			  {
+				clearAttributes();
+				termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
+				if (hasIllegalOffsets)
+				{
+				  offsetAtt.setOffset(tokStart, tokEnd);
+				}
+				else
+				{
+				  offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
+				}
+				curPos++;
+				return true;
+			  }
+			  curGramSize++; // increase n-gram size
+			  curPos = 0;
+			}
+		  }
+		  // Current token exhausted; loop around to pull the next one.
+		  curTermBuffer = null;
+		}
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		// Discard any in-progress token state.
+		curTermBuffer = null;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
new file mode 100644
index 0000000..b782e94
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs
@@ -0,0 +1,278 @@
+using System;
+using System.Diagnostics;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+	using CharacterUtils = org.apache.lucene.analysis.util.CharacterUtils;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Tokenizes the input into n-grams of the given size(s).
+	/// <para>On the contrary to <seealso cref="NGramTokenFilter"/>, this class sets offsets so
+	/// that characters between startOffset and endOffset in the original stream are
+	/// the same as the term chars.
+	/// </para>
+	/// <para>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
+	/// <table>
+	/// <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
+	/// <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+	/// <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+	/// <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
+	/// </table>
+	/// <a name="version"/>
+	/// </para>
+	/// <para>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
+	/// <li>tokenize in a streaming fashion to support streams which are larger
+	/// than 1024 chars (limit of the previous version),
+	/// <li>count grams based on unicode code points instead of java chars (and
+	/// never split in the middle of surrogate pairs),
+	/// <li>give the ability to <seealso cref="#isTokenChar(int) pre-tokenize"/> the stream
+	/// before computing n-grams.</ul>
+	/// </para>
+	/// <para>Additionally, this class doesn't trim trailing whitespaces and emits
+	/// tokens in a different order, tokens are now emitted by increasing start
+	/// offsets while they used to be emitted by increasing lengths (which prevented
+	/// from supporting large input streams).
+	/// </para>
+	/// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
+	/// to use the old behavior through <seealso cref="Lucene43NGramTokenizer"/>.
+	/// </para>
+	/// </summary>
+	// non-final to allow for overriding isTokenChar, but all other methods should be final
+	public class NGramTokenizer : Tokenizer
+	{
+	  public const int DEFAULT_MIN_NGRAM_SIZE = 1;
+	  public const int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+	  private CharacterUtils charUtils;
+	  private CharacterUtils.CharacterBuffer charBuffer;
+	  private int[] buffer; // like charBuffer, but converted to code points
+	  private int bufferStart, bufferEnd; // remaining slice in buffer
+	  private int offset; // char offset (in the original stream) of buffer[bufferStart]
+	  private int gramSize;
+	  private int minGram, maxGram;
+	  private bool exhausted;
+	  private int lastCheckedChar; // last offset in the buffer that we checked
+	  private int lastNonTokenChar; // last offset that we found to not be a token char
+	  private bool edgesOnly; // leading edges n-grams only
+
+	  // NOTE(review): C# field initializers cannot reference the instance method
+	  // addAttribute (compiler error CS0236); raw-port artifact — move into a
+	  // constructor.
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly PositionIncrementAttribute posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
+	  private readonly PositionLengthAttribute posLenAtt = addAttribute(typeof(PositionLengthAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+
+	  // NOTE(review): Reader is the Java type; presumably this becomes
+	  // System.IO.TextReader in the finished port — confirm.
+	  internal NGramTokenizer(Version version, Reader input, int minGram, int maxGram, bool edgesOnly) : base(input)
+	  {
+		init(version, minGram, maxGram, edgesOnly);
+	  }
+
+	  /// <summary>
+	  /// Creates NGramTokenizer with given min and max n-grams. </summary>
+	  /// <param name="version"> the lucene compatibility <a href="#version">version</a> </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public NGramTokenizer(Version version, Reader input, int minGram, int maxGram) : this(version, input, minGram, maxGram, false)
+	  {
+	  }
+
+	  internal NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram, bool edgesOnly) : base(factory, input)
+	  {
+		init(version, minGram, maxGram, edgesOnly);
+	  }
+
+	  /// <summary>
+	  /// Creates NGramTokenizer with given min and max n-grams. </summary>
+	  /// <param name="version"> the lucene compatibility <a href="#version">version</a> </param>
+	  /// <param name="factory"> <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/> to use </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  /// <param name="minGram"> the smallest n-gram to generate </param>
+	  /// <param name="maxGram"> the largest n-gram to generate </param>
+	  public NGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) : this(version, factory, input, minGram, maxGram, false)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates NGramTokenizer with default min and max n-grams. </summary>
+	  /// <param name="version"> the lucene compatibility <a href="#version">version</a> </param>
+	  /// <param name="input"> <seealso cref="Reader"/> holding the input to be tokenized </param>
+	  public NGramTokenizer(Version version, Reader input) : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Validates arguments, stores configuration and sizes the char / code
+	  /// point buffers shared by all constructors. </summary>
+	  /// <exception cref="System.ArgumentException"> if version &lt; 4.4, minGram &lt; 1
+	  ///   or minGram &gt; maxGram </exception>
+	  private void init(Version version, int minGram, int maxGram, bool edgesOnly)
+	  {
+		if (!version.onOrAfter(Version.LUCENE_44))
+		{
+		  throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer");
+		}
+		charUtils = version.onOrAfter(Version.LUCENE_44) ? CharacterUtils.getInstance(version) : CharacterUtils.Java4Instance;
+		if (minGram < 1)
+		{
+		  throw new System.ArgumentException("minGram must be greater than zero");
+		}
+		if (minGram > maxGram)
+		{
+		  throw new System.ArgumentException("minGram must not be greater than maxGram");
+		}
+		this.minGram = minGram;
+		this.maxGram = maxGram;
+		this.edgesOnly = edgesOnly;
+		charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
+		buffer = new int[charBuffer.Buffer.Length];
+		// Make the term att large enough
+		termAtt.resizeBuffer(2 * maxGram);
+	  }
+
+	  /// <summary>
+	  /// Emits the next n-gram from a sliding window of code points, refilling
+	  /// the buffer from the reader as the window approaches its end. </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		clearAttributes();
+
+		// termination of this loop is guaranteed by the fact that every iteration
+		// either advances the buffer (calls consumes()) or increases gramSize
+		while (true)
+		{
+		  // compact
+		  if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted)
+		  {
+			Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
+			bufferEnd -= bufferStart;
+			lastCheckedChar -= bufferStart;
+			lastNonTokenChar -= bufferStart;
+			bufferStart = 0;
+
+			// fill in remaining space
+			// NOTE(review): fill is expected to return false once the reader is
+			// exhausted — confirm against the ported CharacterUtils.
+			exhausted = !charUtils.fill(charBuffer, input, buffer.Length - bufferEnd);
+			// convert to code points
+			bufferEnd += charUtils.toCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd);
+		  }
+
+		  // should we go to the next offset?
+		  if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd)
+		  {
+			if (bufferStart + 1 + minGram > bufferEnd)
+			{
+			  Debug.Assert(exhausted);
+			  return false;
+			}
+			consume();
+			gramSize = minGram;
+		  }
+
+		  updateLastNonTokenChar();
+
+		  // retry if the token to be emitted was going to not only contain token chars
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
+		  bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
+		  bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
+		  if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar)
+		  {
+			consume();
+			gramSize = minGram;
+			continue;
+		  }
+
+		  // Materialize the gram back into chars and publish the attributes.
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
+		  int length = charUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
+		  termAtt.Length = length;
+		  posIncAtt.PositionIncrement = 1;
+		  posLenAtt.PositionLength = 1;
+		  offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
+		  ++gramSize;
+		  return true;
+		}
+	  }
+
+	  /// <summary>
+	  /// Scans the newly visible tail of the buffer (backwards from the current
+	  /// term end) for the most recent non-token char, caching progress in
+	  /// lastCheckedChar so each position is only inspected once. </summary>
+	  private void updateLastNonTokenChar()
+	  {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int termEnd = bufferStart + gramSize - 1;
+		int termEnd = bufferStart + gramSize - 1;
+		if (termEnd > lastCheckedChar)
+		{
+		  for (int i = termEnd; i > lastCheckedChar; --i)
+		  {
+			if (!isTokenChar(buffer[i]))
+			{
+			  lastNonTokenChar = i;
+			  break;
+			}
+		  }
+		  lastCheckedChar = termEnd;
+		}
+	  }
+
+	  /// <summary>
+	  /// Consume one code point. </summary>
+	  private void consume()
+	  {
+		// NOTE(review): char.charCount is a Java Character.charCount leftover;
+		// System.Char has no such member, so this will not compile — the port
+		// needs a surrogate-aware code point width (1 or 2 chars) here.
+		offset += char.charCount(buffer[bufferStart++]);
+	  }
+
+	  /// <summary>
+	  /// Only collect characters which satisfy this condition. </summary>
+	  protected internal virtual bool isTokenChar(int chr)
+	  {
+		return true;
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final void end() throws java.io.IOException
+	  public override void end()
+	  {
+		base.end();
+		Debug.Assert(bufferStart <= bufferEnd);
+		// Advance the offset past any code points still buffered but not
+		// emitted, so the final offset covers the whole input.
+		int endOffset = offset;
+		for (int i = bufferStart; i < bufferEnd; ++i)
+		{
+		  // NOTE(review): same char.charCount Java-ism as in consume().
+		  endOffset += char.charCount(buffer[i]);
+		}
+		endOffset = correctOffset(endOffset);
+		// set final offset
+		offsetAtt.setOffset(endOffset, endOffset);
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		// Point bufferStart/bufferEnd past the buffer so the first call to
+		// incrementToken() immediately triggers a compact + refill.
+		bufferStart = bufferEnd = buffer.Length;
+		lastNonTokenChar = lastCheckedChar = bufferStart - 1;
+		offset = 0;
+		gramSize = minGram;
+		exhausted = false;
+		charBuffer.reset();
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
new file mode 100644
index 0000000..d90b0ad
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs
@@ -0,0 +1,70 @@
+using System.Collections.Generic;
+using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenizerFactory = TokenizerFactory;
+	using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+	using Version = org.apache.lucene.util.Version;
+
+
+	/// <summary>
+	/// Factory for <seealso cref="NGramTokenizer"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class NGramTokenizerFactory : TokenizerFactory
+	{
+	  // Parsed once in the constructor; the factory itself is immutable.
+	  private readonly int maxGramSize;
+	  private readonly int minGramSize;
+
+	  /// <summary>
+	  /// Creates a new NGramTokenizerFactory. Reads the optional
+	  /// "minGramSize"/"maxGramSize" arguments, falling back to the
+	  /// <seealso cref="NGramTokenizer"/> defaults when absent. </summary>
+	  /// <exception cref="System.ArgumentException"> if unrecognized arguments remain </exception>
+	  public NGramTokenizerFactory(IDictionary<string, string> args) : base(args)
+	  {
+		minGramSize = getInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
+		maxGramSize = getInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
+		// Presumably getInt removes the keys it consumes, so anything remaining
+		// here was not recognized — TODO confirm against TokenizerFactory.getInt.
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  /// <summary>
+	  /// Creates the <seealso cref="TokenStream"/> of n-grams from the given <seealso cref="Reader"/> and <seealso cref="AttributeFactory"/>.
+	  /// Version-dependent: LUCENE_44 and later get the current NGramTokenizer;
+	  /// older match versions get Lucene43NGramTokenizer for backwards
+	  /// compatibility. </summary>
+	  public override Tokenizer create(AttributeFactory factory, Reader input)
+	  {
+		if (luceneMatchVersion.onOrAfter(Version.LUCENE_44))
+		{
+		  return new NGramTokenizer(luceneMatchVersion, factory, input, minGramSize, maxGramSize);
+		}
+		else
+		{
+		  return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize);
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
new file mode 100644
index 0000000..a1e5d3f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs
@@ -0,0 +1,231 @@
+using System;
+
+namespace org.apache.lucene.analysis.nl
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+	using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+	using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+	using StemmerOverrideMap = org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+	using StemmerOverrideFilter = org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
+	using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+	using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using StandardAnalyzer = org.apache.lucene.analysis.standard.StandardAnalyzer; // for javadoc
+	using org.apache.lucene.analysis.util;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+	using CharsRef = org.apache.lucene.util.CharsRef;
+	using IOUtils = org.apache.lucene.util.IOUtils;
+	using Version = org.apache.lucene.util.Version;
+
+
+	/// <summary>
+	/// <seealso cref="Analyzer"/> for Dutch language. 
+	/// <para>
+	/// Supports an external list of stopwords (words that
+	/// will not be indexed at all), an external list of exclusions (word that will
+	/// not be stemmed, but indexed) and an external list of word-stem pairs that overrule
+	/// the algorithm (dictionary stemming).
+	/// A default set of stopwords is used unless an alternative list is specified, but the
+	/// exclusion list is empty by default.
+	/// </para>
+	/// 
+	/// <a name="version"/>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating DutchAnalyzer:
+	/// <ul>
+	///   <li> As of 3.6, <seealso cref="#DutchAnalyzer(Version, CharArraySet)"/> and
+	///        <seealso cref="#DutchAnalyzer(Version, CharArraySet, CharArraySet)"/> also populate
+	///        the default entries for the stem override dictionary
+	///   <li> As of 3.1, Snowball stemming is done with SnowballFilter, 
+	///        LowerCaseFilter is used prior to StopFilter, and Snowball 
+	///        stopwords are used by default.
+	///   <li> As of 2.9, StopFilter preserves position
+	///        increments
+	/// </ul>
+	/// 
+	/// </para>
+	/// <para><b>NOTE</b>: This class uses the same <seealso cref="Version"/>
+	/// dependent settings as <seealso cref="StandardAnalyzer"/>.</para>
+	/// </summary>
+	public sealed class DutchAnalyzer : Analyzer
+	{
+
+	  /// <summary>
+	  /// File containing default Dutch stopwords. </summary>
+	  public const string DEFAULT_STOPWORD_FILE = "dutch_stop.txt";
+
+	  /// <summary>
+	  /// Returns an unmodifiable instance of the default stop-words set. </summary>
+	  /// <returns> an unmodifiable instance of the default stop-words set. </returns>
+	  public static CharArraySet DefaultStopSet
+	  {
+		  get
+		  {
+			return DefaultSetHolder.DEFAULT_STOP_SET;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Holder for the default stop set and default stem-override dictionary
+	  /// (the lazy static-holder idiom carried over from the Java original). </summary>
+	  private class DefaultSetHolder
+	  {
+		internal static readonly CharArraySet DEFAULT_STOP_SET;
+		internal static readonly CharArrayMap<string> DEFAULT_STEM_DICT;
+		static DefaultSetHolder()
+		{
+		  try
+		  {
+			// NOTE(review): StandardCharsets is a Java type with no C# equivalent
+			// in scope here; this converter output will not compile until it is
+			// mapped to a .NET encoding.
+			DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(typeof(SnowballFilter), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+		  }
+		  catch (IOException)
+		  {
+			// default set should always be present as it is part of the
+			// distribution (JAR)
+			throw new Exception("Unable to load default stopword set");
+		  }
+
+		  // NOTE(review): 'new CharArrayMap<>' uses Java's diamond operator,
+		  // which is not valid C#; the type argument must be spelled out.
+		  DEFAULT_STEM_DICT = new CharArrayMap<>(Version.LUCENE_CURRENT, 4, false);
+		  DEFAULT_STEM_DICT.put("fiets", "fiets"); //otherwise fiet
+		  DEFAULT_STEM_DICT.put("bromfiets", "bromfiets"); //otherwise bromfiet
+		  DEFAULT_STEM_DICT.put("ei", "eier");
+		  DEFAULT_STEM_DICT.put("kind", "kinder");
+		}
+	  }
+
+
+	  /// <summary>
+	  /// Contains the stopwords used with the StopFilter.
+	  /// </summary>
+	  private readonly CharArraySet stoptable;
+
+	  /// <summary>
+	  /// Contains words that should be indexed but not stemmed.
+	  /// </summary>
+	  private CharArraySet excltable = CharArraySet.EMPTY_SET;
+
+	  // Compiled stem-override map used on the >= 3.1 path; null otherwise.
+	  private readonly StemmerOverrideMap stemdict;
+
+	  // null if on 3.1 or later - only for bw compat
+	  private readonly CharArrayMap<string> origStemdict;
+	  private readonly Version matchVersion;
+
+	  /// <summary>
+	  /// Builds an analyzer with the default stop words (<seealso cref="#getDefaultStopSet()"/>) 
+	  /// and a few default entries for the stem exclusion table.
+	  /// 
+	  /// </summary>
+	  public DutchAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, CharArraySet.EMPTY_SET, DefaultSetHolder.DEFAULT_STEM_DICT)
+	  {
+		// historically, only this ctor populated the stem dict!!!!!
+	  }
+
+	  public DutchAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.emptyMap<string>())
+	  {
+		// historically, this ctor never populated the stem dict!!!!!
+		// so we populate it only for >= 3.6
+	  }
+
+	  public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable) : this(matchVersion, stopwords, stemExclusionTable, matchVersion.onOrAfter(Version.LUCENE_36) ? DefaultSetHolder.DEFAULT_STEM_DICT : CharArrayMap.emptyMap<string>())
+	  {
+		// historically, this ctor never populated the stem dict!!!!!
+		// so we populate it only for >= 3.6
+	  }
+
+	  /// <summary>
+	  /// Builds the analyzer from explicit stopwords, stem exclusions and a
+	  /// stem-override dictionary. On >= 3.1 a non-empty override dictionary is
+	  /// compiled into a <seealso cref="StemmerOverrideMap"/>; otherwise the raw
+	  /// map is kept for the deprecated <seealso cref="DutchStemFilter"/> path. </summary>
+	  public DutchAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionTable, CharArrayMap<string> stemOverrideDict)
+	  {
+		this.matchVersion = matchVersion;
+		this.stoptable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stopwords));
+		this.excltable = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionTable));
+		if (stemOverrideDict.Empty || !matchVersion.onOrAfter(Version.LUCENE_31))
+		{
+		  // Backwards-compatible path: keep the plain map for DutchStemFilter.
+		  this.stemdict = null;
+		  this.origStemdict = CharArrayMap.unmodifiableMap(CharArrayMap.copy(matchVersion, stemOverrideDict));
+		}
+		else
+		{
+		  this.origStemdict = null;
+		  // we don't need to ignore case here since we lowercase in this analyzer anyway
+		  StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
+		  CharArrayMap<string>.EntryIterator iter = stemOverrideDict.entrySet().GetEnumerator();
+		  CharsRef spare = new CharsRef();
+		  // NOTE(review): hasNext()/nextKey() is Java Iterator usage surviving
+		  // the conversion; .NET enumerators use MoveNext()/Current.
+		  while (iter.hasNext())
+		  {
+			char[] nextKey = iter.nextKey();
+			spare.copyChars(nextKey, 0, nextKey.Length);
+			builder.add(spare, iter.currentValue());
+		  }
+		  try
+		  {
+			this.stemdict = builder.build();
+		  }
+		  catch (IOException ex)
+		  {
+			throw new Exception("can not build stem dict", ex);
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Returns a (possibly reused) <seealso cref="TokenStream"/> which tokenizes all the 
+	  /// text in the provided <seealso cref="Reader"/>.
+	  /// </summary>
+	  /// <returns> A <seealso cref="TokenStream"/> built from a <seealso cref="StandardTokenizer"/>
+	  ///   filtered with <seealso cref="StandardFilter"/>, <seealso cref="LowerCaseFilter"/>, 
+	  ///   <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is provided,
+	  ///   <seealso cref="StemmerOverrideFilter"/>, and <seealso cref="SnowballFilter"/> </returns>
+	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader aReader)
+	  {
+		if (matchVersion.onOrAfter(Version.LUCENE_31))
+		{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader);
+		  // Modern chain: lowercase before stopping, Snowball stemmer.
+		  Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+		  TokenStream result = new StandardFilter(matchVersion, source);
+		  result = new LowerCaseFilter(matchVersion, result);
+		  result = new StopFilter(matchVersion, result, stoptable);
+		  if (!excltable.Empty)
+		  {
+			result = new SetKeywordMarkerFilter(result, excltable);
+		  }
+		  if (stemdict != null)
+		  {
+			result = new StemmerOverrideFilter(result, stemdict);
+		  }
+		  result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
+		  return new TokenStreamComponents(source, result);
+		}
+		else
+		{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.analysis.Tokenizer source = new org.apache.lucene.analysis.standard.StandardTokenizer(matchVersion, aReader);
+		  // Pre-3.1 chain: no lowercasing before StopFilter, deprecated stemmer.
+		  Tokenizer source = new StandardTokenizer(matchVersion, aReader);
+		  TokenStream result = new StandardFilter(matchVersion, source);
+		  result = new StopFilter(matchVersion, result, stoptable);
+		  if (!excltable.Empty)
+		  {
+			result = new SetKeywordMarkerFilter(result, excltable);
+		  }
+		  result = new DutchStemFilter(result, origStemdict);
+		  return new TokenStreamComponents(source, result);
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchStemFilter.cs
new file mode 100644
index 0000000..50d1ce1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchStemFilter.cs
@@ -0,0 +1,129 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.nl
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using KeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for javadoc
+	using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+	using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+	/// <summary>
+	/// A <seealso cref="TokenFilter"/> that stems Dutch words. 
+	/// <para>
+	/// It supports a table of words that should
+	/// not be stemmed at all. The stemmer used can be changed at runtime after the
+	/// filter object is created (as long as it is a <seealso cref="DutchStemmer"/>).
+	/// </para>
+	/// <para>
+	/// To prevent terms from being stemmed use an instance of
+	/// <seealso cref="KeywordMarkerFilter"/> or a custom <seealso cref="TokenFilter"/> that sets
+	/// the <seealso cref="KeywordAttribute"/> before this <seealso cref="TokenStream"/>.
+	/// </para> </summary>
+	/// <seealso cref= KeywordMarkerFilter </seealso>
+	/// @deprecated (3.1) Use <seealso cref="SnowballFilter"/> with 
+	/// <seealso cref="org.tartarus.snowball.ext.DutchStemmer"/> instead, which has the
+	/// same functionality. This filter will be removed in Lucene 5.0 
+	// NOTE(review): the Obsolete message below was truncated by the converter
+	// (it embeds unescaped XML from the Javadoc); it should read something like
+	// "(3.1) Use SnowballFilter with DutchStemmer instead".
+	[Obsolete("(3.1) Use <seealso cref="SnowballFilter"/> with")]
+	public sealed class DutchStemFilter : TokenFilter
+	{
+	  /// <summary>
+	  /// The stemmer applied to each non-keyword token; replaceable at runtime
+	  /// via the <c>Stemmer</c> property.
+	  /// </summary>
+	  private DutchStemmer stemmer = new DutchStemmer();
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly KeywordAttribute keywordAttr = addAttribute(typeof(KeywordAttribute));
+
+	  /// <summary>
+	  /// Builds the filter over <paramref name="_in"/> with the default
+	  /// <seealso cref="DutchStemmer"/> and no override dictionary. </summary>
+	  public DutchStemFilter(TokenStream _in) : base(_in)
+	  {
+	  }
+
+	  /// <param name="stemdictionary"> Dictionary of word stem pairs, that overrule the algorithm </param>
+	  // NOTE(review): a generic constructor ('DutchStemFilter<T1>(...)') is not
+	  // legal C# — the converter mangled Java's wildcard signature; this must be
+	  // rewritten (e.g. as a non-generic overload) before it compiles.
+	  public DutchStemFilter<T1>(TokenStream _in, IDictionary<T1> stemdictionary) : this(_in)
+	  {
+		stemmer.StemDictionary = stemdictionary;
+	  }
+
+	  /// <summary>
+	  /// Returns the next token in the stream, or null at EOS.
+	  /// Tokens marked as keywords (via <seealso cref="KeywordAttribute"/>) are
+	  /// passed through unstemmed.
+	  /// </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		if (input.incrementToken())
+		{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final String term = termAtt.toString();
+		  string term = termAtt.ToString();
+
+		  // Check the exclusion table.
+		  if (!keywordAttr.Keyword)
+		  {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final String s = stemmer.stem(term);
+			string s = stemmer.stem(term);
+			// If not stemmed, don't waste the time adjusting the token.
+			if ((s != null) && !s.Equals(term))
+			{
+			  termAtt.setEmpty().append(s);
+			}
+		  }
+		  return true;
+		}
+		else
+		{
+		  return false;
+		}
+	  }
+
+	  /// <summary>
+	  /// Set a alternative/custom <seealso cref="DutchStemmer"/> for this filter.
+	  /// Null values are ignored (the current stemmer is kept).
+	  /// </summary>
+	  public DutchStemmer Stemmer
+	  {
+		  set
+		  {
+			if (value != null)
+			{
+			  this.stemmer = value;
+			}
+		  }
+	  }
+
+	  /// <summary>
+	  /// Set dictionary for stemming, this dictionary overrules the algorithm,
+	  /// so you can correct for a particular unwanted word-stem pair.
+	  /// </summary>
+	  // NOTE(review): a generic property ('Dictionary<T1> StemDictionary<T1>')
+	  // is not legal C#; the converter mangled the Java setter and this member
+	  // must be rewritten (e.g. as a method) before it compiles.
+	  public Dictionary<T1> StemDictionary<T1>
+	  {
+		  set
+		  {
+			if (stemmer != null)
+			{
+			  stemmer.StemDictionary = value;
+			}
+		  }
+	  }
+	}
+}
\ No newline at end of file


Mime
View raw message