lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [32/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:36 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
new file mode 100644
index 0000000..c091904
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizer.cs
@@ -0,0 +1,370 @@
+using System;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+	/// <summary>
+	/// CJKTokenizer is designed for Chinese, Japanese, and Korean languages.
+	/// <para>
+	/// The tokens returned are every two adjacent characters with overlap match.
+	/// </para>
+	/// <para>
+	/// Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4".
+	/// </para>
+	/// Additionally, the following is applied to Latin text (such as English):
+	/// <list type="bullet">
+	/// <item><description>Text is converted to lowercase.</description></item>
+	/// <item><description>Numeric digits, '+', '#', and '_' are tokenized as letters.</description></item>
+	/// <item><description>Full-width forms are converted to half-width forms.</description></item>
+	/// </list>
+	/// For more info on Asian language (Chinese, Japanese, and Korean) text segmentation:
+	/// please search  <a
+	/// href="http://www.google.com/search?q=word+chinese+segment">google</a>
+	/// </summary>
+	/// <remarks>Deprecated: use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.</remarks>
+	[Obsolete("Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.")]
+	public sealed class CJKTokenizer : Tokenizer
+	{
+		//~ Static fields/initializers ---------------------------------------------
+		/// <summary>
+		/// Word token type (the initial value of tokenType) </summary>
+		internal const int WORD_TYPE = 0;
+
+		/// <summary>
+		/// Single byte token type </summary>
+		internal const int SINGLE_TOKEN_TYPE = 1;
+
+		/// <summary>
+		/// Double byte token type </summary>
+		internal const int DOUBLE_TOKEN_TYPE = 2;
+
+		/// <summary>
+		/// Names for token types, indexed by the *_TYPE constants above </summary>
+		internal static readonly string[] TOKEN_TYPE_NAMES = new string[] {"word", "single", "double"};
+
+		/// <summary>
+		/// Max word length; also the capacity of the token buffer </summary>
+		private const int MAX_WORD_LEN = 255;
+
+		/// <summary>
+		/// Size of the I/O buffer that input is read into </summary>
+		private const int IO_BUFFER_SIZE = 256;
+
+		//~ Instance fields --------------------------------------------------------
+
+		/// <summary>
+		/// number of input characters consumed so far (pushed-back characters are un-counted) </summary>
+		private int offset = 0;
+
+		/// <summary>
+		/// index of the next unread character in ioBuffer </summary>
+		private int bufferIndex = 0;
+
+		/// <summary>
+		/// result of the last read into ioBuffer; -1 is treated as end of input </summary>
+		private int dataLen = 0;
+
+		/// <summary>
+		/// character buffer, stores the characters which are used to compose
+		/// the returned Token
+		/// </summary>
+		private readonly char[] buffer = new char[MAX_WORD_LEN];
+
+		/// <summary>
+		/// I/O buffer, used to store the content of the input (one of the
+		/// members of Tokenizer)
+		/// </summary>
+		private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+		/// <summary>
+		/// type of the token being built: single=>Basic Latin or width forms, double=>other letters, word=>default </summary>
+		private int tokenType = WORD_TYPE;
+
+		/// <summary>
+		/// tag: previous character is a cached double-byte character  "C1C2C3C4"
+		/// ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
+		/// C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
+		/// </summary>
+		private bool preIsTokened = false;
+
+		private readonly CharTermAttribute termAtt; // assigned in the constructors: a C# field initializer cannot call an instance method
+		private readonly OffsetAttribute offsetAtt;
+		private readonly TypeAttribute typeAtt;
+		/// <summary> Construct a token stream processing the given input. </summary>
+		/// <param name="in"> I/O reader </param>
+		public CJKTokenizer(Reader @in) : base(@in)
+		{
+			termAtt = addAttribute(typeof(CharTermAttribute));
+			offsetAtt = addAttribute(typeof(OffsetAttribute));
+			typeAtt = addAttribute(typeof(TypeAttribute));
+		}
+
+		/// <summary> Same as above, passing the given attribute factory to the base tokenizer. </summary>
+		public CJKTokenizer(AttributeFactory factory, Reader @in) : base(factory, @in)
+		{
+			termAtt = addAttribute(typeof(CharTermAttribute));
+			offsetAtt = addAttribute(typeof(OffsetAttribute));
+			typeAtt = addAttribute(typeof(TypeAttribute));
+		}
+
+		/// <summary>
+		/// Advances to the next token: either a run of letters, digits, '_', '+'
+		/// and '#' from the Basic Latin or width-forms blocks (lowercased, full-width
+		/// ASCII folded to half-width), or up to two adjacent letters from any other block.
+		/// </summary>
+		/// <remarks>
+		/// Characters are classified by Unicode block range: Basic Latin is
+		/// U+0000..U+007F, Halfwidth and Fullwidth Forms is U+FF00..U+FFEF.
+		/// </remarks>
+		/// <returns> false for end of stream, true otherwise </returns>
+		/// <exception cref="System.IO.IOException"> may propagate from reading the
+		/// input (the reader type is declared elsewhere) </exception>
+		public override bool incrementToken()
+		{
+			clearAttributes();
+			// 'length' (declared per attempt below) counts how many characters
+			// have been stored in buffer for the token being built
+
+			while (true) // loop until we find a non-empty token
+			{
+
+			  int length = 0;
+
+			  // the position used to create the Token; moved forward below once
+			  // the first character of the token is actually seen
+			  int start = offset;
+
+			  while (true) // loop until we've found a full token
+			  {
+				// current character (rewritten in place when a full-width form
+				// is folded to its half-width equivalent)
+				char c;
+
+				// Unicode block membership of c; .NET has no Character.UnicodeBlock,
+				// so the two blocks that matter are checked by code point range
+				bool isBasicLatin, isHalfOrFullWidthForm;
+
+				offset++;
+
+				if (bufferIndex >= dataLen)
+				{
+					dataLen = input.read(ioBuffer);
+					bufferIndex = 0;
+				}
+
+				if (dataLen == -1)
+				{
+					if (length > 0)
+					{
+						if (preIsTokened == true)
+						{
+							length = 0;
+							preIsTokened = false;
+						}
+						else
+						{
+						  offset--;
+						}
+
+						break;
+					}
+					else
+					{
+						offset--;
+						return false;
+					}
+				}
+				else
+				{
+					//get current character
+					c = ioBuffer[bufferIndex++];
+					// classify it by Unicode block (Basic Latin / Halfwidth and Fullwidth Forms)
+					isBasicLatin = c <= '\u007F';
+					isHalfOrFullWidthForm = c >= '\uFF00' && c <= '\uFFEF';
+				}
+
+				//if the current character is ASCII or Extend ASCII
+				if (isBasicLatin || isHalfOrFullWidthForm)
+				{
+					if (isHalfOrFullWidthForm)
+					{
+					  int i = (int) c;
+					  if (i >= 65281 && i <= 65374)
+					  {
+						// convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
+						i = i - 65248;
+						c = (char) i;
+					  }
+					}
+
+					// if the current character is a letter or "_" "+" "#"
+					if (char.IsLetterOrDigit(c) || ((c == '_') || (c == '+') || (c == '#')))
+					{
+						if (length == 0)
+						{
+							// "javaC1C2C3C4linux" <br>
+							//      ^--: the current character begin to token the ASCII
+							// letter
+							start = offset - 1;
+						}
+						else if (tokenType == DOUBLE_TOKEN_TYPE)
+						{
+							// "javaC1C2C3C4linux" <br>
+							//              ^--: the previous non-ASCII
+							// : the current character
+							offset--;
+							bufferIndex--;
+
+							if (preIsTokened == true)
+							{
+								// there is only one non-ASCII has been stored
+								length = 0;
+								preIsTokened = false;
+								break;
+							}
+							else
+							{
+								break;
+							}
+						}
+
+						// store the LowerCase(c) in the buffer (culture-invariant, so the result does not depend on the current culture)
+						buffer[length++] = char.ToLowerInvariant(c);
+						tokenType = SINGLE_TOKEN_TYPE;
+
+						// break the procedure if buffer overflowed!
+						if (length == MAX_WORD_LEN)
+						{
+							break;
+						}
+					}
+					else if (length > 0)
+					{
+						if (preIsTokened == true)
+						{
+							length = 0;
+							preIsTokened = false;
+						}
+						else
+						{
+							break;
+						}
+					}
+				}
+				else
+				{
+					// non-ASCII letter, e.g."C1C2C3C4"
+					if (char.IsLetter(c))
+					{
+						if (length == 0)
+						{
+							start = offset - 1;
+							buffer[length++] = c;
+							tokenType = DOUBLE_TOKEN_TYPE;
+						}
+						else
+						{
+						  if (tokenType == SINGLE_TOKEN_TYPE)
+						  {
+								offset--;
+								bufferIndex--;
+
+								//return the previous ASCII characters
+								break;
+						  }
+							else
+							{
+								buffer[length++] = c;
+								tokenType = DOUBLE_TOKEN_TYPE;
+
+								if (length == 2)
+								{
+									offset--;
+									bufferIndex--;
+									preIsTokened = true;
+
+									break;
+								}
+							}
+						}
+					}
+					else if (length > 0)
+					{
+						if (preIsTokened == true)
+						{
+							// empty the buffer
+							length = 0;
+							preIsTokened = false;
+						}
+						else
+						{
+							break;
+						}
+					}
+				}
+			  }
+
+			if (length > 0)
+			{
+			  termAtt.copyBuffer(buffer, 0, length);
+			  offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
+			  typeAtt.Type = TOKEN_TYPE_NAMES[tokenType];
+			  return true;
+			}
+			else if (dataLen == -1)
+			{
+			  offset--;
+			  return false;
+			}
+
+			// Cycle back and try for the next token (don't
+			// return an empty string)
+			}
+		}
+
+		/// <summary>
+		/// Ends the base stream, then sets start and end offset to the corrected final offset. </summary>
+		public override void end()
+		{
+		  base.end();
+		  // set final offset: 'offset' is incremented for every character
+		  // taken in incrementToken and decremented again on push-back and
+		  // at end of input
+		  int finalOffset = correctOffset(offset);
+		  this.offsetAtt.setOffset(finalOffset, finalOffset);
+		}
+
+		/// <summary>
+		/// Resets the base stream and returns the scanning state to its initial values. </summary>
+		public override void reset()
+		{
+		  base.reset();
+		  offset = bufferIndex = dataLen = 0;
+		  preIsTokened = false;
+		  tokenType = WORD_TYPE;
+		}
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
new file mode 100644
index 0000000..526b1b4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKTokenizerFactory.cs
@@ -0,0 +1,58 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenizerFactory = org.apache.lucene.analysis.util.TokenizerFactory;
+	using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+
+
+	/// <summary>
+	/// Factory for <seealso cref="CJKTokenizer"/>.
+	/// <pre class="prettyprint" >
+	/// &lt;fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.CJKTokenizerFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre> </summary>
+	/// <remarks>Deprecated: use <seealso cref="CJKBigramFilterFactory"/> instead.</remarks>
+	[Obsolete("Use CJKBigramFilterFactory instead.")]
+	public class CJKTokenizerFactory : TokenizerFactory
+	{
+	  /// <summary>
+	  /// Creates a new CJKTokenizerFactory </summary>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is non-empty once the base constructor has run; the message lists the remaining entries </exception>
+	  public CJKTokenizerFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args));
+		}
+	  }
+	  /// <summary> Creates a <seealso cref="CJKTokenizer"/> over <paramref name="in"/>, handing <paramref name="factory"/> to its constructor. </summary>
+	  public override CJKTokenizer create(AttributeFactory factory, Reader @in)
+	  {
+		return new CJKTokenizer(factory, @in);
+	  }
+	}
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
new file mode 100644
index 0000000..8beffcc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilter.cs
@@ -0,0 +1,113 @@
+namespace org.apache.lucene.analysis.cjk
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using StemmerUtil = org.apache.lucene.analysis.util.StemmerUtil;
+
+	/// <summary>
+	/// A <seealso cref="TokenFilter"/> that normalizes CJK width differences:
+	/// <list type="bullet">
+	///   <item><description>Folds fullwidth ASCII variants into the equivalent basic latin</description></item>
+	///   <item><description>Folds halfwidth Katakana variants into the equivalent kana</description></item>
+	/// </list>
+	/// <para>
+	/// NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
+	/// Unicode normalization. See the normalization support in the ICU package
+	/// for full normalization.
+	/// </para>
+	/// </summary>
+	public sealed class CJKWidthFilter : TokenFilter
+	{
+	  // assigned in the constructor: a C# field initializer cannot call an instance method
+	  private readonly CharTermAttribute termAtt;
+
+	  /* halfwidth kana mappings: 0xFF65-0xFF9F
+	   *
+	   * note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
+	   * as a fallback when they cannot properly combine with a preceding
+	   * character into a composed form.
+	   */
+	  private static readonly char[] KANA_NORM = new char[] {'\u30fb', '\u30f2', '\u30a1', '\u30a3', '\u30a5', '\u30a7', '\u30a9', '\u30e3', '\u30e5', '\u30e7', '\u30c3', '\u30fc', '\u30a2', '\u30a4', '\u30a6', '\u30a8', '\u30aa', '\u30ab', '\u30ad', '\u30af', '\u30b1', '\u30b3', '\u30b5', '\u30b7', '\u30b9', '\u30bb', '\u30bd', '\u30bf', '\u30c1', '\u30c4', '\u30c6', '\u30c8', '\u30ca', '\u30cb', '\u30cc', '\u30cd', '\u30ce', '\u30cf', '\u30d2', '\u30d5', '\u30d8', '\u30db', '\u30de', '\u30df', '\u30e0', '\u30e1', '\u30e2', '\u30e4', '\u30e6', '\u30e8', '\u30e9', '\u30ea', '\u30eb', '\u30ec', '\u30ed', '\u30ef', '\u30f3', '\u3099', '\u309A'};
+
+	  /// <summary> Creates a width-normalizing filter over <paramref name="input"/>. </summary>
+	  public CJKWidthFilter(TokenStream input) : base(input)
+	  {
+		termAtt = addAttribute(typeof(CharTermAttribute));
+	  }
+
+	  /// <summary>
+	  /// Advances the wrapped stream and rewrites the term buffer in place. </summary>
+	  /// <returns> false when the wrapped stream has no more tokens, true otherwise </returns>
+	  public override bool incrementToken()
+	  {
+		if (!input.incrementToken())
+		{
+		  return false;
+		}
+		char[] text = termAtt.buffer();
+		int length = termAtt.length();
+		for (int i = 0; i < length; i++)
+		{
+		  char ch = text[i];
+		  if (ch >= 0xFF01 && ch <= 0xFF5E)
+		  {
+			// Fullwidth ASCII variants: shift down to U+0021..U+007E. The explicit
+			// cast is needed because C# does not narrow int to char implicitly.
+			text[i] = (char)(ch - 0xFEE0);
+		  }
+		  else if (ch >= 0xFF65 && ch <= 0xFF9F)
+		  {
+			// Halfwidth Katakana variants
+			if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, ch))
+			{
+			  // the mark was folded into text[i - 1]: delete it (the call returns
+			  // the new length) and step back so the loop's i++ re-visits index i
+			  length = StemmerUtil.delete(text, i--, length);
+			}
+			else
+			{
+			  text[i] = KANA_NORM[ch - 0xFF65];
+			}
+		  }
+		}
+		termAtt.Length = length;
+		return true;
+	  }
+
+	  /* kana combining diffs: 0x30A6-0x30FD */
+	  private static readonly sbyte[] KANA_COMBINE_VOICED = new sbyte[] {78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1};
+
+	  private static readonly sbyte[] KANA_COMBINE_HALF_VOICED = new sbyte[] {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+	  /// <summary>
+	  /// Adds the table diff for the mark <paramref name="ch"/> to text[pos - 1]; returns true if we successfully combined the voice mark </summary>
+	  private static bool combine(char[] text, int pos, char ch)
+	  {
+		char prev = text[pos - 1];
+		if (prev >= 0x30A6 && prev <= 0x30FD)
+		{
+		  text[pos - 1] = (char)(prev + ((ch == 0xFF9F) ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6] : KANA_COMBINE_VOICED[prev - 0x30A6]));
+		  return text[pos - 1] != prev;
+		}
+		return false;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
new file mode 100644
index 0000000..a917f90
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKWidthFilterFactory.cs
@@ -0,0 +1,66 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using AbstractAnalysisFactory = org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+	using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="CJKWidthFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
+	///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+	///     &lt;filter class="solr.CJKBigramFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class CJKWidthFilterFactory : TokenFilterFactory, MultiTermAwareComponent
+	{
+	  /// <summary>
+	  /// Creates a new CJKWidthFilterFactory </summary>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is non-empty once the base constructor has run </exception>
+	  public CJKWidthFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  // list the entries themselves: concatenating the dictionary would only print its type name
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args));
+		}
+	  }
+
+	  /// <summary> Wraps <paramref name="input"/> in a new <seealso cref="CJKWidthFilter"/>. </summary>
+	  public override TokenStream create(TokenStream input)
+	  {
+		return new CJKWidthFilter(input);
+	  }
+
+	  /// <summary> Returns this factory itself as the multi-term component. </summary>
+	  public virtual AbstractAnalysisFactory MultiTermComponent
+	  {
+		  get { return this; }
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs
new file mode 100644
index 0000000..d964550
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniAnalyzer.cs
@@ -0,0 +1,139 @@
+using System;
+
+namespace org.apache.lucene.analysis.ckb
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+	using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+	using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+	using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+	using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader;
+	using IOUtils = org.apache.lucene.util.IOUtils;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// <seealso cref="Analyzer"/> for Sorani Kurdish.
+	/// </summary>
+	public sealed class SoraniAnalyzer : StopwordAnalyzerBase
+	{
+	  private readonly CharArraySet stemExclusionSet;
+
+	  /// <summary>
+	  /// File containing default Kurdish stopwords. </summary>
+	  public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+	  /// <summary>
+	  /// Returns an unmodifiable instance of the default stop words set. </summary>
+	  /// <returns> default stop words set. </returns>
+	  public static CharArraySet DefaultStopSet
+	  {
+		  get
+		  {
+			return DefaultSetHolder.DEFAULT_STOP_SET;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class
+	  /// accesses the static readonly set the first time.
+	  /// </summary>
+	  private static class DefaultSetHolder
+	  {
+		internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+		static DefaultSetHolder()
+		{
+		  try
+		  {
+			// NOTE(review): StandardCharsets is a Java type with no using here; confirm what IOUtils.getDecodingReader expects
+			DEFAULT_STOP_SET = WordlistLoader.getWordSet(IOUtils.getDecodingReader(typeof(SoraniAnalyzer), DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8), Version.LUCENE_CURRENT);
+		  }
+		  catch (System.IO.IOException e)
+		  {
+			// default set should always be present as it is part of the
+			// distribution; keep the original failure as the inner exception
+			throw new InvalidOperationException("Unable to load default stopword set", e);
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the default stop words: <seealso cref="DEFAULT_STOPWORD_FILE"/>.
+	  /// </summary>
+	  public SoraniAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  /// <param name="stopwords"> a stopword set </param>
+	  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+	  /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
+	  /// stemming.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  /// <param name="stopwords"> a stopword set </param>
+	  /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
+	  public SoraniAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
+	  {
+		this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+	  }
+
+	  /// <summary>
+	  /// Creates a
+	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
+	  /// </summary>
+	  /// <returns> A
+	  ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  ///         built from an <seealso cref="StandardTokenizer"/> filtered with
+	  ///         <seealso cref="StandardFilter"/>, <seealso cref="SoraniNormalizationFilter"/>, 
+	  ///         <seealso cref="LowerCaseFilter"/>, <seealso cref="StopFilter"/>
+	  ///         , <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
+	  ///         provided and <seealso cref="SoraniStemFilter"/>. </returns>
+	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+	  {
+		// the filters are chained in the order listed in the returns documentation above
+		Tokenizer source = new StandardTokenizer(matchVersion, reader);
+		TokenStream result = new StandardFilter(matchVersion, source);
+		result = new SoraniNormalizationFilter(result);
+		result = new LowerCaseFilter(matchVersion, result);
+		result = new StopFilter(matchVersion, result, stopwords);
+		if (!stemExclusionSet.Empty)
+		{
+		  result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+		}
+		result = new SoraniStemFilter(result);
+		return new TokenStreamComponents(source, result);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs
new file mode 100644
index 0000000..17133ba
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilter.cs
@@ -0,0 +1,52 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+	/// <summary>
+	/// A <seealso cref="TokenFilter"/> that applies <seealso cref="SoraniNormalizer"/> to normalize the
+	/// orthography.
+	/// </summary>
+	public sealed class SoraniNormalizationFilter : TokenFilter
+	{
+	  private readonly SoraniNormalizer normalizer = new SoraniNormalizer();
+	  private readonly CharTermAttribute termAtt; // assigned in the constructor: a C# field initializer cannot call an instance method
+
+	  /// <summary> Creates a filter that normalizes the terms produced by <paramref name="input"/>. </summary>
+	  public SoraniNormalizationFilter(TokenStream input) : base(input)
+	  {
+		termAtt = addAttribute(typeof(CharTermAttribute));
+	  }
+
+	  /// <summary>
+	  /// Advances the wrapped stream, passes the term buffer to the normalizer and adopts the length it returns; returns false when the wrapped stream has no more tokens. </summary>
+	  public override bool incrementToken()
+	  {
+		if (input.incrementToken())
+		{
+		  int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
+		  termAtt.Length = newlen;
+		  return true;
+		}
+		return false;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs
new file mode 100644
index 0000000..5f68eb7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizationFilterFactory.cs
@@ -0,0 +1,64 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.ckb
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using AbstractAnalysisFactory = org.apache.lucene.analysis.util.AbstractAnalysisFactory;
+	using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="SoraniNormalizationFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_ckbnormal" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class SoraniNormalizationFilterFactory : TokenFilterFactory, MultiTermAwareComponent
+	{
+
+	  /// <summary>
+	  /// Creates a new SoraniNormalizationFilterFactory.
+	  /// </summary>
+	  /// <param name="args"> configuration parameters; any entry still present after the base
+	  /// constructor has run is reported as unknown </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty </exception>
+	  public SoraniNormalizationFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would only print its .NET type name,
+		  // so the offending parameter names are listed explicitly.
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in a new <seealso cref="SoraniNormalizationFilter"/>.
+	  /// </summary>
+	  public override SoraniNormalizationFilter create(TokenStream input)
+	  {
+		return new SoraniNormalizationFilter(input);
+	  }
+
+	  /// <summary>
+	  /// Returns this factory itself as the component to use for multi-term queries.
+	  /// </summary>
+	  public virtual AbstractAnalysisFactory MultiTermComponent
+	  {
+		  get
+		  {
+			return this;
+		  }
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs
new file mode 100644
index 0000000..9c3f551
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniNormalizer.cs
@@ -0,0 +1,140 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+//	import static org.apache.lucene.analysis.util.StemmerUtil.delete;
+
+	/// <summary>
+	/// Normalizes the Unicode representation of Sorani text.
+	/// <para>
+	/// Normalization consists of:
+	/// <ul>
+	///   <li>Alternate forms of 'y' (064A, 0649) are converted to 06CC (FARSI YEH)</li>
+	///   <li>Alternate form of 'k' (0643) is converted to 06A9 (KEHEH)</li>
+	///   <li>Alternate forms of vowel 'e' (0647+200C, word-final 0647, 0629) are converted to 06D5 (AE)</li>
+	///   <li>Alternate (joining) form of 'h' (06BE) is converted to 0647</li>
+	///   <li>Alternate forms of 'rr' (0692, word-initial 0631) are converted to 0695 (REH WITH SMALL V BELOW)</li>
+	///   <li>Harakat, tatweel, and formatting characters such as directional controls are removed.</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public class SoraniNormalizer
+	{
+
+	  internal const char YEH = '\u064A';
+	  internal const char DOTLESS_YEH = '\u0649';
+	  internal const char FARSI_YEH = '\u06CC';
+
+	  internal const char KAF = '\u0643';
+	  internal const char KEHEH = '\u06A9';
+
+	  internal const char HEH = '\u0647';
+	  internal const char AE = '\u06D5';
+	  internal const char ZWNJ = '\u200C';
+	  internal const char HEH_DOACHASHMEE = '\u06BE';
+	  internal const char TEH_MARBUTA = '\u0629';
+
+	  internal const char REH = '\u0631';
+	  internal const char RREH = '\u0695';
+	  internal const char RREH_ABOVE = '\u0692';
+
+	  internal const char TATWEEL = '\u0640';
+	  internal const char FATHATAN = '\u064B';
+	  internal const char DAMMATAN = '\u064C';
+	  internal const char KASRATAN = '\u064D';
+	  internal const char FATHA = '\u064E';
+	  internal const char DAMMA = '\u064F';
+	  internal const char KASRA = '\u0650';
+	  internal const char SHADDA = '\u0651';
+	  internal const char SUKUN = '\u0652';
+
+	  /// <summary>
+	  /// Normalize an input buffer of Sorani text. The buffer is modified in place.
+	  /// </summary>
+	  /// <param name="s"> input buffer </param>
+	  /// <param name="len"> length of input buffer </param>
+	  /// <returns> length of input buffer after normalization </returns>
+	  public virtual int normalize(char[] s, int len)
+	  {
+		// NOTE(review): 'delete' is not defined in this class; presumably it is the helper the
+		// converter could not statically import (StemmerUtil.delete) - confirm it is in scope.
+		for (int i = 0; i < len; i++)
+		{
+		  switch (s[i])
+		  {
+			case YEH:
+			case DOTLESS_YEH:
+			  s[i] = FARSI_YEH;
+			  break;
+			case KAF:
+			  s[i] = KEHEH;
+			  break;
+			case ZWNJ:
+			  // HEH followed by ZWNJ is a spelling of the vowel AE; the ZWNJ itself is always dropped.
+			  if (i > 0 && s[i - 1] == HEH)
+			  {
+				s[i - 1] = AE;
+			  }
+			  len = delete(s, i, len);
+			  i--; // re-examine the character that moved into slot i
+			  break;
+			case HEH:
+			  // Only a HEH in the last position of the current buffer becomes AE.
+			  if (i == len - 1)
+			  {
+				s[i] = AE;
+			  }
+			  break;
+			case TEH_MARBUTA:
+			  s[i] = AE;
+			  break;
+			case HEH_DOACHASHMEE:
+			  s[i] = HEH;
+			  break;
+			case REH:
+			  // Only a REH in the first position becomes RREH.
+			  if (i == 0)
+			  {
+				s[i] = RREH;
+			  }
+			  break;
+			case RREH_ABOVE:
+			  s[i] = RREH;
+			  break;
+			case TATWEEL:
+			case KASRATAN:
+			case DAMMATAN:
+			case FATHATAN:
+			case FATHA:
+			case DAMMA:
+			case KASRA:
+			case SHADDA:
+			case SUKUN:
+			  len = delete(s, i, len);
+			  i--; // re-examine the character that moved into slot i
+			  break;
+			default:
+			  // Java's Character.getType(c) == Character.FORMAT corresponds to the
+			  // Unicode "Cf" category, which .NET exposes as UnicodeCategory.Format.
+			  if (char.GetUnicodeCategory(s[i]) == System.Globalization.UnicodeCategory.Format)
+			  {
+				len = delete(s, i, len);
+				i--; // re-examine the character that moved into slot i
+			  }
+			  break;
+		  }
+		}
+		return len;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs
new file mode 100644
index 0000000..5d79be0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilter.cs
@@ -0,0 +1,66 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; // javadoc @link
+	using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+	/// <summary>
+	/// A <seealso cref="TokenFilter"/> that applies <seealso cref="SoraniStemmer"/> to stem Sorani words.
+	/// <para>
+	/// To prevent terms from being stemmed use an instance of
+	/// <seealso cref="SetKeywordMarkerFilter"/> or a custom <seealso cref="TokenFilter"/> that sets
+	/// the <seealso cref="KeywordAttribute"/> before this <seealso cref="TokenStream"/>.
+	/// </para> </summary>
+	/// <seealso cref="SetKeywordMarkerFilter"/>
+
+	public sealed class SoraniStemFilter : TokenFilter
+	{
+	  private readonly SoraniStemmer stemmer = new SoraniStemmer();
+	  private readonly CharTermAttribute termAtt;
+	  private readonly KeywordAttribute keywordAttr;
+
+	  /// <summary>
+	  /// Creates a filter that stems the terms of <paramref name="input"/>.
+	  /// </summary>
+	  /// <param name="input"> the token stream to wrap </param>
+	  public SoraniStemFilter(TokenStream input) : base(input)
+	  {
+		// Registered in the constructor body: a C# field initializer may not call an
+		// instance method such as addAttribute (compiler error CS0236).
+		termAtt = addAttribute(typeof(CharTermAttribute));
+		keywordAttr = addAttribute(typeof(KeywordAttribute));
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  /// <summary>
+	  /// Advances the wrapped stream by one token and stems its term unless the token
+	  /// is flagged as a keyword.
+	  /// </summary>
+	  /// <returns> <c>true</c> if a token was produced, <c>false</c> once the wrapped stream is exhausted </returns>
+	  public override bool incrementToken()
+	  {
+		if (input.incrementToken())
+		{
+		  // Keyword-marked tokens pass through untouched.
+		  if (!keywordAttr.Keyword)
+		  {
+			// The stemmer only reports a new length; the term is shortened by truncation.
+			int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+			termAtt.Length = newlen;
+		  }
+		  return true;
+		}
+		else
+		{
+		  return false;
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs
new file mode 100644
index 0000000..67018ad
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.ckb
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="SoraniStemFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_ckbstem" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.SoraniNormalizationFilterFactory"/&gt;
+	///     &lt;filter class="solr.SoraniStemFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class SoraniStemFilterFactory : TokenFilterFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new SoraniStemFilterFactory.
+	  /// </summary>
+	  /// <param name="args"> configuration parameters; any entry still present after the base
+	  /// constructor has run is reported as unknown </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty </exception>
+	  public SoraniStemFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would only print its .NET type name,
+		  // so the offending parameter names are listed explicitly.
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in a new <seealso cref="SoraniStemFilter"/>.
+	  /// </summary>
+	  public override SoraniStemFilter create(TokenStream input)
+	  {
+		return new SoraniStemFilter(input);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs
new file mode 100644
index 0000000..4ec57cb
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ckb/SoraniStemmer.cs
@@ -0,0 +1,139 @@
+namespace org.apache.lucene.analysis.ckb
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+//	import static org.apache.lucene.analysis.util.StemmerUtil.endsWith;
+
+	/// <summary>
+	/// Light stemmer for Sorani. It never modifies the buffer; it only computes how many
+	/// leading characters remain once recognized suffixes are cut off.
+	/// </summary>
+	public class SoraniStemmer
+	{
+
+	  /// <summary>
+	  /// Stem an input buffer of Sorani text.
+	  /// </summary>
+	  /// <param name="s"> input buffer </param>
+	  /// <param name="len"> length of input buffer </param>
+	  /// <returns> length of input buffer after stemming </returns>
+	  public virtual int stem(char[] s, int len)
+	  {
+		// NOTE(review): 'endsWith' is not defined in this class; presumably it is the helper the
+		// converter could not statically import (StemmerUtil.endsWith) - confirm it is in scope.
+		// Every rule carries a minimum-length guard, so a suffix is only removed from words
+		// longer than that threshold.
+
+		// postposition (at most one is removed; processing then continues below)
+		if (len > 5 && endsWith(s, len, "دا"))
+		{
+		  len -= 2;
+		}
+		else if (len > 4 && endsWith(s, len, "نا"))
+		{
+		  // only the final character of this two-character ending is dropped
+		  len--;
+		}
+		else if (len > 6 && endsWith(s, len, "ەوە"))
+		{
+		  len -= 3;
+		}
+
+		// possessive pronoun (removed in addition to any postposition; processing continues)
+		if (len > 6 && (endsWith(s, len, "مان") || endsWith(s, len, "یان") || endsWith(s, len, "تان")))
+		{
+		  len -= 3;
+		}
+
+		// From here on the first matching rule returns immediately, so longer endings
+		// are tested before the shorter endings they contain.
+
+		// indefinite singular ezafe
+		if (len > 6 && endsWith(s, len, "ێکی"))
+		{
+		  return len - 3;
+		}
+		else if (len > 7 && endsWith(s, len, "یەکی"))
+		{
+		  return len - 4;
+		}
+		// indefinite singular
+		if (len > 5 && endsWith(s, len, "ێک"))
+		{
+		  return len - 2;
+		}
+		else if (len > 6 && endsWith(s, len, "یەک"))
+		{
+		  return len - 3;
+		}
+		// definite singular
+		else if (len > 6 && endsWith(s, len, "ەکە"))
+		{
+		  return len - 3;
+		}
+		else if (len > 5 && endsWith(s, len, "کە"))
+		{
+		  return len - 2;
+		}
+		// definite plural
+		else if (len > 7 && endsWith(s, len, "ەکان"))
+		{
+		  return len - 4;
+		}
+		else if (len > 6 && endsWith(s, len, "کان"))
+		{
+		  return len - 3;
+		}
+		// indefinite plural ezafe
+		else if (len > 7 && endsWith(s, len, "یانی"))
+		{
+		  return len - 4;
+		}
+		else if (len > 6 && endsWith(s, len, "انی"))
+		{
+		  return len - 3;
+		}
+		// indefinite plural
+		else if (len > 6 && endsWith(s, len, "یان"))
+		{
+		  return len - 3;
+		}
+		else if (len > 5 && endsWith(s, len, "ان"))
+		{
+		  return len - 2;
+		}
+		// demonstrative plural
+		else if (len > 7 && endsWith(s, len, "یانە"))
+		{
+		  return len - 4;
+		}
+		else if (len > 6 && endsWith(s, len, "انە"))
+		{
+		  return len - 3;
+		}
+		// demonstrative singular
+		else if (len > 5 && (endsWith(s, len, "ایە") || endsWith(s, len, "ەیە")))
+		{
+		  // only the last two characters of the three-character ending are dropped
+		  return len - 2;
+		}
+		else if (len > 4 && endsWith(s, len, "ە"))
+		{
+		  return len - 1;
+		}
+		// absolute singular ezafe
+		else if (len > 4 && endsWith(s, len, "ی"))
+		{
+		  return len - 1;
+		}
+		// no suffix rule matched in the final stage
+		return len;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs
new file mode 100644
index 0000000..9023664
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseAnalyzer.cs
@@ -0,0 +1,49 @@
+using System;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using StandardAnalyzer = org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
+
+	/// <summary>
+	/// An <seealso cref="Analyzer"/> that tokenizes text with <seealso cref="ChineseTokenizer"/> and
+	/// filters with <seealso cref="ChineseFilter"/> </summary>
+	/// <remarks>
+	/// Deprecated (3.1): use <seealso cref="StandardAnalyzer"/> instead, which has the same functionality.
+	/// This analyzer will be removed in Lucene 5.0
+	/// </remarks>
+	[Obsolete("(3.1) Use StandardAnalyzer instead, which has the same functionality.")]
+	public sealed class ChineseAnalyzer : Analyzer
+	{
+		/// <summary>
+		/// Creates
+		/// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+		/// used to tokenize all the text in the provided <seealso cref="Reader"/>.
+		/// </summary>
+		/// <param name="fieldName"> name of the field being analyzed (not used by this analyzer) </param>
+		/// <param name="reader"> source of the text to tokenize </param>
+		/// <returns> <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+		///         built from a <seealso cref="ChineseTokenizer"/> filtered with
+		///         <seealso cref="ChineseFilter"/> </returns>
+		protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+		{
+		  // The tokenizer is both the source of the chain and the input of the filter.
+		  Tokenizer source = new ChineseTokenizer(reader);
+		  return new TokenStreamComponents(source, new ChineseFilter(source));
+		}
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs
new file mode 100644
index 0000000..a631a04
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilter.cs
@@ -0,0 +1,104 @@
+using System;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// A <seealso cref="TokenFilter"/> with a stop word table.  
+	/// <ul>
+	/// <li>Numeric tokens are removed.</li>
+	/// <li>English tokens must be larger than 1 character.</li>
+	/// <li>One Chinese character as one Chinese word.</li>
+	/// </ul>
+	/// TO DO:
+	/// <ol>
+	/// <li>Add Chinese stop words, such as \ue400</li>
+	/// <li>Dictionary based Chinese word extraction</li>
+	/// <li>Intelligent Chinese word extraction</li>
+	/// </ol>
+	/// </summary>
+	/// <remarks>
+	/// Deprecated (3.1): use <seealso cref="StopFilter"/> instead, which has the same functionality.
+	/// This filter will be removed in Lucene 5.0
+	/// </remarks>
+	[Obsolete("(3.1) Use StopFilter instead, which has the same functionality.")]
+	public sealed class ChineseFilter : TokenFilter
+	{
+
+
+		// Only English now, Chinese to be added later.
+		public static readonly string[] STOP_WORDS = new string[] {"and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"};
+
+
+		private CharArraySet stopTable;
+
+		private CharTermAttribute termAtt;
+
+		/// <summary>
+		/// Creates a filter over <paramref name="in"/> using <see cref="STOP_WORDS"/> as the stop table.
+		/// </summary>
+		/// <param name="in"> the token stream to wrap </param>
+		public ChineseFilter(TokenStream @in) : base(@in)
+		{
+			// Registered in the constructor body: a C# field initializer may not call an
+			// instance method such as addAttribute (compiler error CS0236).
+			termAtt = addAttribute(typeof(CharTermAttribute));
+
+			// NOTE(review): Arrays.asList is a Java API left by the converter - confirm what the
+			// ported CharArraySet constructor expects here.
+			stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
+		}
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+		/// <summary>
+		/// Advances to the next token that is not a stop word and whose first character is
+		/// either a cased letter (token longer than one character) or an "other letter".
+		/// All other tokens are skipped.
+		/// </summary>
+		/// <returns> <c>true</c> if such a token was found, <c>false</c> once the wrapped stream is exhausted </returns>
+		public override bool incrementToken()
+		{
+
+			while (input.incrementToken())
+			{
+				char[] text = termAtt.buffer();
+				int termLength = termAtt.length();
+
+			  // why not key off token type here assuming ChineseTokenizer comes first?
+				if (!stopTable.contains(text, 0, termLength))
+				{
+					// Java's Character.getType constants map onto System.Globalization.UnicodeCategory.
+					switch (char.GetUnicodeCategory(text[0]))
+					{
+
+					case System.Globalization.UnicodeCategory.LowercaseLetter:
+					case System.Globalization.UnicodeCategory.UppercaseLetter:
+
+						// English word/token should larger than 1 character.
+						if (termLength > 1)
+						{
+							return true;
+						}
+						break;
+					case System.Globalization.UnicodeCategory.OtherLetter:
+
+						// One Chinese character as one Chinese word.
+						// Chinese word extraction to be added later here.
+
+						return true;
+					}
+
+				}
+
+			}
+			return false;
+		}
+
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs
new file mode 100644
index 0000000..8e496d7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseFilterFactory.cs
@@ -0,0 +1,51 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using StopFilterFactory = org.apache.lucene.analysis.core.StopFilterFactory; // javadocs
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="ChineseFilter"/> </summary>
+	/// <remarks>
+	/// Deprecated: use <seealso cref="StopFilterFactory"/> instead.
+	/// </remarks>
+	[Obsolete("Use StopFilterFactory instead.")]
+	public class ChineseFilterFactory : TokenFilterFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new ChineseFilterFactory.
+	  /// </summary>
+	  /// <param name="args"> configuration parameters; any entry still present after the base
+	  /// constructor has run is reported as unknown </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty </exception>
+	  public ChineseFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would only print its .NET type name,
+		  // so the offending parameter names are listed explicitly.
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="in"/> in a new <seealso cref="ChineseFilter"/>.
+	  /// </summary>
+	  public override ChineseFilter create(TokenStream @in)
+	  {
+		return new ChineseFilter(@in);
+	  }
+	}
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs
new file mode 100644
index 0000000..b2fb638
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizer.cs
@@ -0,0 +1,199 @@
+using System;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+
+
+	/// <summary>
+	/// Tokenize Chinese text as individual chinese characters.
+	/// 
+	/// <para>
+	/// The difference between ChineseTokenizer and
+	/// CJKTokenizer is that they have different
+	/// token parsing logic.
+	/// </para>
+	/// <para>
+	/// For example, if the Chinese text
+	/// "C1C2C3C4" is to be indexed:
+	/// <ul>
+	/// <li>The tokens returned from ChineseTokenizer are C1, C2, C3, C4.</li>
+	/// <li>The tokens returned from the CJKTokenizer are C1C2, C2C3, C3C4.</li>
+	/// </ul>
+	/// </para>
+	/// <para>
+	/// Therefore the index created by CJKTokenizer is much larger.
+	/// </para>
+	/// <para>
+	/// The problem is that when searching for C1, C1C2, C1C3,
+	/// C4C2, C1C2C3 ... the ChineseTokenizer works, but the
+	/// CJKTokenizer will not work.
+	/// </para> </summary>
+	/// <remarks>
+	/// Deprecated (3.1): use <seealso cref="StandardTokenizer"/> instead, which has the same functionality.
+	/// This filter will be removed in Lucene 5.0
+	/// </remarks>
+	[Obsolete("(3.1) Use StandardTokenizer instead, which has the same functionality.")]
+	public sealed class ChineseTokenizer : Tokenizer
+	{
+
+
+		/// <summary>
+		/// Creates a tokenizer reading from <paramref name="in"/>.
+		/// </summary>
+		public ChineseTokenizer(Reader @in) : base(@in)
+		{
+			// Registered in the constructor body: a C# field initializer may not call an
+			// instance method such as addAttribute (compiler error CS0236).
+			termAtt = addAttribute(typeof(CharTermAttribute));
+			offsetAtt = addAttribute(typeof(OffsetAttribute));
+		}
+
+		/// <summary>
+		/// Creates a tokenizer reading from <paramref name="in"/> using the given attribute factory.
+		/// </summary>
+		public ChineseTokenizer(AttributeFactory factory, Reader @in) : base(factory, @in)
+		{
+			// See the single-argument constructor for why this is not a field initializer.
+			termAtt = addAttribute(typeof(CharTermAttribute));
+			offsetAtt = addAttribute(typeof(OffsetAttribute));
+		}
+
+		// offset: number of characters consumed so far; bufferIndex/dataLen: read cursor and
+		// fill level of ioBuffer.
+		private int offset = 0, bufferIndex = 0, dataLen = 0;
+		private const int MAX_WORD_LEN = 255;
+		private const int IO_BUFFER_SIZE = 1024;
+		private readonly char[] buffer = new char[MAX_WORD_LEN];
+		private readonly char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+
+		// length/start: size and start offset of the token currently being assembled in buffer.
+		private int length;
+		private int start;
+
+		private readonly CharTermAttribute termAtt;
+		private readonly OffsetAttribute offsetAtt;
+
+		/// <summary>
+		/// Appends the lower-cased form of <paramref name="c"/> to the current token, recording
+		/// the token start offset when it is the first character.
+		/// </summary>
+		private void push(char c)
+		{
+
+			if (length == 0) // start of token
+			{
+				// offset has already been advanced past c by the caller
+				start = offset - 1;
+			}
+			// Invariant lower-casing: char.ToLower depends on the current culture, whereas the
+			// Java original (Character.toLowerCase) was locale-independent.
+			buffer[length++] = char.ToLowerInvariant(c); // buffer it
+
+		}
+
+		/// <summary>
+		/// Publishes the buffered characters as the current token.
+		/// </summary>
+		/// <returns> <c>true</c> if a token was emitted, <c>false</c> if nothing was buffered </returns>
+		private bool flush()
+		{
+
+			if (length > 0)
+			{
+			  termAtt.copyBuffer(buffer, 0, length);
+			  offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
+			  return true;
+			}
+			else
+			{
+				return false;
+			}
+		}
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+		/// <summary>
+		/// Produces the next token: a run of digits and cased letters (at most
+		/// <see cref="MAX_WORD_LEN"/> characters), or a single "other letter" character.
+		/// Characters of any other category act as separators.
+		/// </summary>
+		/// <returns> <c>true</c> if a token was produced, <c>false</c> at end of input </returns>
+		public override bool incrementToken()
+		{
+			clearAttributes();
+
+			length = 0;
+			start = offset;
+
+
+			while (true)
+			{
+				offset++;
+
+				if (bufferIndex >= dataLen)
+				{
+					// NOTE(review): the -1 end-of-input sentinel below follows java.io.Reader.read;
+					// confirm the ported Reader keeps that contract (System.IO.TextReader returns 0).
+					dataLen = input.read(ioBuffer);
+					bufferIndex = 0;
+				}
+
+				if (dataLen == -1)
+				{
+				  // undo the speculative advance, then emit whatever is still buffered
+				  offset--;
+				  return flush();
+				}
+
+				char c = ioBuffer[bufferIndex++];
+
+
+				// Java's Character.getType constants map onto System.Globalization.UnicodeCategory.
+				switch (char.GetUnicodeCategory(c))
+				{
+
+				case System.Globalization.UnicodeCategory.DecimalDigitNumber:
+				case System.Globalization.UnicodeCategory.LowercaseLetter:
+				case System.Globalization.UnicodeCategory.UppercaseLetter:
+					push(c);
+					if (length == MAX_WORD_LEN)
+					{
+						return flush();
+					}
+					break;
+
+				case System.Globalization.UnicodeCategory.OtherLetter:
+					if (length > 0)
+					{
+						// A word is pending: un-read this character so it starts the next token.
+						bufferIndex--;
+						offset--;
+						return flush();
+					}
+					push(c);
+					return flush();
+
+				default:
+					// separator: ends a pending token, otherwise it is skipped
+					if (length > 0)
+					{
+						return flush();
+					}
+					break;
+				}
+			}
+		}
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final void end() throws java.io.IOException
+		/// <summary>
+		/// Sets the final offset to the corrected number of characters consumed.
+		/// </summary>
+		public override void end()
+		{
+		  base.end();
+		  // set final offset
+		  int finalOffset = correctOffset(offset);
+		  this.offsetAtt.setOffset(finalOffset, finalOffset);
+		}
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+		/// <summary>
+		/// Resets the consumed-character count and the read buffer state.
+		/// </summary>
+		public override void reset()
+		{
+		  base.reset();
+		  offset = bufferIndex = dataLen = 0;
+		}
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs
new file mode 100644
index 0000000..3abb93f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cn/ChineseTokenizerFactory.cs
@@ -0,0 +1,52 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cn
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using TokenizerFactory = org.apache.lucene.analysis.util.TokenizerFactory;
+	using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="ChineseTokenizer"/>.
+	/// <para>
+	/// Deprecated: use <seealso cref="org.apache.lucene.analysis.standard.StandardTokenizerFactory"/> instead.
+	/// </para>
+	/// </summary>
+	[Obsolete("Use org.apache.lucene.analysis.standard.StandardTokenizerFactory instead.")]
+	public class ChineseTokenizerFactory : TokenizerFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new ChineseTokenizerFactory.
+	  /// </summary>
+	  /// <param name="args"> Factory arguments; they are handed to the base constructor first. </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty once the base constructor has returned </exception>
+	  public ChineseTokenizerFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would rely on its ToString(), which by default
+		  // prints only the type name; list the offending keys instead.
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+		}
+	  }
+
+	  /// <summary>
+	  /// Creates a <seealso cref="ChineseTokenizer"/> from the given attribute factory and reader.
+	  /// </summary>
+	  /// <param name="factory"> Attribute factory passed to the tokenizer constructor. </param>
+	  /// <param name="in"> Reader passed to the tokenizer constructor. </param>
+	  /// <returns> A new <seealso cref="ChineseTokenizer"/>. </returns>
+	  public override ChineseTokenizer create(AttributeFactory factory, Reader @in)
+	  {
+		return new ChineseTokenizer(factory, @in);
+	  }
+	}
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs
new file mode 100644
index 0000000..2b97da8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilter.cs
@@ -0,0 +1,199 @@
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.commongrams
+{
+
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+	using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using Version = org.apache.lucene.util.Version;
+
+	/*
+	 * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors 
+	 */
+
+	/// <summary>
+	/// Construct bigrams for frequently occurring terms while indexing. Single terms
+	/// are still indexed too, with bigrams overlaid. This is achieved by giving each
+	/// bigram a position increment of 0 (see <seealso cref="PositionIncrementAttribute"/>).
+	/// Bigrams have a type of <seealso cref="GRAM_TYPE"/> and join their two terms with '_'.
+	/// Example, with "the" as the only common word:
+	/// <ul>
+	/// <li>input:"the quick brown fox"</li>
+	/// <li>output:|"the","the_quick"|"quick"|"brown"|"fox"|</li>
+	/// <li>"the_quick" has a position increment of 0 so it is in the same position
+	/// as "the"; "the_quick" has a type of "gram"</li>
+	/// </ul>
+	/// </summary>
+
+	/*
+	 * Constructors and makeCommonSet based on similar code in StopFilter
+	 */
+	public sealed class CommonGramsFilter : TokenFilter
+	{
+
+	  /// <summary> Token type assigned to every bigram produced by this filter. </summary>
+	  public const string GRAM_TYPE = "gram";
+
+	  // Character placed between the two terms of a bigram.
+	  private const char SEPARATOR = '_';
+
+	  private readonly CharArraySet commonWords;
+
+	  // Holds "previousTerm + SEPARATOR" between calls; empty when there is no left part for a gram.
+	  private readonly StringBuilder buffer = new StringBuilder();
+
+	  private readonly CharTermAttribute termAttribute;
+	  private readonly OffsetAttribute offsetAttribute;
+	  private readonly TypeAttribute typeAttribute;
+	  private readonly PositionIncrementAttribute posIncAttribute;
+	  private readonly PositionLengthAttribute posLenAttribute;
+
+	  private int lastStartOffset;
+	  private bool lastWasCommon;
+	  private State savedState;
+
+	  /// <summary>
+	  /// Construct a token stream filtering the given input using a Set of common
+	  /// words to create bigrams. Outputs both unigrams with position increment and
+	  /// bigrams with position increment 0 type=gram where one or both of the words
+	  /// in a potential bigram are in the set of common words .
+	  /// </summary>
+	  /// <param name="matchVersion"> Lucene version; not used by this constructor. </param>
+	  /// <param name="input"> TokenStream input in filter chain </param>
+	  /// <param name="commonWords"> The set of common words. </param>
+	  public CommonGramsFilter(Version matchVersion, TokenStream input, CharArraySet commonWords) : base(input)
+	  {
+		// C# field initializers may not call instance methods, so the attributes are
+		// registered here, right after the base constructor has run.
+		termAttribute = addAttribute(typeof(CharTermAttribute));
+		offsetAttribute = addAttribute(typeof(OffsetAttribute));
+		typeAttribute = addAttribute(typeof(TypeAttribute));
+		posIncAttribute = addAttribute(typeof(PositionIncrementAttribute));
+		posLenAttribute = addAttribute(typeof(PositionLengthAttribute));
+		this.commonWords = commonWords;
+	  }
+
+	  /// <summary>
+	  /// Inserts bigrams for common words into a token stream. For each input token,
+	  /// output the token. If the token and/or the following token are in the list
+	  /// of common words also output a bigram with position increment 0 and
+	  /// type="gram"
+	  /// 
+	  /// TODO:Consider adding an option to not emit unigram stopwords
+	  /// as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
+	  /// changed to work with this.
+	  /// 
+	  /// TODO: Consider optimizing for the case of three
+	  /// commongrams i.e "man of the year" normally produces 3 bigrams: "man_of",
+	  /// "of_the", "the_year" but with proper management of positions we could
+	  /// eliminate the middle bigram "of_the" and save a disk seek and a whole set of
+	  /// position lookups.
+	  /// </summary>
+	  /// <returns> <c>true</c> if a token (unigram or bigram) was produced, <c>false</c> when the input is exhausted </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		// get the next piece of input
+		if (savedState != null)
+		{
+		  // a bigram was emitted on the previous call; now emit the unigram that was set aside
+		  restoreState(savedState);
+		  savedState = null;
+		  saveTermBuffer();
+		  return true;
+		}
+		else if (!input.incrementToken())
+		{
+			return false;
+		}
+
+		/* We build n-grams before and after stopwords. 
+		 * When valid, the buffer always contains at least the separator.
+		 * If its empty, there is nothing before this stopword.
+		 */
+		if (lastWasCommon || (Common && buffer.Length > 0))
+		{
+		  // set the current unigram aside and emit the bigram first
+		  savedState = captureState();
+		  gramToken();
+		  return true;
+		}
+
+		saveTermBuffer();
+		return true;
+	  }
+
+	  /// <summary>
+	  /// Resets the base filter and clears the pending state, the common-word flag and the term buffer.
+	  /// </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		lastWasCommon = false;
+		savedState = null;
+		buffer.Length = 0;
+	  }
+
+	  // ================================================= Helper Methods ================================================
+
+	  /// <summary>
+	  /// Determines if the current token is a common term
+	  /// </summary>
+	  /// <returns> <c>true</c> if the current token is a common term, <c>false</c> otherwise (including when no common-word set was given) </returns>
+	  private bool Common
+	  {
+		  get
+		  {
+			return commonWords != null && commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
+		  }
+	  }
+
+	  /// <summary>
+	  /// Saves this information to form the left part of a gram
+	  /// </summary>
+	  private void saveTermBuffer()
+	  {
+		buffer.Length = 0;
+		buffer.Append(termAttribute.buffer(), 0, termAttribute.length());
+		buffer.Append(SEPARATOR);
+		lastStartOffset = offsetAttribute.startOffset();
+		lastWasCommon = Common;
+	  }
+
+	  /// <summary>
+	  /// Constructs a compound token.
+	  /// </summary>
+	  private void gramToken()
+	  {
+		// buffer already holds "left term + SEPARATOR"; append the current term as the right part
+		buffer.Append(termAttribute.buffer(), 0, termAttribute.length());
+		int endOffset = offsetAttribute.endOffset();
+
+		clearAttributes();
+
+		int length = buffer.Length;
+		char[] termText = termAttribute.buffer();
+		if (length > termText.Length)
+		{
+		  termText = termAttribute.resizeBuffer(length);
+		}
+
+		// StringBuilder has no getChars in .NET; CopyTo(sourceIndex, destination, destinationIndex, count) is the equivalent
+		buffer.CopyTo(0, termText, 0, length);
+		termAttribute.Length = length;
+		posIncAttribute.PositionIncrement = 0;
+		posLenAttribute.PositionLength = 2; // bigram
+		offsetAttribute.setOffset(lastStartOffset, endOffset);
+		typeAttribute.Type = GRAM_TYPE;
+		buffer.Length = 0;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs
new file mode 100644
index 0000000..2233e83
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsFilterFactory.cs
@@ -0,0 +1,104 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Util;
+using org.apache.lucene.analysis.commongrams;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.CommonGrams
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// Constructs a <seealso cref="CommonGramsFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class CommonGramsFilterFactory : TokenFilterFactory, ResourceLoaderAware
+	{
+	  // TODO: shared base class for Stop/Keep/CommonGrams? 
+	  private CharArraySet commonWords;
+	  private readonly string commonWordFiles;
+	  private readonly string format;
+	  private readonly bool ignoreCase;
+
+	  /// <summary>
+	  /// Creates a new CommonGramsFilterFactory, reading the "words", "format" and
+	  /// "ignoreCase" (default <c>false</c>) arguments.
+	  /// </summary>
+	  /// <param name="args"> Factory arguments; they are handed to the base constructor first. </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty after the known arguments have been read </exception>
+	  public CommonGramsFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		commonWordFiles = get(args, "words");
+		format = get(args, "format");
+		ignoreCase = getBoolean(args, "ignoreCase", false);
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would rely on its ToString(), which by default
+		  // prints only the type name; list the offending keys instead.
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+		}
+	  }
+
+	  /// <summary>
+	  /// Loads the common-word set. When "words" was given, the files are read in
+	  /// snowball format if "format" equals "snowball" (ignoring case) and as a plain
+	  /// word set otherwise; without "words" the English stop word set is used.
+	  /// </summary>
+	  /// <param name="loader"> Resource loader used to read the word files. </param>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void inform(ResourceLoader loader) throws java.io.IOException
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		if (commonWordFiles != null)
+		{
+		  // "snowball" is a configuration keyword, not user-facing text: compare ordinally
+		  // so the result does not depend on the current culture.
+		  if ("snowball".Equals(format, StringComparison.OrdinalIgnoreCase))
+		  {
+			commonWords = getSnowballWordSet(loader, commonWordFiles, ignoreCase);
+		  }
+		  else
+		  {
+			commonWords = GetWordSet(loader, commonWordFiles, ignoreCase);
+		  }
+		}
+		else
+		{
+		  commonWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+		}
+	  }
+
+	  /// <summary> The value of the "ignoreCase" argument. </summary>
+	  public virtual bool IgnoreCase
+	  {
+		  get
+		  {
+			return ignoreCase;
+		  }
+	  }
+
+	  /// <summary> The common-word set; it is only assigned by <c>inform</c>. </summary>
+	  public virtual CharArraySet CommonWords
+	  {
+		  get
+		  {
+			return commonWords;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in a new <seealso cref="CommonGramsFilter"/> that uses the current common-word set.
+	  /// </summary>
+	  /// <param name="input"> Token stream to filter. </param>
+	  /// <returns> The new <seealso cref="CommonGramsFilter"/>. </returns>
+	  public override TokenFilter Create(TokenStream input)
+	  {
+		CommonGramsFilter commonGrams = new CommonGramsFilter(luceneMatchVersion, input, commonWords);
+		return commonGrams;
+	  }
+	}
+
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs
new file mode 100644
index 0000000..b787bde
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilter.cs
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+namespace org.apache.lucene.analysis.commongrams
+{
+
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+//	import static org.apache.lucene.analysis.commongrams.CommonGramsFilter.GRAM_TYPE;
+
+	/// <summary>
+	/// Wrap a CommonGramsFilter optimizing phrase queries by only returning single
+	/// words when they are not a member of a bigram.
+	/// 
+	/// Example:
+	/// <ul>
+	/// <li>query input to CommonGramsFilter: "the rain in spain falls mainly"</li>
+	/// <li>output of CommonGramsFilter/input to CommonGramsQueryFilter:
+	/// |"the", "the-rain"|"rain", "rain-in"|"in", "in-spain"|"spain"|"falls"|"mainly"</li>
+	/// <li>output of CommonGramsQueryFilter:"the-rain", "rain-in" ,"in-spain",
+	/// "falls", "mainly"</li>
+	/// </ul>
+	/// </summary>
+
+	/*
+	 * See:http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html and
+	 * http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/analysis/package.html?revision=718798
+	 */
+	public sealed class CommonGramsQueryFilter : TokenFilter
+	{
+
+	  private readonly TypeAttribute typeAttribute;
+	  private readonly PositionIncrementAttribute posIncAttribute;
+
+	  // Most recently read token that has not been emitted yet.
+	  private State previous;
+	  // Type of the token that was emitted last.
+	  private string previousType;
+	  // Set once the wrapped stream has returned false.
+	  private bool exhausted;
+
+	  /// <summary>
+	  /// Constructs a new CommonGramsQueryFilter based on the provided CommonGramsFilter 
+	  /// </summary>
+	  /// <param name="input"> CommonGramsFilter the QueryFilter will use </param>
+	  public CommonGramsQueryFilter(CommonGramsFilter input) : base(input)
+	  {
+		// C# field initializers may not call instance methods, so the attributes are
+		// registered here, right after the base constructor has run.
+		typeAttribute = addAttribute(typeof(TypeAttribute));
+		posIncAttribute = addAttribute(typeof(PositionIncrementAttribute));
+	  }
+
+	  /// <summary>
+	  /// Resets the base filter and clears the held token, its type and the exhausted flag.
+	  /// </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		base.reset();
+		previous = null;
+		previousType = null;
+		exhausted = false;
+	  }
+
+	  /// <summary>
+	  /// Output bigrams whenever possible to optimize queries. Only output unigrams
+	  /// when they are not a member of a bigram. Example:
+	  /// <ul>
+	  /// <li>input: "the rain in spain falls mainly"</li>
+	  /// <li>output:"the-rain", "rain-in" ,"in-spain", "falls", "mainly"</li>
+	  /// </ul>
+	  /// </summary>
+	  /// <returns> <c>true</c> if a token was produced, <c>false</c> when there is nothing left to emit </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		while (!exhausted && input.incrementToken())
+		{
+		  State current = captureState();
+
+		  if (previous != null && !GramType)
+		  {
+			// the token just read is not a gram: emit the held token and hold the new one
+			restoreState(previous);
+			previous = current;
+			previousType = typeAttribute.type();
+
+			if (GramType)
+			{
+			  posIncAttribute.PositionIncrement = 1;
+			}
+			return true;
+		  }
+
+		  // the held token (if any) is replaced by the token just read
+		  previous = current;
+		}
+
+		exhausted = true;
+
+		// GRAM_TYPE was a Java static import; in C# it must be qualified with its declaring class
+		if (previous == null || CommonGramsFilter.GRAM_TYPE.Equals(previousType))
+		{
+		  return false;
+		}
+
+		restoreState(previous);
+		previous = null;
+
+		if (GramType)
+		{
+		  posIncAttribute.PositionIncrement = 1;
+		}
+		return true;
+	  }
+
+	  // ================================================= Helper Methods ================================================
+
+	  /// <summary>
+	  /// Convenience method to check if the current type is a gram type
+	  /// </summary>
+	  /// <returns> <c>true</c> if the current type is a gram type, <c>false</c> otherwise </returns>
+	  public bool GramType
+	  {
+		  get
+		  {
+			return CommonGramsFilter.GRAM_TYPE.Equals(typeAttribute.type());
+		  }
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs
new file mode 100644
index 0000000..ddee353
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CommonGrams/CommonGramsQueryFilterFactory.cs
@@ -0,0 +1,55 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.CommonGrams;
+
+namespace org.apache.lucene.analysis.commongrams
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	/// <summary>
+	/// Construct <seealso cref="CommonGramsQueryFilter"/>.
+	/// 
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_cmmngrmsqry" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.CommonGramsQueryFilterFactory" words="commongramsquerystopwords.txt" ignoreCase="false"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class CommonGramsQueryFilterFactory : CommonGramsFilterFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new CommonGramsQueryFilterFactory </summary>
+	  /// <param name="args"> Factory arguments, passed unchanged to the base constructor. </param>
+	  public CommonGramsQueryFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
+	  /// </summary>
+	  /// <param name="input"> Token stream to filter. </param>
+	  /// <returns> A <seealso cref="CommonGramsQueryFilter"/> wrapping the filter built by the base factory. </returns>
+	  public override TokenFilter Create(TokenStream input)
+	  {
+		// CommonGramsFilterFactory declares this method as Create, so both the override
+		// and the base call must use that spelling.
+		CommonGramsFilter commonGrams = (CommonGramsFilter) base.Create(input);
+		return new CommonGramsQueryFilter(commonGrams);
+	  }
+	}
+
+}
\ No newline at end of file


Mime
View raw message