lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [33/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:37 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs
new file mode 100644
index 0000000..d7c385d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Br/BrazilianStemmer.cs
@@ -0,0 +1,1395 @@
+namespace org.apache.lucene.analysis.br
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	/// <summary>
+	/// A stemmer for Brazilian Portuguese words.
+	/// </summary>
+	public class BrazilianStemmer
+	{
+	  private static readonly Locale locale = new Locale("pt", "BR");
+
+	  /// <summary>
+	  /// Changed term
+	  /// </summary>
+	  private string TERM;
+	  private string CT;
+	  private string R1;
+	  private string R2;
+	  private string RV;
+
+
+	  public BrazilianStemmer()
+	  {
+	  }
+
+	  /// <summary>
+	  /// Stems the given term to an unique <tt>discriminator</tt>.
+	  /// </summary>
+	  /// <param name="term">  The term that should be stemmed. </param>
+	  /// <returns>      Discriminator for <tt>term</tt> </returns>
+	  protected internal virtual string stem(string term)
+	  {
+		bool altered = false; // altered the term
+
+		// creates CT
+		createCT(term);
+
+		if (!isIndexable(CT))
+		{
+		  return null;
+		}
+		if (!isStemmable(CT))
+		{
+		  return CT;
+		}
+
+		R1 = getR1(CT);
+		R2 = getR1(R1);
+		RV = getRV(CT);
+		TERM = term + ";" + CT;
+
+		altered = step1();
+		if (!altered)
+		{
+		  altered = step2();
+		}
+
+		if (altered)
+		{
+		  step3();
+		}
+		else
+		{
+		  step4();
+		}
+
+		step5();
+
+		return CT;
+	  }
+
+	  /// <summary>
+	  /// Checks a term if it can be processed correctly.
+	  /// </summary>
+	  /// <returns>  true if, and only if, the given term consists in letters. </returns>
+	  private bool isStemmable(string term)
+	  {
+		for (int c = 0; c < term.Length; c++)
+		{
+		  // Discard terms that contain non-letter characters.
+		  if (!char.IsLetter(term[c]))
+		  {
+			return false;
+		  }
+		}
+		return true;
+	  }
+
+	  /// <summary>
+	  /// Checks a term if it can be processed indexed.
+	  /// </summary>
+	  /// <returns>  true if it can be indexed </returns>
+	  private bool isIndexable(string term)
+	  {
+		return (term.Length < 30) && (term.Length > 2);
+	  }
+
+	  /// <summary>
+	  /// See if string is 'a','e','i','o','u'
+	  /// </summary>
+	  /// <returns> true if is vowel </returns>
+	  private bool isVowel(char value)
+	  {
+		return (value == 'a') || (value == 'e') || (value == 'i') || (value == 'o') || (value == 'u');
+	  }
+
+	  /// <summary>
+	  /// Gets R1
+	  /// 
+	  /// R1 - is the region after the first non-vowel following a vowel,
+	  ///      or is the null region at the end of the word if there is
+	  ///      no such non-vowel.
+	  /// </summary>
+	  /// <returns> null or a string representing R1 </returns>
+	  private string getR1(string value)
+	  {
+		int i;
+		int j;
+
+		// be-safe !!!
+		if (value == null)
+		{
+		  return null;
+		}
+
+		// find 1st vowel
+		i = value.Length - 1;
+		for (j = 0 ; j < i ; j++)
+		{
+		  if (isVowel(value[j]))
+		  {
+			break;
+		  }
+		}
+
+		if (!(j < i))
+		{
+		  return null;
+		}
+
+		// find 1st non-vowel
+		for (; j < i ; j++)
+		{
+		  if (!(isVowel(value[j])))
+		  {
+			break;
+		  }
+		}
+
+		if (!(j < i))
+		{
+		  return null;
+		}
+
+		return value.Substring(j + 1);
+	  }
+
+	  /// <summary>
+	  /// Gets RV
+	  /// 
+	  /// RV - IF the second letter is a consonant, RV is the region after
+	  ///      the next following vowel,
+	  /// 
+	  ///      OR if the first two letters are vowels, RV is the region
+	  ///      after the next consonant,
+	  /// 
+	  ///      AND otherwise (consonant-vowel case) RV is the region after
+	  ///      the third letter.
+	  /// 
+	  ///      BUT RV is the end of the word if this positions cannot be
+	  ///      found.
+	  /// </summary>
+	  /// <returns> null or a string representing RV </returns>
+	  private string getRV(string value)
+	  {
+		int i;
+		int j;
+
+		// be-safe !!!
+		if (value == null)
+		{
+		  return null;
+		}
+
+		i = value.Length - 1;
+
+		// RV - IF the second letter is a consonant, RV is the region after
+		//      the next following vowel,
+		if ((i > 0) && !isVowel(value[1]))
+		{
+		  // find 1st vowel
+		  for (j = 2 ; j < i ; j++)
+		  {
+			if (isVowel(value[j]))
+			{
+			  break;
+			}
+		  }
+
+		  if (j < i)
+		  {
+			return value.Substring(j + 1);
+		  }
+		}
+
+
+		// RV - OR if the first two letters are vowels, RV is the region
+		//      after the next consonant,
+		if ((i > 1) && isVowel(value[0]) && isVowel(value[1]))
+		{
+		  // find 1st consoant
+		  for (j = 2 ; j < i ; j++)
+		  {
+			if (!isVowel(value[j]))
+			{
+			  break;
+			}
+		  }
+
+		  if (j < i)
+		  {
+			return value.Substring(j + 1);
+		  }
+		}
+
+		// RV - AND otherwise (consonant-vowel case) RV is the region after
+		//      the third letter.
+		if (i > 2)
+		{
+		  return value.Substring(3);
+		}
+
+		return null;
+	  }
+
+	  /// <summary>
+	  /// 1) Turn to lowercase
+	  /// 2) Remove accents
+	  /// 3) ã -> a ; õ -> o
+	  /// 4) ç -> c
+	  /// </summary>
+	  /// <returns> null or a string transformed </returns>
+	  private string changeTerm(string value)
+	  {
+		int j;
+		string r = "";
+
+		// be-safe !!!
+		if (value == null)
+		{
+		  return null;
+		}
+
+		value = value.ToLower(locale);
+		for (j = 0 ; j < value.Length ; j++)
+		{
+		  if ((value[j] == 'á') || (value[j] == 'â') || (value[j] == 'ã'))
+		  {
+			r = r + "a";
+			continue;
+		  }
+		  if ((value[j] == 'é') || (value[j] == 'ê'))
+		  {
+			r = r + "e";
+			continue;
+		  }
+		  if (value[j] == 'í')
+		  {
+			r = r + "i";
+			continue;
+		  }
+		  if ((value[j] == 'ó') || (value[j] == 'ô') || (value[j] == 'õ'))
+		  {
+			r = r + "o";
+			continue;
+		  }
+		  if ((value[j] == 'ú') || (value[j] == 'ü'))
+		  {
+			r = r + "u";
+			continue;
+		  }
+		  if (value[j] == 'ç')
+		  {
+			r = r + "c";
+			continue;
+		  }
+		  if (value[j] == 'ñ')
+		  {
+			r = r + "n";
+			continue;
+		  }
+
+		  r = r + value[j];
+		}
+
+		return r;
+	  }
+
+	  /// <summary>
+	  /// Check if a string ends with a suffix
+	  /// </summary>
+	  /// <returns> true if the string ends with the specified suffix </returns>
+	  private bool suffix(string value, string suffix)
+	  {
+
+		// be-safe !!!
+		if ((value == null) || (suffix_Renamed == null))
+		{
+		  return false;
+		}
+
+		if (suffix_Renamed.Length > value.Length)
+		{
+		  return false;
+		}
+
+		return value.Substring(value.Length - suffix_Renamed.Length).Equals(suffix_Renamed);
+	  }
+
+	  /// <summary>
+	  /// Replace a string suffix by another
+	  /// </summary>
+	  /// <returns> the replaced String </returns>
+	  private string replaceSuffix(string value, string toReplace, string changeTo)
+	  {
+		string vvalue;
+
+		// be-safe !!!
+		if ((value == null) || (toReplace == null) || (changeTo == null))
+		{
+		  return value;
+		}
+
+		vvalue = removeSuffix(value,toReplace);
+
+		if (value.Equals(vvalue))
+		{
+		  return value;
+		}
+		else
+		{
+		  return vvalue + changeTo;
+		}
+	  }
+
+	  /// <summary>
+	  /// Remove a string suffix
+	  /// </summary>
+	  /// <returns> the String without the suffix </returns>
+	  private string removeSuffix(string value, string toRemove)
+	  {
+		// be-safe !!!
+		if ((value == null) || (toRemove == null) || !suffix(value,toRemove))
+		{
+		  return value;
+		}
+
+		return value.Substring(0,value.Length - toRemove.Length);
+	  }
+
+	  /// <summary>
+	  /// See if a suffix is preceded by a String
+	  /// </summary>
+	  /// <returns> true if the suffix is preceded </returns>
+	  private bool suffixPreceded(string value, string suffix, string preceded)
+	  {
+		// be-safe !!!
+		if ((value == null) || (suffix_Renamed == null) || (preceded == null) || !suffix(value,suffix_Renamed))
+		{
+		  return false;
+		}
+
+		return suffix(removeSuffix(value,suffix_Renamed),preceded);
+	  }
+
+	  /// <summary>
+	  /// Creates CT (changed term) , substituting * 'ã' and 'õ' for 'a~' and 'o~'.
+	  /// </summary>
+	  private void createCT(string term)
+	  {
+		CT = changeTerm(term);
+
+		if (CT.Length < 2)
+		{
+			return;
+		}
+
+		// if the first character is ... , remove it
+		if ((CT[0] == '"') || (CT[0] == '\'') || (CT[0] == '-') || (CT[0] == ',') || (CT[0] == ';') || (CT[0] == '.') || (CT[0] == '?') || (CT[0] == '!'))
+		{
+			CT = CT.Substring(1);
+		}
+
+		if (CT.Length < 2)
+		{
+			return;
+		}
+
+		// if the last character is ... , remove it
+		if ((CT[CT.Length - 1] == '-') || (CT[CT.Length - 1] == ',') || (CT[CT.Length - 1] == ';') || (CT[CT.Length - 1] == '.') || (CT[CT.Length - 1] == '?') || (CT[CT.Length - 1] == '!') || (CT[CT.Length - 1] == '\'') || (CT[CT.Length - 1] == '"'))
+		{
+			CT = CT.Substring(0,CT.Length - 1);
+		}
+	  }
+
+
+	  /// <summary>
+	  /// Standard suffix removal.
+	  /// Search for the longest among the following suffixes, and perform
+	  /// the following actions:
+	  /// </summary>
+	  /// <returns> false if no ending was removed </returns>
+	  private bool step1()
+	  {
+		if (CT == null)
+		{
+			return false;
+		}
+
+		// suffix length = 7
+		if (suffix(CT,"uciones") && suffix(R2,"uciones"))
+		{
+			CT = replaceSuffix(CT,"uciones","u");
+			return true;
+		}
+
+		// suffix length = 6
+		if (CT.Length >= 6)
+		{
+		  if (suffix(CT,"imentos") && suffix(R2,"imentos"))
+		  {
+			  CT = removeSuffix(CT,"imentos");
+			  return true;
+		  }
+		  if (suffix(CT,"amentos") && suffix(R2,"amentos"))
+		  {
+			  CT = removeSuffix(CT,"amentos");
+			  return true;
+		  }
+		  if (suffix(CT,"adores") && suffix(R2,"adores"))
+		  {
+			  CT = removeSuffix(CT,"adores");
+			  return true;
+		  }
+		  if (suffix(CT,"adoras") && suffix(R2,"adoras"))
+		  {
+			  CT = removeSuffix(CT,"adoras");
+			  return true;
+		  }
+		  if (suffix(CT,"logias") && suffix(R2,"logias"))
+		  {
+			  replaceSuffix(CT,"logias","log");
+			  return true;
+		  }
+		  if (suffix(CT,"encias") && suffix(R2,"encias"))
+		  {
+			  CT = replaceSuffix(CT,"encias","ente");
+			  return true;
+		  }
+		  if (suffix(CT,"amente") && suffix(R1,"amente"))
+		  {
+			  CT = removeSuffix(CT,"amente");
+			  return true;
+		  }
+		  if (suffix(CT,"idades") && suffix(R2,"idades"))
+		  {
+			  CT = removeSuffix(CT,"idades");
+			  return true;
+		  }
+		}
+
+		// suffix length = 5
+		if (CT.Length >= 5)
+		{
+		  if (suffix(CT,"acoes") && suffix(R2,"acoes"))
+		  {
+			  CT = removeSuffix(CT,"acoes");
+			  return true;
+		  }
+		  if (suffix(CT,"imento") && suffix(R2,"imento"))
+		  {
+			  CT = removeSuffix(CT,"imento");
+			  return true;
+		  }
+		  if (suffix(CT,"amento") && suffix(R2,"amento"))
+		  {
+			  CT = removeSuffix(CT,"amento");
+			  return true;
+		  }
+		  if (suffix(CT,"adora") && suffix(R2,"adora"))
+		  {
+			  CT = removeSuffix(CT,"adora");
+			  return true;
+		  }
+		  if (suffix(CT,"ismos") && suffix(R2,"ismos"))
+		  {
+			  CT = removeSuffix(CT,"ismos");
+			  return true;
+		  }
+		  if (suffix(CT,"istas") && suffix(R2,"istas"))
+		  {
+			  CT = removeSuffix(CT,"istas");
+			  return true;
+		  }
+		  if (suffix(CT,"logia") && suffix(R2,"logia"))
+		  {
+			  CT = replaceSuffix(CT,"logia","log");
+			  return true;
+		  }
+		  if (suffix(CT,"ucion") && suffix(R2,"ucion"))
+		  {
+			  CT = replaceSuffix(CT,"ucion","u");
+			  return true;
+		  }
+		  if (suffix(CT,"encia") && suffix(R2,"encia"))
+		  {
+			  CT = replaceSuffix(CT,"encia","ente");
+			  return true;
+		  }
+		  if (suffix(CT,"mente") && suffix(R2,"mente"))
+		  {
+			  CT = removeSuffix(CT,"mente");
+			  return true;
+		  }
+		  if (suffix(CT,"idade") && suffix(R2,"idade"))
+		  {
+			  CT = removeSuffix(CT,"idade");
+			  return true;
+		  }
+		}
+
+		// suffix length = 4
+		if (CT.Length >= 4)
+		{
+		  if (suffix(CT,"acao") && suffix(R2,"acao"))
+		  {
+			  CT = removeSuffix(CT,"acao");
+			  return true;
+		  }
+		  if (suffix(CT,"ezas") && suffix(R2,"ezas"))
+		  {
+			  CT = removeSuffix(CT,"ezas");
+			  return true;
+		  }
+		  if (suffix(CT,"icos") && suffix(R2,"icos"))
+		  {
+			  CT = removeSuffix(CT,"icos");
+			  return true;
+		  }
+		  if (suffix(CT,"icas") && suffix(R2,"icas"))
+		  {
+			  CT = removeSuffix(CT,"icas");
+			  return true;
+		  }
+		  if (suffix(CT,"ismo") && suffix(R2,"ismo"))
+		  {
+			  CT = removeSuffix(CT,"ismo");
+			  return true;
+		  }
+		  if (suffix(CT,"avel") && suffix(R2,"avel"))
+		  {
+			  CT = removeSuffix(CT,"avel");
+			  return true;
+		  }
+		  if (suffix(CT,"ivel") && suffix(R2,"ivel"))
+		  {
+			  CT = removeSuffix(CT,"ivel");
+			  return true;
+		  }
+		  if (suffix(CT,"ista") && suffix(R2,"ista"))
+		  {
+			  CT = removeSuffix(CT,"ista");
+			  return true;
+		  }
+		  if (suffix(CT,"osos") && suffix(R2,"osos"))
+		  {
+			  CT = removeSuffix(CT,"osos");
+			  return true;
+		  }
+		  if (suffix(CT,"osas") && suffix(R2,"osas"))
+		  {
+			  CT = removeSuffix(CT,"osas");
+			  return true;
+		  }
+		  if (suffix(CT,"ador") && suffix(R2,"ador"))
+		  {
+			  CT = removeSuffix(CT,"ador");
+			  return true;
+		  }
+		  if (suffix(CT,"ivas") && suffix(R2,"ivas"))
+		  {
+			  CT = removeSuffix(CT,"ivas");
+			  return true;
+		  }
+		  if (suffix(CT,"ivos") && suffix(R2,"ivos"))
+		  {
+			  CT = removeSuffix(CT,"ivos");
+			  return true;
+		  }
+		  if (suffix(CT,"iras") && suffix(RV,"iras") && suffixPreceded(CT,"iras","e"))
+		  {
+			  CT = replaceSuffix(CT,"iras","ir");
+			  return true;
+		  }
+		}
+
+		// suffix length = 3
+		if (CT.Length >= 3)
+		{
+		  if (suffix(CT,"eza") && suffix(R2,"eza"))
+		  {
+			  CT = removeSuffix(CT,"eza");
+			  return true;
+		  }
+		  if (suffix(CT,"ico") && suffix(R2,"ico"))
+		  {
+			  CT = removeSuffix(CT,"ico");
+			  return true;
+		  }
+		  if (suffix(CT,"ica") && suffix(R2,"ica"))
+		  {
+			  CT = removeSuffix(CT,"ica");
+			  return true;
+		  }
+		  if (suffix(CT,"oso") && suffix(R2,"oso"))
+		  {
+			  CT = removeSuffix(CT,"oso");
+			  return true;
+		  }
+		  if (suffix(CT,"osa") && suffix(R2,"osa"))
+		  {
+			  CT = removeSuffix(CT,"osa");
+			  return true;
+		  }
+		  if (suffix(CT,"iva") && suffix(R2,"iva"))
+		  {
+			  CT = removeSuffix(CT,"iva");
+			  return true;
+		  }
+		  if (suffix(CT,"ivo") && suffix(R2,"ivo"))
+		  {
+			  CT = removeSuffix(CT,"ivo");
+			  return true;
+		  }
+		  if (suffix(CT,"ira") && suffix(RV,"ira") && suffixPreceded(CT,"ira","e"))
+		  {
+			  CT = replaceSuffix(CT,"ira","ir");
+			  return true;
+		  }
+		}
+
+		// no ending was removed by step1
+		return false;
+	  }
+
+
+	  /// <summary>
+	  /// Verb suffixes.
+	  /// 
+	  /// Search for the longest among the following suffixes in RV,
+	  /// and if found, delete.
+	  /// </summary>
+	  /// <returns> false if no ending was removed </returns>
+	  private bool step2()
+	  {
+		if (RV == null)
+		{
+			return false;
+		}
+
+		// suffix lenght = 7
+		if (RV.Length >= 7)
+		{
+		  if (suffix(RV,"issemos"))
+		  {
+			CT = removeSuffix(CT,"issemos");
+			return true;
+		  }
+		  if (suffix(RV,"essemos"))
+		  {
+			CT = removeSuffix(CT,"essemos");
+			return true;
+		  }
+		  if (suffix(RV,"assemos"))
+		  {
+			CT = removeSuffix(CT,"assemos");
+			return true;
+		  }
+		  if (suffix(RV,"ariamos"))
+		  {
+			CT = removeSuffix(CT,"ariamos");
+			return true;
+		  }
+		  if (suffix(RV,"eriamos"))
+		  {
+			CT = removeSuffix(CT,"eriamos");
+			return true;
+		  }
+		  if (suffix(RV,"iriamos"))
+		  {
+			CT = removeSuffix(CT,"iriamos");
+			return true;
+		  }
+		}
+
+		// suffix length = 6
+		if (RV.Length >= 6)
+		{
+		  if (suffix(RV,"iremos"))
+		  {
+			CT = removeSuffix(CT,"iremos");
+			return true;
+		  }
+		  if (suffix(RV,"eremos"))
+		  {
+			CT = removeSuffix(CT,"eremos");
+			return true;
+		  }
+		  if (suffix(RV,"aremos"))
+		  {
+			CT = removeSuffix(CT,"aremos");
+			return true;
+		  }
+		  if (suffix(RV,"avamos"))
+		  {
+			CT = removeSuffix(CT,"avamos");
+			return true;
+		  }
+		  if (suffix(RV,"iramos"))
+		  {
+			CT = removeSuffix(CT,"iramos");
+			return true;
+		  }
+		  if (suffix(RV,"eramos"))
+		  {
+			CT = removeSuffix(CT,"eramos");
+			return true;
+		  }
+		  if (suffix(RV,"aramos"))
+		  {
+			CT = removeSuffix(CT,"aramos");
+			return true;
+		  }
+		  if (suffix(RV,"asseis"))
+		  {
+			CT = removeSuffix(CT,"asseis");
+			return true;
+		  }
+		  if (suffix(RV,"esseis"))
+		  {
+			CT = removeSuffix(CT,"esseis");
+			return true;
+		  }
+		  if (suffix(RV,"isseis"))
+		  {
+			CT = removeSuffix(CT,"isseis");
+			return true;
+		  }
+		  if (suffix(RV,"arieis"))
+		  {
+			CT = removeSuffix(CT,"arieis");
+			return true;
+		  }
+		  if (suffix(RV,"erieis"))
+		  {
+			CT = removeSuffix(CT,"erieis");
+			return true;
+		  }
+		  if (suffix(RV,"irieis"))
+		  {
+			CT = removeSuffix(CT,"irieis");
+			return true;
+		  }
+		}
+
+
+		// suffix length = 5
+		if (RV.Length >= 5)
+		{
+		  if (suffix(RV,"irmos"))
+		  {
+			CT = removeSuffix(CT,"irmos");
+			return true;
+		  }
+		  if (suffix(RV,"iamos"))
+		  {
+			CT = removeSuffix(CT,"iamos");
+			return true;
+		  }
+		  if (suffix(RV,"armos"))
+		  {
+			CT = removeSuffix(CT,"armos");
+			return true;
+		  }
+		  if (suffix(RV,"ermos"))
+		  {
+			CT = removeSuffix(CT,"ermos");
+			return true;
+		  }
+		  if (suffix(RV,"areis"))
+		  {
+			CT = removeSuffix(CT,"areis");
+			return true;
+		  }
+		  if (suffix(RV,"ereis"))
+		  {
+			CT = removeSuffix(CT,"ereis");
+			return true;
+		  }
+		  if (suffix(RV,"ireis"))
+		  {
+			CT = removeSuffix(CT,"ireis");
+			return true;
+		  }
+		  if (suffix(RV,"asses"))
+		  {
+			CT = removeSuffix(CT,"asses");
+			return true;
+		  }
+		  if (suffix(RV,"esses"))
+		  {
+			CT = removeSuffix(CT,"esses");
+			return true;
+		  }
+		  if (suffix(RV,"isses"))
+		  {
+			CT = removeSuffix(CT,"isses");
+			return true;
+		  }
+		  if (suffix(RV,"astes"))
+		  {
+			CT = removeSuffix(CT,"astes");
+			return true;
+		  }
+		  if (suffix(RV,"assem"))
+		  {
+			CT = removeSuffix(CT,"assem");
+			return true;
+		  }
+		  if (suffix(RV,"essem"))
+		  {
+			CT = removeSuffix(CT,"essem");
+			return true;
+		  }
+		  if (suffix(RV,"issem"))
+		  {
+			CT = removeSuffix(CT,"issem");
+			return true;
+		  }
+		  if (suffix(RV,"ardes"))
+		  {
+			CT = removeSuffix(CT,"ardes");
+			return true;
+		  }
+		  if (suffix(RV,"erdes"))
+		  {
+			CT = removeSuffix(CT,"erdes");
+			return true;
+		  }
+		  if (suffix(RV,"irdes"))
+		  {
+			CT = removeSuffix(CT,"irdes");
+			return true;
+		  }
+		  if (suffix(RV,"ariam"))
+		  {
+			CT = removeSuffix(CT,"ariam");
+			return true;
+		  }
+		  if (suffix(RV,"eriam"))
+		  {
+			CT = removeSuffix(CT,"eriam");
+			return true;
+		  }
+		  if (suffix(RV,"iriam"))
+		  {
+			CT = removeSuffix(CT,"iriam");
+			return true;
+		  }
+		  if (suffix(RV,"arias"))
+		  {
+			CT = removeSuffix(CT,"arias");
+			return true;
+		  }
+		  if (suffix(RV,"erias"))
+		  {
+			CT = removeSuffix(CT,"erias");
+			return true;
+		  }
+		  if (suffix(RV,"irias"))
+		  {
+			CT = removeSuffix(CT,"irias");
+			return true;
+		  }
+		  if (suffix(RV,"estes"))
+		  {
+			CT = removeSuffix(CT,"estes");
+			return true;
+		  }
+		  if (suffix(RV,"istes"))
+		  {
+			CT = removeSuffix(CT,"istes");
+			return true;
+		  }
+		  if (suffix(RV,"areis"))
+		  {
+			CT = removeSuffix(CT,"areis");
+			return true;
+		  }
+		  if (suffix(RV,"aveis"))
+		  {
+			CT = removeSuffix(CT,"aveis");
+			return true;
+		  }
+		}
+
+		// suffix length = 4
+		if (RV.Length >= 4)
+		{
+		  if (suffix(RV,"aria"))
+		  {
+			CT = removeSuffix(CT,"aria");
+			return true;
+		  }
+		  if (suffix(RV,"eria"))
+		  {
+			CT = removeSuffix(CT,"eria");
+			return true;
+		  }
+		  if (suffix(RV,"iria"))
+		  {
+			CT = removeSuffix(CT,"iria");
+			return true;
+		  }
+		  if (suffix(RV,"asse"))
+		  {
+			CT = removeSuffix(CT,"asse");
+			return true;
+		  }
+		  if (suffix(RV,"esse"))
+		  {
+			CT = removeSuffix(CT,"esse");
+			return true;
+		  }
+		  if (suffix(RV,"isse"))
+		  {
+			CT = removeSuffix(CT,"isse");
+			return true;
+		  }
+		  if (suffix(RV,"aste"))
+		  {
+			CT = removeSuffix(CT,"aste");
+			return true;
+		  }
+		  if (suffix(RV,"este"))
+		  {
+			CT = removeSuffix(CT,"este");
+			return true;
+		  }
+		  if (suffix(RV,"iste"))
+		  {
+			CT = removeSuffix(CT,"iste");
+			return true;
+		  }
+		  if (suffix(RV,"arei"))
+		  {
+			CT = removeSuffix(CT,"arei");
+			return true;
+		  }
+		  if (suffix(RV,"erei"))
+		  {
+			CT = removeSuffix(CT,"erei");
+			return true;
+		  }
+		  if (suffix(RV,"irei"))
+		  {
+			CT = removeSuffix(CT,"irei");
+			return true;
+		  }
+		  if (suffix(RV,"aram"))
+		  {
+			CT = removeSuffix(CT,"aram");
+			return true;
+		  }
+		  if (suffix(RV,"eram"))
+		  {
+			CT = removeSuffix(CT,"eram");
+			return true;
+		  }
+		  if (suffix(RV,"iram"))
+		  {
+			CT = removeSuffix(CT,"iram");
+			return true;
+		  }
+		  if (suffix(RV,"avam"))
+		  {
+			CT = removeSuffix(CT,"avam");
+			return true;
+		  }
+		  if (suffix(RV,"arem"))
+		  {
+			CT = removeSuffix(CT,"arem");
+			return true;
+		  }
+		  if (suffix(RV,"erem"))
+		  {
+			CT = removeSuffix(CT,"erem");
+			return true;
+		  }
+		  if (suffix(RV,"irem"))
+		  {
+			CT = removeSuffix(CT,"irem");
+			return true;
+		  }
+		  if (suffix(RV,"ando"))
+		  {
+			CT = removeSuffix(CT,"ando");
+			return true;
+		  }
+		  if (suffix(RV,"endo"))
+		  {
+			CT = removeSuffix(CT,"endo");
+			return true;
+		  }
+		  if (suffix(RV,"indo"))
+		  {
+			CT = removeSuffix(CT,"indo");
+			return true;
+		  }
+		  if (suffix(RV,"arao"))
+		  {
+			CT = removeSuffix(CT,"arao");
+			return true;
+		  }
+		  if (suffix(RV,"erao"))
+		  {
+			CT = removeSuffix(CT,"erao");
+			return true;
+		  }
+		  if (suffix(RV,"irao"))
+		  {
+			CT = removeSuffix(CT,"irao");
+			return true;
+		  }
+		  if (suffix(RV,"adas"))
+		  {
+			CT = removeSuffix(CT,"adas");
+			return true;
+		  }
+		  if (suffix(RV,"idas"))
+		  {
+			CT = removeSuffix(CT,"idas");
+			return true;
+		  }
+		  if (suffix(RV,"aras"))
+		  {
+			CT = removeSuffix(CT,"aras");
+			return true;
+		  }
+		  if (suffix(RV,"eras"))
+		  {
+			CT = removeSuffix(CT,"eras");
+			return true;
+		  }
+		  if (suffix(RV,"iras"))
+		  {
+			CT = removeSuffix(CT,"iras");
+			return true;
+		  }
+		  if (suffix(RV,"avas"))
+		  {
+			CT = removeSuffix(CT,"avas");
+			return true;
+		  }
+		  if (suffix(RV,"ares"))
+		  {
+			CT = removeSuffix(CT,"ares");
+			return true;
+		  }
+		  if (suffix(RV,"eres"))
+		  {
+			CT = removeSuffix(CT,"eres");
+			return true;
+		  }
+		  if (suffix(RV,"ires"))
+		  {
+			CT = removeSuffix(CT,"ires");
+			return true;
+		  }
+		  if (suffix(RV,"ados"))
+		  {
+			CT = removeSuffix(CT,"ados");
+			return true;
+		  }
+		  if (suffix(RV,"idos"))
+		  {
+			CT = removeSuffix(CT,"idos");
+			return true;
+		  }
+		  if (suffix(RV,"amos"))
+		  {
+			CT = removeSuffix(CT,"amos");
+			return true;
+		  }
+		  if (suffix(RV,"emos"))
+		  {
+			CT = removeSuffix(CT,"emos");
+			return true;
+		  }
+		  if (suffix(RV,"imos"))
+		  {
+			CT = removeSuffix(CT,"imos");
+			return true;
+		  }
+		  if (suffix(RV,"iras"))
+		  {
+			CT = removeSuffix(CT,"iras");
+			return true;
+		  }
+		  if (suffix(RV,"ieis"))
+		  {
+			CT = removeSuffix(CT,"ieis");
+			return true;
+		  }
+		}
+
+		// suffix length = 3
+		if (RV.Length >= 3)
+		{
+		  if (suffix(RV,"ada"))
+		  {
+			CT = removeSuffix(CT,"ada");
+			return true;
+		  }
+		  if (suffix(RV,"ida"))
+		  {
+			CT = removeSuffix(CT,"ida");
+			return true;
+		  }
+		  if (suffix(RV,"ara"))
+		  {
+			CT = removeSuffix(CT,"ara");
+			return true;
+		  }
+		  if (suffix(RV,"era"))
+		  {
+			CT = removeSuffix(CT,"era");
+			return true;
+		  }
+		  if (suffix(RV,"ira"))
+		  {
+			CT = removeSuffix(CT,"ava");
+			return true;
+		  }
+		  if (suffix(RV,"iam"))
+		  {
+			CT = removeSuffix(CT,"iam");
+			return true;
+		  }
+		  if (suffix(RV,"ado"))
+		  {
+			CT = removeSuffix(CT,"ado");
+			return true;
+		  }
+		  if (suffix(RV,"ido"))
+		  {
+			CT = removeSuffix(CT,"ido");
+			return true;
+		  }
+		  if (suffix(RV,"ias"))
+		  {
+			CT = removeSuffix(CT,"ias");
+			return true;
+		  }
+		  if (suffix(RV,"ais"))
+		  {
+			CT = removeSuffix(CT,"ais");
+			return true;
+		  }
+		  if (suffix(RV,"eis"))
+		  {
+			CT = removeSuffix(CT,"eis");
+			return true;
+		  }
+		  if (suffix(RV,"ira"))
+		  {
+			CT = removeSuffix(CT,"ira");
+			return true;
+		  }
+		  if (suffix(RV,"ear"))
+		  {
+			CT = removeSuffix(CT,"ear");
+			return true;
+		  }
+		}
+
+		// suffix length = 2
+		if (RV.Length >= 2)
+		{
+		  if (suffix(RV,"ia"))
+		  {
+			CT = removeSuffix(CT,"ia");
+			return true;
+		  }
+		  if (suffix(RV,"ei"))
+		  {
+			CT = removeSuffix(CT,"ei");
+			return true;
+		  }
+		  if (suffix(RV,"am"))
+		  {
+			CT = removeSuffix(CT,"am");
+			return true;
+		  }
+		  if (suffix(RV,"em"))
+		  {
+			CT = removeSuffix(CT,"em");
+			return true;
+		  }
+		  if (suffix(RV,"ar"))
+		  {
+			CT = removeSuffix(CT,"ar");
+			return true;
+		  }
+		  if (suffix(RV,"er"))
+		  {
+			CT = removeSuffix(CT,"er");
+			return true;
+		  }
+		  if (suffix(RV,"ir"))
+		  {
+			CT = removeSuffix(CT,"ir");
+			return true;
+		  }
+		  if (suffix(RV,"as"))
+		  {
+			CT = removeSuffix(CT,"as");
+			return true;
+		  }
+		  if (suffix(RV,"es"))
+		  {
+			CT = removeSuffix(CT,"es");
+			return true;
+		  }
+		  if (suffix(RV,"is"))
+		  {
+			CT = removeSuffix(CT,"is");
+			return true;
+		  }
+		  if (suffix(RV,"eu"))
+		  {
+			CT = removeSuffix(CT,"eu");
+			return true;
+		  }
+		  if (suffix(RV,"iu"))
+		  {
+			CT = removeSuffix(CT,"iu");
+			return true;
+		  }
+		  if (suffix(RV,"iu"))
+		  {
+			CT = removeSuffix(CT,"iu");
+			return true;
+		  }
+		  if (suffix(RV,"ou"))
+		  {
+			CT = removeSuffix(CT,"ou");
+			return true;
+		  }
+		}
+
+		// no ending was removed by step2
+		return false;
+	  }
+
+	  /// <summary>
+	  /// Delete suffix 'i' if in RV and preceded by 'c'
+	  /// 
+	  /// </summary>
+	  private void step3()
+	  {
+		if (RV == null)
+		{
+			return;
+		}
+
+		if (suffix(RV,"i") && suffixPreceded(RV,"i","c"))
+		{
+		  CT = removeSuffix(CT,"i");
+		}
+
+	  }
+
+	  /// <summary>
+	  /// Residual suffix
+	  /// 
+	  /// If the word ends with one of the suffixes (os a i o á í ó)
+	  /// in RV, delete it
+	  /// 
+	  /// </summary>
+	  private void step4()
+	  {
+		if (RV == null)
+		{
+			return;
+		}
+
+		if (suffix(RV,"os"))
+		{
+		  CT = removeSuffix(CT,"os");
+		  return;
+		}
+		if (suffix(RV,"a"))
+		{
+		  CT = removeSuffix(CT,"a");
+		  return;
+		}
+		if (suffix(RV,"i"))
+		{
+		  CT = removeSuffix(CT,"i");
+		  return;
+		}
+		if (suffix(RV,"o"))
+		{
+		  CT = removeSuffix(CT,"o");
+		  return;
+		}
+
+	  }
+
+	  /// <summary>
+	  /// If the word ends with one of ( e é ê) in RV,delete it,
+	  /// and if preceded by 'gu' (or 'ci') with the 'u' (or 'i') in RV,
+	  /// delete the 'u' (or 'i')
+	  /// 
+	  /// Or if the word ends ç remove the cedilha
+	  /// 
+	  /// </summary>
+	  private void step5()
+	  {
+		if (RV == null)
+		{
+			return;
+		}
+
+		if (suffix(RV,"e"))
+		{
+		  if (suffixPreceded(RV,"e","gu"))
+		  {
+			CT = removeSuffix(CT,"e");
+			CT = removeSuffix(CT,"u");
+			return;
+		  }
+
+		  if (suffixPreceded(RV,"e","ci"))
+		  {
+			CT = removeSuffix(CT,"e");
+			CT = removeSuffix(CT,"i");
+			return;
+		  }
+
+		  CT = removeSuffix(CT,"e");
+		  return;
+		}
+	  }
+
+	  /// <summary>
+	  /// For log and debug purpose
+	  /// </summary>
+	  /// <returns>  TERM, CT, RV, R1 and R2 </returns>
+	  public virtual string log()
+	  {
+		return " (TERM = " + TERM + ")" + " (CT = " + CT + ")" + " (RV = " + RV + ")" + " (R1 = " + R1 + ")" + " (R2 = " + R2 + ")";
+	  }
+
+	}
+
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs
new file mode 100644
index 0000000..939d358
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ca/CatalanAnalyzer.cs
@@ -0,0 +1,154 @@
+using System;
+
+namespace org.apache.lucene.analysis.ca
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+	using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+	using SetKeywordMarkerFilter = org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+	using SnowballFilter = org.apache.lucene.analysis.snowball.SnowballFilter;
+	using StandardFilter = org.apache.lucene.analysis.standard.StandardFilter;
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using ElisionFilter = org.apache.lucene.analysis.util.ElisionFilter;
+	using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+	using Version = org.apache.lucene.util.Version;
+	using CatalanStemmer = org.tartarus.snowball.ext.CatalanStemmer;
+
+	/// <summary>
+	/// <seealso cref="Analyzer"/> for Catalan.
+	/// <para>
+	/// <a name="version"/>
+	/// </para>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating CatalanAnalyzer:
+	/// <ul>
+	///   <li> As of 3.6, ElisionFilter with a set of Catalan 
+	///        contractions is used by default.
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public sealed class CatalanAnalyzer : StopwordAnalyzerBase
+	{
+	  private readonly CharArraySet stemExclusionSet;
+
+	  /// <summary>
+	  /// File containing default Catalan stopwords. </summary>
+	  public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+	  // LUCENENET fix: Arrays.asList is a Java API; use a plain string array,
+	  // which satisfies the collection parameter of the CharArraySet ctor.
+	  private static readonly CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, new string[] { "d", "l", "m", "n", "s", "t" }, true));
+
+	  /// <summary>
+	  /// Returns an unmodifiable instance of the default stop words set. </summary>
+	  /// <returns> default stop words set. </returns>
+	  public static CharArraySet DefaultStopSet
+	  {
+		  get
+		  {
+			return DefaultSetHolder.DEFAULT_STOP_SET;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class 
+	  /// accesses the static final set the first time.;
+	  /// </summary>
+	  private class DefaultSetHolder
+	  {
+		internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+		static DefaultSetHolder()
+		{
+		  try
+		  {
+			DEFAULT_STOP_SET = loadStopwordSet(false, typeof(CatalanAnalyzer), DEFAULT_STOPWORD_FILE, "#");
+		  }
+		  catch (IOException e)
+		  {
+			// default set should always be present as it is part of the
+			// distribution (JAR).
+			// LUCENENET fix: preserve the original exception as the inner
+			// exception instead of discarding the cause.
+			throw new Exception("Unable to load default stopword set", e);
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the default stop words: <seealso cref="#DEFAULT_STOPWORD_FILE"/>.
+	  /// </summary>
+	  public CatalanAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  /// <param name="stopwords"> a stopword set </param>
+	  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords) : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words. If a non-empty stem exclusion set is
+	  /// provided this analyzer will add a <seealso cref="SetKeywordMarkerFilter"/> before
+	  /// stemming.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  /// <param name="stopwords"> a stopword set </param>
+	  /// <param name="stemExclusionSet"> a set of terms not to be stemmed </param>
+	  public CatalanAnalyzer(Version matchVersion, CharArraySet stopwords, CharArraySet stemExclusionSet) : base(matchVersion, stopwords)
+	  {
+		// Defensive copy so later mutation of the caller's set cannot affect us.
+		this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+	  }
+
+	  /// <summary>
+	  /// Creates a
+	  /// <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  /// which tokenizes all the text in the provided <seealso cref="Reader"/>.
+	  /// </summary>
+	  /// <returns> A
+	  ///         <seealso cref="org.apache.lucene.analysis.Analyzer.TokenStreamComponents"/>
+	  ///         built from an <seealso cref="StandardTokenizer"/> filtered with
+	  ///         <seealso cref="StandardFilter"/>, <seealso cref="ElisionFilter"/>, <seealso cref="LowerCaseFilter"/>, 
+	  ///         <seealso cref="StopFilter"/>, <seealso cref="SetKeywordMarkerFilter"/> if a stem exclusion set is
+	  ///         provided and <seealso cref="SnowballFilter"/>. </returns>
+	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+	  {
+		Tokenizer source = new StandardTokenizer(matchVersion, reader);
+		TokenStream result = new StandardFilter(matchVersion, source);
+		// Catalan contractions (l', d', ...) are only elided as of 3.6.
+		if (matchVersion.onOrAfter(Version.LUCENE_36))
+		{
+		  result = new ElisionFilter(result, DEFAULT_ARTICLES);
+		}
+		result = new LowerCaseFilter(matchVersion, result);
+		result = new StopFilter(matchVersion, result, stopwords);
+		if (!stemExclusionSet.Empty)
+		{
+		  result = new SetKeywordMarkerFilter(result, stemExclusionSet);
+		}
+		result = new SnowballFilter(result, new CatalanStemmer());
+		return new TokenStreamComponents(source, result);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs
new file mode 100644
index 0000000..1127842
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/BaseCharFilter.cs
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Analysis.CharFilter
+{
+    /// <summary>
+	/// Base utility class for implementing a <seealso cref="CharFilter"/>.
+	/// You subclass this, and then record mappings by calling
+	/// <seealso cref="#addOffCorrectMap"/>, and then invoke the correct
+	/// method to correct an offset.
+	/// </summary>
+	public abstract class BaseCharFilter : CharFilter
+	{
+
+	  // Parallel arrays: offsets[i] is an output-stream offset, diffs[i] the
+	  // cumulative correction to apply at or after that offset. Sorted by offset.
+	  private int[] offsets;
+	  private int[] diffs;
+	  private int size = 0;
+
+	  public BaseCharFilter(Reader @in) : base(@in)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Retrieve the corrected offset. </summary>
+	  protected internal override int correct(int currentOff)
+	  {
+		// No mappings recorded, or the offset precedes the first mapping.
+		if (offsets == null || currentOff < offsets[0])
+		{
+		  return currentOff;
+		}
+
+		int hi = size - 1;
+		if (currentOff >= offsets[hi])
+		{
+		  return currentOff + diffs[hi];
+		}
+
+		// Binary search for the mapping interval containing currentOff.
+		int lo = 0;
+		int mid = -1;
+
+		while (hi >= lo)
+		{
+		  mid = (int)((uint)(lo + hi) >> 1);
+		  if (currentOff < offsets[mid])
+		  {
+			hi = mid - 1;
+		  }
+		  else if (currentOff > offsets[mid])
+		  {
+			lo = mid + 1;
+		  }
+		  else
+		  {
+			return currentOff + diffs[mid];
+		  }
+		}
+
+		// No exact hit: apply the correction of the nearest preceding mapping.
+		if (currentOff < offsets[mid])
+		{
+		  return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
+		}
+		else
+		{
+		  return currentOff + diffs[mid];
+		}
+	  }
+
+	  protected internal virtual int LastCumulativeDiff
+	  {
+		  get
+		  {
+			return offsets == null ? 0 : diffs[size-1];
+		  }
+	  }
+
+	  /// <summary>
+	  /// <para>
+	  ///   Adds an offset correction mapping at the given output stream offset.
+	  /// </para>
+	  /// <para>
+	  ///   Assumption: the offset given with each successive call to this method
+	  ///   will not be smaller than the offset given at the previous invocation.
+	  /// </para>
+	  /// </summary>
+	  /// <param name="off"> The output stream offset at which to apply the correction </param>
+	  /// <param name="cumulativeDiff"> The input offset is given by adding this
+	  ///                       to the output offset </param>
+	  protected internal virtual void addOffCorrectMap(int off, int cumulativeDiff)
+	  {
+		if (offsets == null)
+		{
+		  offsets = new int[64];
+		  diffs = new int[64];
+		}
+		else if (size == offsets.Length)
+		{
+		  offsets = ArrayUtil.grow(offsets);
+		  diffs = ArrayUtil.grow(diffs);
+		}
+
+		// LUCENENET fix: the Java "assert expr : message;" syntax is not valid
+		// C#; use Debug.Assert with an explicit message instead.
+		System.Diagnostics.Debug.Assert(size == 0 || off >= offsets[size - 1], "Offset #" + size + "(" + off + ") is less than the last recorded offset " + offsets[size - 1] + "\n" + string.Join(", ", offsets) + "\n" + string.Join(", ", diffs));
+
+		if (size == 0 || off != offsets[size - 1])
+		{
+		  offsets[size] = off;
+		  diffs[size++] = cumulativeDiff;
+		} // Overwrite the diff at the last recorded offset
+		else
+		{
+		  diffs[size - 1] = cumulativeDiff;
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs
new file mode 100644
index 0000000..2d527fc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilterFactory.cs
@@ -0,0 +1,67 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharFilterFactory = org.apache.lucene.analysis.util.CharFilterFactory;
+
+
+	/// <summary>
+	/// Factory for <seealso cref="HTMLStripCharFilter"/>. 
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" /&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class HTMLStripCharFilterFactory : CharFilterFactory
+	{
+	  internal readonly HashSet<string> escapedTags;
+	  // LUCENENET fix: java.util.regex.Pattern does not exist in .NET; use
+	  // System.Text.RegularExpressions.Regex with the same pattern.
+	  internal static readonly System.Text.RegularExpressions.Regex TAG_NAME_PATTERN = new System.Text.RegularExpressions.Regex("[^\\s,]+", System.Text.RegularExpressions.RegexOptions.Compiled);
+
+	  /// <summary>
+	  /// Creates a new HTMLStripCharFilterFactory </summary>
+	  public HTMLStripCharFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		escapedTags = getSet(args, "escapedTags");
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  public override HTMLStripCharFilter create(Reader input)
+	  {
+		HTMLStripCharFilter charFilter;
+		if (null == escapedTags)
+		{
+		  // No escaped-tag set configured: strip every tag.
+		  charFilter = new HTMLStripCharFilter(input);
+		}
+		else
+		{
+		  charFilter = new HTMLStripCharFilter(input, escapedTags);
+		}
+		return charFilter;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs
new file mode 100644
index 0000000..5a148be
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilter.cs
@@ -0,0 +1,240 @@
+using System;
+using System.Diagnostics;
+using System.Collections.Generic;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+using Lucene.Net.Analysis.CharFilter;
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+
+	using RollingCharBuffer = org.apache.lucene.analysis.util.RollingCharBuffer;
+	using CharsRef = org.apache.lucene.util.CharsRef;
+	using CharSequenceOutputs = org.apache.lucene.util.fst.CharSequenceOutputs;
+	using FST = org.apache.lucene.util.fst.FST;
+	using Outputs = org.apache.lucene.util.fst.Outputs;
+
+	/// <summary>
+	/// Simplistic <seealso cref="CharFilter"/> that applies the mappings
+	/// contained in a <seealso cref="NormalizeCharMap"/> to the character
+	/// stream, and correcting the resulting changes to the
+	/// offsets.  Matching is greedy (longest pattern matching at
+	/// a given point wins).  Replacement is allowed to be the
+	/// empty string.
+	/// </summary>
+
+	public class MappingCharFilter : BaseCharFilter
+	{
+
+	  private readonly Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
+	  private readonly FST<CharsRef> map;
+	  private readonly FST.BytesReader fstReader;
+	  private readonly RollingCharBuffer buffer = new RollingCharBuffer();
+	  private readonly FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
+	  private readonly IDictionary<char?, FST.Arc<CharsRef>> cachedRootArcs;
+
+	  // Replacement text currently being emitted, and the next char to emit from it.
+	  private CharsRef replacement;
+	  private int replacementPointer;
+	  // Absolute offset into the (unmapped) input stream.
+	  private int inputOff;
+
+	  /// <summary>
+	  /// Default constructor that takes a <seealso cref="Reader"/>. </summary>
+	  public MappingCharFilter(NormalizeCharMap normMap, Reader @in) : base(@in)
+	  {
+		buffer.reset(@in);
+
+		map = normMap.map;
+		cachedRootArcs = normMap.cachedRootArcs;
+
+		if (map != null)
+		{
+		  fstReader = map.BytesReader;
+		}
+		else
+		{
+		  // An empty map has no FST; read() then falls through to pass-through.
+		  fstReader = null;
+		}
+	  }
+
+	  /// <summary>
+	  /// Resets this filter and the underlying input so the stream can be re-read. </summary>
+	  public override void reset()
+	  {
+		input.reset();
+		buffer.reset(input);
+		replacement = null;
+		inputOff = 0;
+	  }
+
+	  /// <summary>
+	  /// Returns the next mapped character, or -1 at end of stream. </summary>
+	  public override int read()
+	  {
+		while (true)
+		{
+		  // First drain any pending replacement text from an earlier match.
+		  if (replacement != null && replacementPointer < replacement.length)
+		  {
+			return replacement.chars[replacement.offset + replacementPointer++];
+		  }
+
+		  // TODO: a more efficient approach would be Aho/Corasick's
+		  // algorithm
+		  // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
+		  // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
+		  //
+		  // I think this would be (almost?) equivalent to 1) adding
+		  // epsilon arcs from all final nodes back to the init
+		  // node in the FST, 2) adding a .* (skip any char)
+		  // loop on the initial node, and 3) determinizing
+		  // that.  Then we would not have to restart matching
+		  // at each position.
+
+		  int lastMatchLen = -1;
+		  CharsRef lastMatch = null;
+
+		  int firstCH = buffer.get(inputOff);
+		  if (firstCH != -1)
+		  {
+			// LUCENENET fix: the Java code used Map.get(), which returns null
+			// for a missing key; a C# dictionary indexer throws
+			// KeyNotFoundException instead, so use TryGetValue.
+			FST.Arc<CharsRef> arc;
+			cachedRootArcs.TryGetValue((char) firstCH, out arc);
+			if (arc != null)
+			{
+			  if (!FST.targetHasArcs(arc))
+			  {
+				// Fast pass for single character match:
+				Debug.Assert(arc.Final);
+				lastMatchLen = 1;
+				lastMatch = arc.output;
+			  }
+			  else
+			  {
+				int lookahead = 0;
+				CharsRef output = arc.output;
+				while (true)
+				{
+				  lookahead++;
+
+				  if (arc.Final)
+				  {
+					// Match! (to node is final)
+					lastMatchLen = lookahead;
+					lastMatch = outputs.add(output, arc.nextFinalOutput);
+					// Greedy: keep searching to see if there's a
+					// longer match...
+				  }
+
+				  if (!FST.targetHasArcs(arc))
+				  {
+					break;
+				  }
+
+				  int ch = buffer.get(inputOff + lookahead);
+				  if (ch == -1)
+				  {
+					break;
+				  }
+				  if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null)
+				  {
+					// Dead end
+					break;
+				  }
+				  output = outputs.add(output, arc.output);
+				}
+			  }
+			}
+		  }
+
+		  if (lastMatch != null)
+		  {
+			inputOff += lastMatchLen;
+
+			int diff = lastMatchLen - lastMatch.length;
+
+			if (diff != 0)
+			{
+			  int prevCumulativeDiff = LastCumulativeDiff;
+			  if (diff > 0)
+			  {
+				// Replacement is shorter than matched input:
+				addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
+			  }
+			  else
+			  {
+				// Replacement is longer than matched input: remap
+				// the "extra" chars all back to the same input
+				// offset:
+				int outputStart = inputOff - prevCumulativeDiff;
+				for (int extraIDX = 0;extraIDX < -diff;extraIDX++)
+				{
+				  addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
+				}
+			  }
+			}
+
+			// Start emitting the replacement on the next loop iteration.
+			replacement = lastMatch;
+			replacementPointer = 0;
+
+		  }
+		  else
+		  {
+			// No mapping starts here: pass the character through unchanged and
+			// free buffered chars that can no longer participate in a match.
+			int ret = buffer.get(inputOff);
+			if (ret != -1)
+			{
+			  inputOff++;
+			  buffer.freeBefore(inputOff);
+			}
+			return ret;
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Reads up to <paramref name="len"/> mapped characters into
+	  /// <paramref name="cbuf"/> starting at <paramref name="off"/>;
+	  /// returns the number read, or -1 at end of stream. </summary>
+	  public override int read(char[] cbuf, int off, int len)
+	  {
+		int numRead = 0;
+		for (int i = off; i < off + len; i++)
+		{
+		  int c = read();
+		  if (c == -1)
+		  {
+			  break;
+		  }
+		  cbuf[i] = (char) c;
+		  numRead++;
+		}
+
+		return numRead == 0 ? -1 : numRead;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs
new file mode 100644
index 0000000..4489b7c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/MappingCharFilterFactory.cs
@@ -0,0 +1,184 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Util;
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using AbstractAnalysisFactory = AbstractAnalysisFactory;
+	using CharFilterFactory = org.apache.lucene.analysis.util.CharFilterFactory;
+	using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+	using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+	using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+
+	/// <summary>
+	/// Factory for <seealso cref="MappingCharFilter"/>. 
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_map" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// 
+	/// @since Solr 1.4
+	/// </summary>
+	public class MappingCharFilterFactory : CharFilterFactory, ResourceLoaderAware, MultiTermAwareComponent
+	{
+
+	  protected internal NormalizeCharMap normMap;
+	  private readonly string mapping;
+
+	  /// <summary>
+	  /// Creates a new MappingCharFilterFactory </summary>
+	  public MappingCharFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		mapping = get(args, "mapping");
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  // TODO: this should use inputstreams from the loader, not File!
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		if (mapping != null)
+		{
+		  IList<string> wlist = null;
+		  // LUCENENET fix: java.io.File does not exist in .NET; use
+		  // System.IO.File.Exists for the single-file case.
+		  if (System.IO.File.Exists(mapping))
+		  {
+			wlist = getLines(loader, mapping);
+		  }
+		  else
+		  {
+			// "mapping" is a comma-separated list of files; merge their lines.
+			// LUCENENET fix: "new List<>()" is Java diamond syntax, and
+			// AddRange is not available on IList<string>; build a List<string>
+			// and assign it once populated.
+			IList<string> files = splitFileNames(mapping);
+			List<string> merged = new List<string>();
+			foreach (string file in files)
+			{
+			  IList<string> lines = getLines(loader, file.Trim());
+			  merged.AddRange(lines);
+			}
+			wlist = merged;
+		  }
+		  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
+		  parseRules(wlist, builder);
+		  normMap = builder.build();
+		  if (normMap.map == null)
+		  {
+			// if the inner FST is null, it means it accepts nothing (e.g. the file is empty)
+			// so just set the whole map to null
+			normMap = null;
+		  }
+		}
+	  }
+
+	  public override Reader create(Reader input)
+	  {
+		// if the map is null, it means there's actually no mappings... just return the original stream
+		// as there is nothing to do here.
+		return normMap == null ? input : new MappingCharFilter(normMap,input);
+	  }
+
+	  // "source" => "target"
+	  // LUCENENET fix: java.util.regex.Pattern/Matcher do not exist in .NET;
+	  // use System.Text.RegularExpressions.Regex with the same pattern.
+	  internal static System.Text.RegularExpressions.Regex p = new System.Text.RegularExpressions.Regex("\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$");
+
+	  protected internal virtual void parseRules(IList<string> rules, NormalizeCharMap.Builder builder)
+	  {
+		foreach (string rule in rules)
+		{
+		  // Regex.Match scans the whole string, matching Java's Matcher.find().
+		  System.Text.RegularExpressions.Match m = p.Match(rule);
+		  if (!m.Success)
+		  {
+			throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "], file = " + mapping);
+		  }
+		  builder.add(parseString(m.Groups[1].Value), parseString(m.Groups[2].Value));
+		}
+	  }
+
+	  // Scratch buffer for parseString; rules longer than 256 chars are not expected.
+	  internal char[] @out = new char[256];
+
+	  /// <summary>
+	  /// Decodes Java-style escape sequences (\\ \" \n \t \r \b \f \uXXXX)
+	  /// in a mapping-rule operand. </summary>
+	  protected internal virtual string parseString(string s)
+	  {
+		int readPos = 0;
+		int len = s.Length;
+		int writePos = 0;
+		while (readPos < len)
+		{
+		  char c = s[readPos++];
+		  if (c == '\\')
+		  {
+			if (readPos >= len)
+			{
+			  throw new System.ArgumentException("Invalid escaped char in [" + s + "]");
+			}
+			c = s[readPos++];
+			switch (c)
+			{
+			  case '\\' :
+				  c = '\\';
+				  break;
+			  case '"' :
+				  c = '"';
+				  break;
+			  case 'n' :
+				  c = '\n';
+				  break;
+			  case 't' :
+				  c = '\t';
+				  break;
+			  case 'r' :
+				  c = '\r';
+				  break;
+			  case 'b' :
+				  c = '\b';
+				  break;
+			  case 'f' :
+				  c = '\f';
+				  break;
+			  case 'u' :
+				if (readPos + 3 >= len)
+				{
+				  throw new System.ArgumentException("Invalid escaped char in [" + s + "]");
+				}
+				// LUCENENET fix: int.Parse has no radix overload in .NET;
+				// parse the 4 hex digits with NumberStyles.HexNumber.
+				c = (char)int.Parse(s.Substring(readPos, 4), System.Globalization.NumberStyles.HexNumber);
+				readPos += 4;
+				break;
+			}
+		  }
+		  @out[writePos++] = c;
+		}
+		return new string(@out, 0, writePos);
+	  }
+
+	  public virtual AbstractAnalysisFactory MultiTermComponent
+	  {
+		  get
+		  {
+			return this;
+		  }
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs
new file mode 100644
index 0000000..ade4318
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/NormalizeCharMap.cs
@@ -0,0 +1,162 @@
+using System;
+using System.Diagnostics;
+using System.Collections.Generic;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.charfilter
+{
+
+
+	using CharsRef = org.apache.lucene.util.CharsRef;
+	using IntsRef = org.apache.lucene.util.IntsRef;
+	using Builder = org.apache.lucene.util.fst.Builder;
+	using CharSequenceOutputs = org.apache.lucene.util.fst.CharSequenceOutputs;
+	using FST = org.apache.lucene.util.fst.FST;
+	using Outputs = org.apache.lucene.util.fst.Outputs;
+	using Util = org.apache.lucene.util.fst.Util;
+
+	// TODO: save/load?
+
+	/// <summary>
+	/// Holds a map of String input to String output, to be used
+	/// with <seealso cref="MappingCharFilter"/>.  Use the <seealso cref="Builder"/>
+	/// to create this.
+	/// </summary>
+	public class NormalizeCharMap
+	{
+
+	  internal readonly FST<CharsRef> map;
+	  // Cache of the FST's root arcs, keyed by first char, to speed up matching.
+	  internal readonly IDictionary<char?, FST.Arc<CharsRef>> cachedRootArcs = new Dictionary<char?, FST.Arc<CharsRef>>();
+
+	  // Use the builder to create:
+	  private NormalizeCharMap(FST<CharsRef> map)
+	  {
+		this.map = map;
+		if (map != null)
+		{
+		  try
+		  {
+			// Pre-cache root arcs:
+			FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
+			FST.BytesReader fstReader = map.BytesReader;
+			map.getFirstArc(scratchArc);
+			if (FST.targetHasArcs(scratchArc))
+			{
+			  map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
+			  while (true)
+			  {
+				Debug.Assert(scratchArc.label != FST.END_LABEL);
+				cachedRootArcs[Convert.ToChar((char) scratchArc.label)] = (new FST.Arc<CharsRef>()).copyFrom(scratchArc);
+				if (scratchArc.Last)
+				{
+				  break;
+				}
+				map.readNextRealArc(scratchArc, fstReader);
+			  }
+			}
+		  }
+		  catch (IOException ioe)
+		  {
+			// Bogus FST IOExceptions!!  (will never happen)
+			// LUCENENET fix: Exception has no (Exception) constructor; keep
+			// the cause as the inner exception instead.
+			throw new Exception(ioe.Message, ioe);
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds an NormalizeCharMap.
+	  /// <para>
+	  /// Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap
+	  /// @lucene.experimental
+	  /// </para>
+	  /// </summary>
+	  public class Builder
+	  {
+
+		// Sorted so that build() feeds pairs to the FST builder in key order.
+		internal readonly IDictionary<string, string> pendingPairs = new SortedDictionary<string, string>();
+
+		/// <summary>
+		/// Records a replacement to be applied to the input
+		///  stream.  Whenever <code>singleMatch</code> occurs in
+		///  the input, it will be replaced with
+		///  <code>replacement</code>.
+		/// </summary>
+		/// <param name="match"> input String to be replaced </param>
+		/// <param name="replacement"> output String </param>
+		/// <exception cref="IllegalArgumentException"> if
+		/// <code>match</code> is the empty string, or was
+		/// already previously added </exception>
+		public virtual void add(string match, string replacement)
+		{
+		  if (match.Length == 0)
+		  {
+			throw new System.ArgumentException("cannot match the empty string");
+		  }
+		  if (pendingPairs.ContainsKey(match))
+		  {
+			throw new System.ArgumentException("match \"" + match + "\" was already added");
+		  }
+		  pendingPairs[match] = replacement;
+		}
+
+		/// <summary>
+		/// Builds the NormalizeCharMap; call this once you
+		///  are done calling <seealso cref="#add"/>. 
+		/// </summary>
+		public virtual NormalizeCharMap build()
+		{
+
+		  FST<CharsRef> map;
+		  try
+		  {
+			Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
+			Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
+			IntsRef scratch = new IntsRef();
+			// LUCENENET fix: SetOfKeyValuePairs() was a converter artifact;
+			// a C# dictionary enumerates its KeyValuePairs directly.
+			foreach (KeyValuePair<string, string> ent in pendingPairs)
+			{
+			  builder.add(Util.toUTF16(ent.Key, scratch), new CharsRef(ent.Value));
+			}
+			map = builder.finish();
+			pendingPairs.Clear();
+		  }
+		  catch (IOException ioe)
+		  {
+			// Bogus FST IOExceptions!!  (will never happen)
+			// LUCENENET fix: Exception has no (Exception) constructor; keep
+			// the cause as the inner exception instead.
+			throw new Exception(ioe.Message, ioe);
+		  }
+
+		  return new NormalizeCharMap(map);
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
new file mode 100644
index 0000000..801fd45
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKAnalyzer.cs
@@ -0,0 +1,118 @@
+using System;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using LowerCaseFilter = org.apache.lucene.analysis.core.LowerCaseFilter;
+	using StopFilter = org.apache.lucene.analysis.core.StopFilter;
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using StopwordAnalyzerBase = org.apache.lucene.analysis.util.StopwordAnalyzerBase;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// An <seealso cref="Analyzer"/> that tokenizes text with <seealso cref="StandardTokenizer"/>,
+	/// normalizes content with <seealso cref="CJKWidthFilter"/>, folds case with
+	/// <seealso cref="LowerCaseFilter"/>, forms bigrams of CJK with <seealso cref="CJKBigramFilter"/>,
+	/// and filters stopwords with <seealso cref="StopFilter"/>
+	/// </summary>
+	public sealed class CJKAnalyzer : StopwordAnalyzerBase
+	{
+	  /// <summary>
+	  /// File containing default CJK stopwords.
+	  /// <p/>
+	  /// Currently it contains some common English words that are not usually
+	  /// useful for searching and some double-byte interpunctions.
+	  /// </summary>
+	  public const string DEFAULT_STOPWORD_FILE = "stopwords.txt";
+
+	  /// <summary>
+	  /// Returns an unmodifiable instance of the default stop-words set. </summary>
+	  /// <returns> an unmodifiable instance of the default stop-words set. </returns>
+	  public static CharArraySet DefaultStopSet
+	  {
+		  get
+		  {
+			return DefaultSetHolder.DEFAULT_STOP_SET;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Holder class that lazily loads the default stop set on first access
+	  /// (initialization-on-demand idiom carried over from the Java original).
+	  /// </summary>
+	  private class DefaultSetHolder
+	  {
+		internal static readonly CharArraySet DEFAULT_STOP_SET;
+
+		static DefaultSetHolder()
+		{
+		  try
+		  {
+			DEFAULT_STOP_SET = loadStopwordSet(false, typeof(CJKAnalyzer), DEFAULT_STOPWORD_FILE, "#");
+		  }
+		  catch (IOException e)
+		  {
+			// default set should always be present as it is part of the
+			// distribution (JAR). Preserve the original failure as the inner
+			// exception instead of discarding it, so the root cause survives.
+			throw new Exception("Unable to load default stopword set", e);
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer which removes words in <seealso cref="#getDefaultStopSet()"/>.
+	  /// </summary>
+	  /// <param name="matchVersion"> lucene compatibility version </param>
+	  public CJKAnalyzer(Version matchVersion) : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the given stop words
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          lucene compatibility version </param>
+	  /// <param name="stopwords">
+	  ///          a stopword set </param>
+	  public CJKAnalyzer(Version matchVersion, CharArraySet stopwords) : base(matchVersion, stopwords)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates the tokenizer/filter chain for one field. For 3.6+ this is
+	  /// StandardTokenizer -> CJKWidthFilter -> LowerCaseFilter -> CJKBigramFilter
+	  /// -> StopFilter; older versions fall back to the legacy CJKTokenizer.
+	  /// </summary>
+	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+	  {
+		if (matchVersion.onOrAfter(Version.LUCENE_36))
+		{
+		  Tokenizer source = new StandardTokenizer(matchVersion, reader);
+		  // run the widthfilter first before bigramming, it sometimes combines characters.
+		  TokenStream result = new CJKWidthFilter(source);
+		  result = new LowerCaseFilter(matchVersion, result);
+		  result = new CJKBigramFilter(result);
+		  return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+		}
+		else
+		{
+		  // backwards compatibility: pre-3.6 indexes used the old CJKTokenizer
+		  Tokenizer source = new CJKTokenizer(reader);
+		  return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
new file mode 100644
index 0000000..4ad6f5f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilter.cs
@@ -0,0 +1,420 @@
+namespace org.apache.lucene.analysis.cjk
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using PositionLengthAttribute = org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+	using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+	using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+
+	/// <summary>
+	/// Forms bigrams of CJK terms that are generated from StandardTokenizer
+	/// or ICUTokenizer.
+	/// <para>
+	/// CJK types are set by these tokenizers, but you can also use 
+	/// <seealso cref="#CJKBigramFilter(TokenStream, int)"/> to explicitly control which
+	/// of the CJK scripts are turned into bigrams.
+	/// </para>
+	/// <para>
+	/// By default, when a CJK character has no adjacent characters to form
+	/// a bigram, it is output in unigram form. If you want to always output
+	/// both unigrams and bigrams, set the <code>outputUnigrams</code>
+	/// flag in <seealso cref="CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)"/>.
+	/// This can be used for a combined unigram+bigram approach.
+	/// </para>
+	/// <para>
+	/// In all cases, all non-CJK input is passed thru unmodified.
+	/// </para>
+	/// </summary>
+	public sealed class CJKBigramFilter : TokenFilter
+	{
+	  // configuration
+	  /// <summary>
+	  /// bigram flag for Han Ideographs </summary>
+	  public const int HAN = 1;
+	  /// <summary>
+	  /// bigram flag for Hiragana </summary>
+	  public const int HIRAGANA = 2;
+	  /// <summary>
+	  /// bigram flag for Katakana </summary>
+	  public const int KATAKANA = 4;
+	  /// <summary>
+	  /// bigram flag for Hangul </summary>
+	  public const int HANGUL = 8;
+
+	  /// <summary>
+	  /// when we emit a bigram, its then marked as this type </summary>
+	  public const string DOUBLE_TYPE = "<DOUBLE>";
+	  /// <summary>
+	  /// when we emit a unigram, its then marked as this type </summary>
+	  public const string SINGLE_TYPE = "<SINGLE>";
+
+	  // the types from standardtokenizer
+	  private static readonly string HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
+	  private static readonly string HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
+	  private static readonly string KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
+	  private static readonly string HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];
+
+	  // sentinel value for ignoring a script 
+	  private static readonly object NO = new object();
+
+	  // these are set to either their type or NO if we want to pass them thru
+	  private readonly object doHan;
+	  private readonly object doHiragana;
+	  private readonly object doKatakana;
+	  private readonly object doHangul;
+
+	  // true if we should output unigram tokens always
+	  private readonly bool outputUnigrams;
+	  private bool ngramState; // false = output unigram, true = output bigram
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+	  private readonly PositionIncrementAttribute posIncAtt = addAttribute(typeof(PositionIncrementAttribute));
+	  private readonly PositionLengthAttribute posLengthAtt = addAttribute(typeof(PositionLengthAttribute));
+
+	  // buffers containing codepoint and offsets in parallel
+	  internal int[] buffer = new int[8];
+	  internal int[] startOffset = new int[8];
+	  internal int[] endOffset = new int[8];
+	  // length of valid buffer
+	  internal int bufferLen;
+	  // current buffer index
+	  internal int index;
+
+	  // the last end offset, to determine if we should bigram across tokens
+	  internal int lastEndOffset;
+
+	  private bool exhausted;
+
+	  /// <summary>
+	  /// Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
+	  ///       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
+	  /// </summary>
+	  public CJKBigramFilter(TokenStream @in) : this(@in, HAN | HIRAGANA | KATAKANA | HANGUL)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
+	  ///       CJKBigramFilter(in, flags, false)}
+	  /// </summary>
+	  public CJKBigramFilter(TokenStream @in, int flags) : this(@in, flags, false)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Create a new CJKBigramFilter, specifying which writing systems should be bigrammed,
+	  /// and whether or not unigrams should also be output. </summary>
+	  /// <param name="flags"> OR'ed set from <seealso cref="CJKBigramFilter#HAN"/>, <seealso cref="CJKBigramFilter#HIRAGANA"/>, 
+	  ///        <seealso cref="CJKBigramFilter#KATAKANA"/>, <seealso cref="CJKBigramFilter#HANGUL"/> </param>
+	  /// <param name="outputUnigrams"> true if unigrams for the selected writing systems should also be output.
+	  ///        when this is false, this is only done when there are no adjacent characters to form
+	  ///        a bigram. </param>
+	  public CJKBigramFilter(TokenStream @in, int flags, bool outputUnigrams) : base(@in)
+	  {
+		doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
+		doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
+		doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
+		doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
+		this.outputUnigrams = outputUnigrams;
+	  }
+
+	  /*
+	   * much of this complexity revolves around handling the special case of a 
+	   * "lone cjk character" where cjktokenizer would output a unigram. this 
+	   * is also the only time we ever have to captureState.
+	   */
+	  public override bool incrementToken()
+	  {
+		while (true)
+		{
+		  if (hasBufferedBigram())
+		  {
+
+			// case 1: we have multiple remaining codepoints buffered,
+			// so we can emit a bigram here.
+
+			if (outputUnigrams)
+			{
+
+			  // when also outputting unigrams, we output the unigram first,
+			  // then rewind back to revisit the bigram.
+			  // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C
+			  // the logic in hasBufferedUnigram ensures we output the C, 
+			  // even though it did actually have adjacent CJK characters.
+
+			  if (ngramState)
+			  {
+				flushBigram();
+			  }
+			  else
+			  {
+				flushUnigram();
+				index--;
+			  }
+			  ngramState = !ngramState;
+			}
+			else
+			{
+			  flushBigram();
+			}
+			return true;
+		  }
+		  else if (doNext())
+		  {
+
+			// case 2: look at the token type. should we form any n-grams?
+
+			string type = typeAtt.type();
+			// NOTE(review): intentional reference comparison ported from the
+			// Java original (doXxx is either the NO sentinel or one of the
+			// TOKEN_TYPES strings); relies on the tokenizer handing back those
+			// same string instances -- confirm this holds in the C# port.
+			if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul)
+			{
+
+			  // acceptable CJK type: we form n-grams from these.
+			  // as long as the offsets are aligned, we just add these to our current buffer.
+			  // otherwise, we clear the buffer and start over.
+
+			  if (offsetAtt.startOffset() != lastEndOffset) // unaligned, clear queue
+			  {
+				if (hasBufferedUnigram())
+				{
+
+				  // we have a buffered unigram, and we peeked ahead to see if we could form
+				  // a bigram, but we can't, because the offsets are unaligned. capture the state 
+				  // of this peeked data to be revisited next time thru the loop, and dump our unigram.
+
+				  loneState = captureState();
+				  flushUnigram();
+				  return true;
+				}
+				index = 0;
+				bufferLen = 0;
+			  }
+			  refill();
+			}
+			else
+			{
+
+			  // not a CJK type: we just return these as-is.
+
+			  if (hasBufferedUnigram())
+			  {
+
+				// we have a buffered unigram, and we peeked ahead to see if we could form
+				// a bigram, but we can't, because its not a CJK type. capture the state 
+				// of this peeked data to be revisited next time thru the loop, and dump our unigram.
+
+				loneState = captureState();
+				flushUnigram();
+				return true;
+			  }
+			  return true;
+			}
+		  }
+		  else
+		  {
+
+			// case 3: we have only zero or 1 codepoints buffered, 
+			// so not enough to form a bigram. But, we also have no
+			// more input. So if we have a buffered codepoint, emit
+			// a unigram, otherwise, its end of stream.
+
+			if (hasBufferedUnigram())
+			{
+			  flushUnigram(); // flush our remaining unigram
+			  return true;
+			}
+			return false;
+		  }
+		}
+	  }
+
+	  private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams
+
+	  /// <summary>
+	  /// looks at next input token, returning false is none is available 
+	  /// </summary>
+	  private bool doNext()
+	  {
+		if (loneState != null)
+		{
+		  restoreState(loneState);
+		  loneState = null;
+		  return true;
+		}
+		else
+		{
+		  if (exhausted)
+		  {
+			return false;
+		  }
+		  else if (input.incrementToken())
+		  {
+			return true;
+		  }
+		  else
+		  {
+			exhausted = true;
+			return false;
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// refills buffers with new data from the current token.
+	  /// </summary>
+	  private void refill()
+	  {
+		// compact buffers to keep them smallish if they become large
+		// just a safety check, but technically we only need the last codepoint
+		if (bufferLen > 64)
+		{
+		  int last = bufferLen - 1;
+		  buffer[0] = buffer[last];
+		  startOffset[0] = startOffset[last];
+		  endOffset[0] = endOffset[last];
+		  bufferLen = 1;
+		  index -= last;
+		}
+
+		char[] termBuffer = termAtt.buffer();
+		int len = termAtt.length();
+		int start = offsetAtt.startOffset();
+		int end = offsetAtt.endOffset();
+
+		int newSize = bufferLen + len;
+		buffer = ArrayUtil.grow(buffer, newSize);
+		startOffset = ArrayUtil.grow(startOffset, newSize);
+		endOffset = ArrayUtil.grow(endOffset, newSize);
+		lastEndOffset = end;
+
+		if (end - start != len)
+		{
+		  // crazy offsets (modified by synonym or charfilter): just preserve
+		  for (int i = 0, cp = 0; i < len; i += CharCount(cp))
+		  {
+			cp = buffer[bufferLen] = CodePointAt(termBuffer, i, len);
+			startOffset[bufferLen] = start;
+			endOffset[bufferLen] = end;
+			bufferLen++;
+		  }
+		}
+		else
+		{
+		  // normal offsets
+		  for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen)
+		  {
+			cp = buffer[bufferLen] = CodePointAt(termBuffer, i, len);
+			cpLen = CharCount(cp);
+			startOffset[bufferLen] = start;
+			start = endOffset[bufferLen] = start + cpLen;
+			bufferLen++;
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Flushes a bigram token to output from our buffer 
+	  /// This is the normal case, e.g. ABC -> AB BC
+	  /// </summary>
+	  private void flushBigram()
+	  {
+		clearAttributes();
+		char[] termBuffer = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
+		int len1 = ToChars(buffer[index], termBuffer, 0);
+		int len2 = len1 + ToChars(buffer[index + 1], termBuffer, len1);
+		termAtt.Length = len2;
+		offsetAtt.setOffset(startOffset[index], endOffset[index + 1]);
+		typeAtt.Type = DOUBLE_TYPE;
+		// when outputting unigrams, all bigrams are synonyms that span two unigrams
+		if (outputUnigrams)
+		{
+		  posIncAtt.PositionIncrement = 0;
+		  posLengthAtt.PositionLength = 2;
+		}
+		index++;
+	  }
+
+	  /// <summary>
+	  /// Flushes a unigram token to output from our buffer.
+	  /// This happens when we encounter isolated CJK characters, either the whole
+	  /// CJK string is a single character, or we encounter a CJK character surrounded 
+	  /// by space, punctuation, english, etc, but not beside any other CJK.
+	  /// </summary>
+	  private void flushUnigram()
+	  {
+		clearAttributes();
+		char[] termBuffer = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
+		int len = ToChars(buffer[index], termBuffer, 0);
+		termAtt.Length = len;
+		offsetAtt.setOffset(startOffset[index], endOffset[index]);
+		typeAtt.Type = SINGLE_TYPE;
+		index++;
+	  }
+
+	  /// <summary>
+	  /// True if we have multiple codepoints sitting in our buffer
+	  /// </summary>
+	  private bool hasBufferedBigram()
+	  {
+		return bufferLen - index > 1;
+	  }
+
+	  /// <summary>
+	  /// True if we have a single codepoint sitting in our buffer, where its future
+	  /// (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
+	  /// inputs.
+	  /// </summary>
+	  private bool hasBufferedUnigram()
+	  {
+		if (outputUnigrams)
+		{
+		  // when outputting unigrams always
+		  return bufferLen - index == 1;
+		}
+		else
+		{
+		  // otherwise its only when we have a lone CJK character
+		  return bufferLen == 1 && index == 0;
+		}
+	  }
+
+	  /// <summary>
+	  /// Reads the code point starting at <paramref name="index"/> in <paramref name="chars"/>,
+	  /// the .NET equivalent of Java's Character.codePointAt(char[], int, int): a high
+	  /// surrogate followed by a valid low surrogate before <paramref name="limit"/> combines
+	  /// into one supplementary code point, otherwise the single char value is returned.
+	  /// </summary>
+	  private static int CodePointAt(char[] chars, int index, int limit)
+	  {
+		char high = chars[index];
+		if (char.IsHighSurrogate(high) && index + 1 < limit && char.IsLowSurrogate(chars[index + 1]))
+		{
+		  return char.ConvertToUtf32(high, chars[index + 1]);
+		}
+		return high;
+	  }
+
+	  /// <summary>
+	  /// Number of UTF-16 code units needed to represent the code point
+	  /// (the .NET equivalent of Java's Character.charCount).
+	  /// </summary>
+	  private static int CharCount(int codePoint)
+	  {
+		return codePoint >= 0x10000 ? 2 : 1;
+	  }
+
+	  /// <summary>
+	  /// Writes the code point into <paramref name="dst"/> starting at <paramref name="dstIndex"/>
+	  /// and returns the number of chars written (the .NET equivalent of Java's Character.toChars).
+	  /// </summary>
+	  private static int ToChars(int codePoint, char[] dst, int dstIndex)
+	  {
+		string s = char.ConvertFromUtf32(codePoint);
+		s.CopyTo(0, dst, dstIndex, s.Length);
+		return s.Length;
+	  }
+
+	  public override void reset()
+	  {
+		base.reset();
+		bufferLen = 0;
+		index = 0;
+		lastEndOffset = 0;
+		loneState = null;
+		exhausted = false;
+		ngramState = false;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
new file mode 100644
index 0000000..9783238
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Cjk/CJKBigramFilterFactory.cs
@@ -0,0 +1,79 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.cjk
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = org.apache.lucene.analysis.util.TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="CJKBigramFilter"/>. Each script can be toggled
+	/// individually; all four default to on, and unigram output defaults to off.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
+	///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+	///     &lt;filter class="solr.CJKBigramFilterFactory" 
+	///       han="true" hiragana="true" 
+	///       katakana="true" hangul="true" outputUnigrams="false" /&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class CJKBigramFilterFactory : TokenFilterFactory
+	{
+	  internal readonly int flags;
+	  internal readonly bool outputUnigrams;
+
+	  /// <summary>
+	  /// Creates a new CJKBigramFilterFactory </summary>
+	  public CJKBigramFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		// consume each script argument (defaulting to true) and OR its bit in
+		int enabled = 0;
+		enabled |= getBoolean(args, "han", true) ? CJKBigramFilter.HAN : 0;
+		enabled |= getBoolean(args, "hiragana", true) ? CJKBigramFilter.HIRAGANA : 0;
+		enabled |= getBoolean(args, "katakana", true) ? CJKBigramFilter.KATAKANA : 0;
+		enabled |= getBoolean(args, "hangul", true) ? CJKBigramFilter.HANGUL : 0;
+		this.flags = enabled;
+		this.outputUnigrams = getBoolean(args, "outputUnigrams", false);
+		// anything left over was not recognized by this factory
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps the input stream in a <seealso cref="CJKBigramFilter"/> configured
+	  /// with the flags parsed from the factory arguments. </summary>
+	  public override TokenStream create(TokenStream input)
+	  {
+		TokenStream bigrams = new CJKBigramFilter(input, flags, outputUnigrams);
+		return bigrams;
+	  }
+	}
+
+}
\ No newline at end of file


Mime
View raw message