lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [02/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:06 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Util/TokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/TokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/TokenizerFactory.cs
new file mode 100644
index 0000000..65d7325
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/TokenizerFactory.cs
@@ -0,0 +1,93 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Util
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Abstract parent class for analysis factories that create <seealso cref="Tokenizer"/>
+    /// instances.
+    /// </summary>
+    public abstract class TokenizerFactory : AbstractAnalysisFactory
+    {
+
+        // Shared SPI loader used to discover TokenizerFactory implementations by name.
+        // NOTE(review): the lowercase member calls below (newInstance, lookupClass,
+        // availableServices, reload) look like unconverted Java API names -- confirm
+        // they match the ported C# AnalysisSPILoader<T> surface.
+        private static readonly AnalysisSPILoader<TokenizerFactory> loader = new AnalysisSPILoader<TokenizerFactory>(typeof(TokenizerFactory));
+
+        /// <summary>
+        /// Looks up a tokenizer factory by name and returns a new instance of it,
+        /// initialized from the given key-value pairs. </summary>
+        public static TokenizerFactory ForName(string name, IDictionary<string, string> args)
+        {
+            return loader.newInstance(name, args);
+        }
+
+        /// <summary>
+        /// Looks up a tokenizer factory's <seealso cref="Type"/> by name. </summary>
+        public static Type LookupClass(string name)
+        {
+            return loader.lookupClass(name);
+        }
+
+        /// <summary>
+        /// Returns the names of all available tokenizer factories. </summary>
+        public static HashSet<string> AvailableTokenizers()
+        {
+            return loader.availableServices();
+        }
+
+        /// <summary>
+        /// Reloads the factory list from the given <seealso cref="ClassLoader"/>.
+        /// Changes to the factories are visible after the method ends, all
+        /// iterators (<seealso cref="#availableTokenizers()"/>,...) stay consistent. 
+        /// 
+        /// <para><b>NOTE:</b> Only new factories are added, existing ones are
+        /// never removed or replaced.
+        /// 
+        /// </para>
+        /// <para><em>This method is expensive and should only be called for discovery
+        /// of new factories on the given classpath/classloader!</em>
+        /// </para>
+        /// </summary>
+        // NOTE(review): ClassLoader is a Java concept with no .NET equivalent in
+        // scope here -- this signature presumably needs porting (e.g. to assembly
+        // scanning); verify against the ported AnalysisSPILoader's reload method.
+        public static void ReloadTokenizers(ClassLoader classloader)
+        {
+            loader.reload(classloader);
+        }
+
+        /// <summary>
+        /// Initialize this factory via a set of key-value pairs.
+        /// </summary>
+        protected internal TokenizerFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+        }
+
+        /// <summary>
+        /// Creates a TokenStream of the specified input using the default attribute factory. </summary>
+        public Tokenizer Create(TextReader input)
+        {
+            return Create(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input);
+        }
+
+        /// <summary>
+        /// Creates a TokenStream of the specified input using the given AttributeFactory </summary>
+        public abstract Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input);
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Util/WordlistLoader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/WordlistLoader.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/WordlistLoader.cs
new file mode 100644
index 0000000..baf3975
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/WordlistLoader.cs
@@ -0,0 +1,305 @@
+using System;
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.util
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using IOUtils = org.apache.lucene.util.IOUtils;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Loader for text files that represent a list of stopwords.
+	/// </summary>
+	/// <seealso cref= IOUtils to obtain <seealso cref="Reader"/> instances
+	/// @lucene.internal </seealso>
+	public class WordlistLoader
+	{
+
+	  // Initial capacity for the CharArraySet/CharArrayMap instances created here.
+	  private const int INITIAL_CAPACITY = 16;
+
+	  /// <summary>
+	  /// no instance (static utility class) </summary>
+	  private WordlistLoader()
+	  {
+	  }
+
+	  /// <summary>
+	  /// Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+	  /// leading and trailing whitespace). Every line of the Reader should contain only
+	  /// one word. The words need to be in lowercase if you make use of an
+	  /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+	  /// </summary>
+	  /// <param name="reader"> Reader containing the wordlist </param>
+	  /// <param name="result"> the <seealso cref="CharArraySet"/> to fill with the readers words </param>
+	  /// <returns> the given <seealso cref="CharArraySet"/> with the reader's words </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static CharArraySet getWordSet(java.io.Reader reader, CharArraySet result) throws java.io.IOException
+	  // NOTE(review): Reader/BufferedReader are unconverted java.io types that do
+	  // not exist in .NET -- this raw port presumably needs rewriting around
+	  // System.IO.TextReader before it compiles; the same applies to every
+	  // method below that takes a Reader.
+	  public static CharArraySet getWordSet(Reader reader, CharArraySet result)
+	  {
+		BufferedReader br = null;
+		try
+		{
+		  br = getBufferedReader(reader);
+		  string word = null;
+		  // readLine() returns null at end of stream.
+		  while ((word = br.readLine()) != null)
+		  {
+			result.add(word.Trim());
+		  }
+		}
+		finally
+		{
+		  IOUtils.close(br);
+		}
+		return result;
+	  }
+
+	  /// <summary>
+	  /// Reads lines from a Reader and adds every line as an entry to a CharArraySet (omitting
+	  /// leading and trailing whitespace). Every line of the Reader should contain only
+	  /// one word. The words need to be in lowercase if you make use of an
+	  /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+	  /// </summary>
+	  /// <param name="reader"> Reader containing the wordlist </param>
+	  /// <param name="matchVersion"> the Lucene <seealso cref="Version"/> </param>
+	  /// <returns> A <seealso cref="CharArraySet"/> with the reader's words </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static CharArraySet getWordSet(java.io.Reader reader, org.apache.lucene.util.Version matchVersion) throws java.io.IOException
+	  public static CharArraySet getWordSet(Reader reader, Version matchVersion)
+	  {
+		return getWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
+	  }
+
+	  /// <summary>
+	  /// Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+	  /// leading and trailing whitespace). Every line of the Reader should contain only
+	  /// one word. The words need to be in lowercase if you make use of an
+	  /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+	  /// </summary>
+	  /// <param name="reader"> Reader containing the wordlist </param>
+	  /// <param name="comment"> The string representing a comment. </param>
+	  /// <param name="matchVersion"> the Lucene <seealso cref="Version"/> </param>
+	  /// <returns> A CharArraySet with the reader's words </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static CharArraySet getWordSet(java.io.Reader reader, String comment, org.apache.lucene.util.Version matchVersion) throws java.io.IOException
+	  public static CharArraySet getWordSet(Reader reader, string comment, Version matchVersion)
+	  {
+		return getWordSet(reader, comment, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
+	  }
+
+	  /// <summary>
+	  /// Reads lines from a Reader and adds every non-comment line as an entry to a CharArraySet (omitting
+	  /// leading and trailing whitespace). Every line of the Reader should contain only
+	  /// one word. The words need to be in lowercase if you make use of an
+	  /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
+	  /// </summary>
+	  /// <param name="reader"> Reader containing the wordlist </param>
+	  /// <param name="comment"> The string representing a comment. </param>
+	  /// <param name="result"> the <seealso cref="CharArraySet"/> to fill with the readers words </param>
+	  /// <returns> the given <seealso cref="CharArraySet"/> with the reader's words </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static CharArraySet getWordSet(java.io.Reader reader, String comment, CharArraySet result) throws java.io.IOException
+	  public static CharArraySet getWordSet(Reader reader, string comment, CharArraySet result)
+	  {
+		BufferedReader br = null;
+		try
+		{
+		  br = getBufferedReader(reader);
+		  string word = null;
+		  while ((word = br.readLine()) != null)
+		  {
+			// Skip any line that starts with the comment marker.
+			if (word.StartsWith(comment, StringComparison.Ordinal) == false)
+			{
+			  result.add(word.Trim());
+			}
+		  }
+		}
+		finally
+		{
+		  IOUtils.close(br);
+		}
+		return result;
+	  }
+
+
+	  /// <summary>
+	  /// Reads stopwords from a stopword list in Snowball format.
+	  /// <para>
+	  /// The snowball format is the following:
+	  /// <ul>
+	  /// <li>Lines may contain multiple words separated by whitespace.
+	  /// <li>The comment character is the vertical line (&#124;).
+	  /// <li>Lines may contain trailing comments.
+	  /// </ul>
+	  /// </para>
+	  /// </summary>
+	  /// <param name="reader"> Reader containing a Snowball stopword list </param>
+	  /// <param name="result"> the <seealso cref="CharArraySet"/> to fill with the readers words </param>
+	  /// <returns> the given <seealso cref="CharArraySet"/> with the reader's words </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static CharArraySet getSnowballWordSet(java.io.Reader reader, CharArraySet result) throws java.io.IOException
+	  public static CharArraySet getSnowballWordSet(Reader reader, CharArraySet result)
+	  {
+		BufferedReader br = null;
+		try
+		{
+		  br = getBufferedReader(reader);
+		  string line = null;
+		  while ((line = br.readLine()) != null)
+		  {
+			// Strip trailing comment: everything from '|' to end of line is ignored.
+			int comment = line.IndexOf('|');
+			if (comment >= 0)
+			{
+				line = line.Substring(0, comment);
+			}
+			// NOTE(review): Java's String.split("\\s+") was a regex split; C#
+			// string.Split has no (string, bool) overload -- this presumably needs
+			// Regex.Split(line, "\\s+") (or char-array Split with
+			// RemoveEmptyEntries) when the port is cleaned up.
+			string[] words = line.Split("\\s+", true);
+			for (int i = 0; i < words.Length; i++)
+			{
+			  if (words[i].Length > 0)
+			  {
+				  result.add(words[i]);
+			  }
+			}
+		  }
+		}
+		finally
+		{
+		  IOUtils.close(br);
+		}
+		return result;
+	  }
+
+	  /// <summary>
+	  /// Reads stopwords from a stopword list in Snowball format.
+	  /// <para>
+	  /// The snowball format is the following:
+	  /// <ul>
+	  /// <li>Lines may contain multiple words separated by whitespace.
+	  /// <li>The comment character is the vertical line (&#124;).
+	  /// <li>Lines may contain trailing comments.
+	  /// </ul>
+	  /// </para>
+	  /// </summary>
+	  /// <param name="reader"> Reader containing a Snowball stopword list </param>
+	  /// <param name="matchVersion"> the Lucene <seealso cref="Version"/> </param>
+	  /// <returns> A <seealso cref="CharArraySet"/> with the reader's words </returns>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static CharArraySet getSnowballWordSet(java.io.Reader reader, org.apache.lucene.util.Version matchVersion) throws java.io.IOException
+	  public static CharArraySet getSnowballWordSet(Reader reader, Version matchVersion)
+	  {
+		return getSnowballWordSet(reader, new CharArraySet(matchVersion, INITIAL_CAPACITY, false));
+	  }
+
+
+	  /// <summary>
+	  /// Reads a stem dictionary. Each line contains:
+	  /// <pre>word<b>\t</b>stem</pre>
+	  /// (i.e. two tab separated words)
+	  /// </summary>
+	  /// <returns> stem dictionary that overrules the stemming algorithm </returns>
+	  /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static CharArrayMap<String> getStemDict(java.io.Reader reader, CharArrayMap<String> result) throws java.io.IOException
+	  public static CharArrayMap<string> getStemDict(Reader reader, CharArrayMap<string> result)
+	  {
+		BufferedReader br = null;
+		try
+		{
+		  br = getBufferedReader(reader);
+		  string line;
+		  while ((line = br.readLine()) != null)
+		  {
+			// NOTE(review): Java String.split("\t", 2) limits the result to two
+			// parts; the C# equivalent is line.Split(new[] { '\t' }, 2) -- the
+			// overload used here does not exist. Also note wordstem[1] throws on
+			// malformed lines with no tab.
+			string[] wordstem = line.Split("\t", 2);
+			result.put(wordstem[0], wordstem[1]);
+		  }
+		}
+		finally
+		{
+		  IOUtils.close(br);
+		}
+		return result;
+	  }
+
+	  /// <summary>
+	  /// Accesses a resource by name and returns the (non comment) lines containing
+	  /// data using the given character encoding.
+	  /// 
+	  /// <para>
+	  /// A comment line is any line that starts with the character "#"
+	  /// </para>
+	  /// </summary>
+	  /// <returns> a list of non-blank non-comment lines with whitespace trimmed </returns>
+	  /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public static java.util.List<String> getLines(java.io.InputStream stream, java.nio.charset.Charset charset) throws java.io.IOException
+	  public static IList<string> getLines(InputStream stream, Charset charset)
+	  {
+		BufferedReader input = null;
+		List<string> lines;
+		bool success = false;
+		try
+		{
+		  input = getBufferedReader(IOUtils.getDecodingReader(stream, charset));
+
+		  // NOTE(review): "new List<>()" is Java diamond syntax and is invalid
+		  // C# -- should be new List<string>().
+		  lines = new List<>();
+		  for (string word = null; (word = input.readLine()) != null;)
+		  {
+			// skip initial bom marker
+			if (lines.Count == 0 && word.Length > 0 && word[0] == '\uFEFF')
+			{
+			  word = word.Substring(1);
+			}
+			// skip comments
+			if (word.StartsWith("#", StringComparison.Ordinal))
+			{
+				continue;
+			}
+			word = word.Trim();
+			// skip blank lines
+			// NOTE(review): length() is an unconverted Java call -- C# uses word.Length.
+			if (word.length() == 0)
+			{
+				continue;
+			}
+			lines.Add(word);
+		  }
+		  success = true;
+		  return lines;
+		}
+		finally
+		{
+		  // On success close normally (propagating any close error); on failure
+		  // close while suppressing secondary exceptions so the original wins.
+		  if (success)
+		  {
+			IOUtils.close(input);
+		  }
+		  else
+		  {
+			IOUtils.closeWhileHandlingException(input);
+		  }
+		}
+	  }
+
+	  // Wraps the reader in a BufferedReader unless it already is one.
+	  private static BufferedReader getBufferedReader(Reader reader)
+	  {
+		return (reader is BufferedReader) ? (BufferedReader) reader : new BufferedReader(reader);
+	  }
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs
new file mode 100644
index 0000000..1fd76f8
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizer.cs
@@ -0,0 +1,343 @@
+using System.Collections.Generic;
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.wikipedia
+{
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using FlagsAttribute = org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+	using AttributeSource = org.apache.lucene.util.AttributeSource;
+
+
+
+	/// <summary>
+	/// Extension of StandardTokenizer that is aware of Wikipedia syntax.  It is based off of the
+	/// Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
+	/// <p/>
+	/// <p/>
+	/// @lucene.experimental
+	/// </summary>
+	public sealed class WikipediaTokenizer : Tokenizer
+	{
+	  public const string INTERNAL_LINK = "il";
+	  public const string EXTERNAL_LINK = "el";
+	  //The URL part of the link, i.e. the first token
+	  public const string EXTERNAL_LINK_URL = "elu";
+	  public const string CITATION = "ci";
+	  public const string CATEGORY = "c";
+	  public const string BOLD = "b";
+	  public const string ITALICS = "i";
+	  public const string BOLD_ITALICS = "bi";
+	  public const string HEADING = "h";
+	  public const string SUB_HEADING = "sh";
+
+	  // Integer ids corresponding to the string token types above; they index
+	  // into TOKEN_TYPES below, so the two lists must stay in sync.
+	  public const int ALPHANUM_ID = 0;
+	  public const int APOSTROPHE_ID = 1;
+	  public const int ACRONYM_ID = 2;
+	  public const int COMPANY_ID = 3;
+	  public const int EMAIL_ID = 4;
+	  public const int HOST_ID = 5;
+	  public const int NUM_ID = 6;
+	  public const int CJ_ID = 7;
+	  public const int INTERNAL_LINK_ID = 8;
+	  public const int EXTERNAL_LINK_ID = 9;
+	  public const int CITATION_ID = 10;
+	  public const int CATEGORY_ID = 11;
+	  public const int BOLD_ID = 12;
+	  public const int ITALICS_ID = 13;
+	  public const int BOLD_ITALICS_ID = 14;
+	  public const int HEADING_ID = 15;
+	  public const int SUB_HEADING_ID = 16;
+	  public const int EXTERNAL_LINK_URL_ID = 17;
+
+	  /// <summary>
+	  /// String token types that correspond to token type int constants </summary>
+	  public static readonly string[] TOKEN_TYPES = new string [] {"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", INTERNAL_LINK, EXTERNAL_LINK, CITATION, CATEGORY, BOLD, ITALICS, BOLD_ITALICS, HEADING, SUB_HEADING, EXTERNAL_LINK_URL};
+
+	  /// <summary>
+	  /// Only output tokens
+	  /// </summary>
+	  public const int TOKENS_ONLY = 0;
+	  /// <summary>
+	  /// Only output untokenized tokens, which are tokens that would normally be split into several tokens
+	  /// </summary>
+	  public const int UNTOKENIZED_ONLY = 1;
+	  /// <summary>
+	  /// Output both the untokenized token and the splits
+	  /// </summary>
+	  public const int BOTH = 2;
+	  /// <summary>
+	  /// This flag is used to indicate that the produced "Token" would, if <seealso cref="#TOKENS_ONLY"/> was used, produce multiple tokens.
+	  /// </summary>
+	  public const int UNTOKENIZED_TOKEN_FLAG = 1;
+	  /// <summary>
+	  /// A private instance of the JFlex-constructed scanner
+	  /// </summary>
+	  private readonly WikipediaTokenizerImpl scanner;
+
+	  private int tokenOutput = TOKENS_ONLY;
+	  // NOTE(review): java.util.Collections.emptySet() is unconverted Java --
+	  // should become an empty HashSet<string> (or a shared readonly instance).
+	  private HashSet<string> untokenizedTypes = java.util.Collections.emptySet();
+	  // Buffered per-token attribute states replayed in BOTH mode; null when empty.
+	  private IEnumerator<AttributeSource.State> tokens = null;
+
+	  // NOTE(review): C# does not allow instance-method calls (addAttribute) in
+	  // field initializers (error CS0236) -- these presumably need to move into
+	  // the constructors during cleanup.
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+	  private readonly TypeAttribute typeAtt = addAttribute(typeof(TypeAttribute));
+	  private readonly PositionIncrementAttribute posIncrAtt = addAttribute(typeof(PositionIncrementAttribute));
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly FlagsAttribute flagsAtt = addAttribute(typeof(FlagsAttribute));
+
+	  // True until the first token has been emitted; used in incrementToken to
+	  // avoid emitting a position increment of 0 for the first token.
+	  private bool first;
+
+	  /// <summary>
+	  /// Creates a new instance of the <seealso cref="WikipediaTokenizer"/>. Attaches the
+	  /// <code>input</code> to a newly created JFlex scanner.
+	  /// </summary>
+	  /// <param name="input"> The Input Reader </param>
+	  // NOTE(review): Reader parameters here are unconverted java.io types --
+	  // port to System.IO.TextReader along with the Tokenizer base class.
+	  public WikipediaTokenizer(Reader input) : this(input, TOKENS_ONLY, System.Linq.Enumerable.Empty<string>())
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates a new instance of the <seealso cref="org.apache.lucene.analysis.wikipedia.WikipediaTokenizer"/>.  Attaches the
+	  /// <code>input</code> to a the newly created JFlex scanner.
+	  /// </summary>
+	  /// <param name="input"> The input </param>
+	  /// <param name="tokenOutput"> One of <seealso cref="#TOKENS_ONLY"/>, <seealso cref="#UNTOKENIZED_ONLY"/>, <seealso cref="#BOTH"/> </param>
+	  public WikipediaTokenizer(Reader input, int tokenOutput, HashSet<string> untokenizedTypes) : base(input)
+	  {
+		this.scanner = new WikipediaTokenizerImpl(this.input);
+		init(tokenOutput, untokenizedTypes);
+	  }
+
+	  /// <summary>
+	  /// Creates a new instance of the <seealso cref="org.apache.lucene.analysis.wikipedia.WikipediaTokenizer"/>.  Attaches the
+	  /// <code>input</code> to a the newly created JFlex scanner. Uses the given <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>.
+	  /// </summary>
+	  /// <param name="input"> The input </param>
+	  /// <param name="tokenOutput"> One of <seealso cref="#TOKENS_ONLY"/>, <seealso cref="#UNTOKENIZED_ONLY"/>, <seealso cref="#BOTH"/> </param>
+	  public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, HashSet<string> untokenizedTypes) : base(factory, input)
+	  {
+		this.scanner = new WikipediaTokenizerImpl(this.input);
+		init(tokenOutput, untokenizedTypes);
+	  }
+
+	  // Shared constructor body: validates and stores the output mode and the
+	  // set of token types to emit untokenized.
+	  private void init(int tokenOutput, HashSet<string> untokenizedTypes)
+	  {
+		// TODO: cutover to enum
+		if (tokenOutput != TOKENS_ONLY && tokenOutput != UNTOKENIZED_ONLY && tokenOutput != BOTH)
+		{
+		  throw new System.ArgumentException("tokenOutput must be TOKENS_ONLY, UNTOKENIZED_ONLY or BOTH");
+		}
+		this.tokenOutput = tokenOutput;
+		this.untokenizedTypes = untokenizedTypes;
+	  }
+
+	  /*
+	  * (non-Javadoc)
+	  *
+	  * @see org.apache.lucene.analysis.TokenStream#next()
+	  */
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public final boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		// First drain any token states buffered by collapseAndSaveTokens (BOTH mode).
+//JAVA TO C# CONVERTER TODO TASK: Java iterators are only converted within the context of 'while' and 'for' loops:
+		// NOTE(review): hasNext()/next() are Java Iterator calls -- on
+		// IEnumerator<T> this must become MoveNext()/Current.
+		if (tokens != null && tokens.hasNext())
+		{
+//JAVA TO C# CONVERTER TODO TASK: Java iterators are only converted within the context of 'while' and 'for' loops:
+		  AttributeSource.State state = tokens.next();
+		  restoreState(state);
+		  return true;
+		}
+		clearAttributes();
+		int tokenType = scanner.NextToken;
+
+		if (tokenType == WikipediaTokenizerImpl.YYEOF)
+		{
+		  return false;
+		}
+		string type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
+		if (tokenOutput == TOKENS_ONLY || untokenizedTypes.Contains(type) == false)
+		{
+		  setupToken();
+		}
+		else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.Contains(type) == true)
+		{
+		  collapseTokens(tokenType);
+
+		}
+		else if (tokenOutput == BOTH)
+		{
+		  //collapse into a single token, add it to tokens AND output the individual tokens
+		  //output the untokenized Token first
+		  collapseAndSaveTokens(tokenType, type);
+		}
+		int posinc = scanner.PositionIncrement;
+		if (first && posinc == 0)
+		{
+		  posinc = 1; // don't emit posinc=0 for the first token!
+		}
+		posIncrAtt.PositionIncrement = posinc;
+		typeAtt.Type = type;
+		first = false;
+		return true;
+	  }
+
+	  // Collapses a run of same-typed wiki tokens into one whitespace-joined
+	  // token, while also capturing each individual token's state into `tokens`
+	  // so BOTH mode can replay them after the collapsed token is emitted.
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private void collapseAndSaveTokens(int tokenType, String type) throws java.io.IOException
+	  private void collapseAndSaveTokens(int tokenType, string type)
+	  {
+		//collapse
+		StringBuilder buffer = new StringBuilder(32);
+		int numAdded = scanner.setText(buffer);
+		//TODO: how to know how much whitespace to add
+		int theStart = scanner.yychar();
+		int lastPos = theStart + numAdded;
+		int tmpTokType;
+		int numSeen = 0;
+		IList<AttributeSource.State> tmp = new List<AttributeSource.State>();
+		setupSavedToken(0, type);
+		tmp.Add(captureState());
+		//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
+		while ((tmpTokType = scanner.NextToken) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.NumWikiTokensSeen > numSeen)
+		{
+		  int currPos = scanner.yychar();
+		  //append whitespace
+		  for (int i = 0; i < (currPos - lastPos); i++)
+		  {
+			buffer.Append(' ');
+		  }
+		  numAdded = scanner.setText(buffer);
+		  setupSavedToken(scanner.PositionIncrement, type);
+		  tmp.Add(captureState());
+		  numSeen++;
+		  lastPos = currPos + numAdded;
+		}
+		//trim the buffer
+		// TODO: this is inefficient
+		string s = buffer.ToString().Trim();
+		termAtt.setEmpty().append(s);
+		offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.Length));
+		flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;
+		//The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
+		if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
+		{
+		  scanner.yypushback(scanner.yylength());
+		}
+		tokens = tmp.GetEnumerator();
+	  }
+
+	  // Captures the current token's attributes plus the given position
+	  // increment and type, so the state can be saved and replayed later.
+	  private void setupSavedToken(int positionInc, string type)
+	  {
+		setupToken();
+		posIncrAtt.PositionIncrement = positionInc;
+		typeAtt.Type = type;
+	  }
+
+	  // Collapses a run of same-typed wiki tokens into a single
+	  // whitespace-joined token without saving the individual states
+	  // (UNTOKENIZED_ONLY mode).
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: private void collapseTokens(int tokenType) throws java.io.IOException
+	  private void collapseTokens(int tokenType)
+	  {
+		//collapse
+		StringBuilder buffer = new StringBuilder(32);
+		int numAdded = scanner.setText(buffer);
+		//TODO: how to know how much whitespace to add
+		int theStart = scanner.yychar();
+		int lastPos = theStart + numAdded;
+		int tmpTokType;
+		int numSeen = 0;
+		//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
+		while ((tmpTokType = scanner.NextToken) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.NumWikiTokensSeen > numSeen)
+		{
+		  int currPos = scanner.yychar();
+		  //append whitespace
+		  for (int i = 0; i < (currPos - lastPos); i++)
+		  {
+			buffer.Append(' ');
+		  }
+		  numAdded = scanner.setText(buffer);
+		  numSeen++;
+		  lastPos = currPos + numAdded;
+		}
+		//trim the buffer
+		// TODO: this is inefficient
+		string s = buffer.ToString().Trim();
+		termAtt.setEmpty().append(s);
+		offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.Length));
+		flagsAtt.Flags = UNTOKENIZED_TOKEN_FLAG;
+		//The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
+		if (tmpTokType != WikipediaTokenizerImpl.YYEOF)
+		{
+		  scanner.yypushback(scanner.yylength());
+		}
+		else
+		{
+		  tokens = null;
+		}
+	  }
+
+	  // Copies the scanner's current token text and corrected offsets into the
+	  // term and offset attributes.
+	  private void setupToken()
+	  {
+		scanner.getText(termAtt);
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int start = scanner.yychar();
+		int start = scanner.yychar();
+		offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void close() throws java.io.IOException
+	  public override void close()
+	  {
+		base.close();
+		scanner.yyreset(input);
+	  }
+
+	  /*
+	  * (non-Javadoc)
+	  *
+	  * @see org.apache.lucene.analysis.TokenStream#reset()
+	  */
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  public override void reset()
+	  {
+		// Reset the scanner on the (possibly new) input, drop any buffered
+		// collapsed-token states, and re-arm the first-token posinc guard.
+		base.reset();
+		scanner.yyreset(input);
+		tokens = null;
+		scanner.reset();
+		first = true;
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void end() throws java.io.IOException
+	  public override void end()
+	  {
+		base.end();
+		// set final offset
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+		int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
+		this.offsetAtt.setOffset(finalOffset, finalOffset);
+	  }
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs
new file mode 100644
index 0000000..ad7027f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Wikipedia/WikipediaTokenizerFactory.cs
@@ -0,0 +1,57 @@
+using System.Collections.Generic;
+using TokenizerFactory = Lucene.Net.Analysis.Util.TokenizerFactory;
+
+namespace org.apache.lucene.analysis.wikipedia
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using TokenizerFactory = TokenizerFactory;
+	using AttributeFactory = org.apache.lucene.util.AttributeSource.AttributeFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="WikipediaTokenizer"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_wiki" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WikipediaTokenizerFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class WikipediaTokenizerFactory : TokenizerFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new WikipediaTokenizerFactory </summary>
+	  /// <param name="args"> factory configuration; this factory accepts no
+	  /// parameters of its own, so anything left after base-class consumption
+	  /// is an error </param>
+	  /// <exception cref="System.ArgumentException"> if unconsumed parameters remain </exception>
+	  public WikipediaTokenizerFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  // TODO: add support for WikipediaTokenizer's advanced options.
+	  /// <summary>
+	  /// Creates a WikipediaTokenizer in TOKENS_ONLY mode with an empty
+	  /// untokenized-types set.
+	  /// NOTE(review): "Reader" here is the raw-ported java.io.Reader type —
+	  /// confirm replacement with System.IO.TextReader when the port is cleaned up.
+	  /// </summary>
+	  public override WikipediaTokenizer create(AttributeFactory factory, Reader input)
+	  {
+		return new WikipediaTokenizer(factory, input, WikipediaTokenizer.TOKENS_ONLY, System.Linq.Enumerable.Empty<string>());
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Collation/CollationAttributeFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Collation/CollationAttributeFactory.cs b/src/Lucene.Net.Analysis.Common/Collation/CollationAttributeFactory.cs
new file mode 100644
index 0000000..cfdba3e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Collation/CollationAttributeFactory.cs
@@ -0,0 +1,99 @@
+using System;
+using Lucene.Net.Util;
+using org.apache.lucene.collation.tokenattributes;
+
+namespace Lucene.Net.Collation
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// <para>
+	///   Converts each token into its <seealso cref="java.text.CollationKey"/>, and then
+	///   encodes the bytes as an index term.
+	/// </para>
+	/// <para>
+	///   <strong>WARNING:</strong> Make sure you use exactly the same Collator at
+	///   index and query time -- CollationKeys are only comparable when produced by
+	///   the same Collator.  Since <seealso cref="java.text.RuleBasedCollator"/>s are not
+	///   independently versioned, it is unsafe to search against stored
+	///   CollationKeys unless the following are exactly the same (best practice is
+	///   to store this information with the index and check that they remain the
+	///   same at query time):
+	/// </para>
+	/// <ol>
+	///   <li>JVM vendor</li>
+	///   <li>JVM version, including patch version</li>
+	///   <li>
+	///     The language (and country and variant, if specified) of the Locale
+	///     used when constructing the collator via
+	///     <seealso cref="Collator#getInstance(java.util.Locale)"/>.
+	///   </li>
+	///   <li>
+	///     The collation strength used - see <seealso cref="Collator#setStrength(int)"/>
+	///   </li>
+	/// </ol> 
+	/// <para>
+	///   The <code>ICUCollationAttributeFactory</code> in the analysis-icu package 
+	///   uses ICU4J's Collator, which makes its
+	///   version available, thus allowing collation to be versioned independently
+	///   from the JVM.  ICUCollationAttributeFactory is also significantly faster and
+	///   generates significantly shorter keys than CollationAttributeFactory.  See
+	///   <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
+	///   >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
+	///   generation timing and key length comparisons between ICU4J and
+	///   java.text.Collator over several languages.
+	/// </para>
+	/// <para>
+	///   CollationKeys generated by java.text.Collators are not compatible
+	///   with those generated by ICU Collators.  Specifically, if you use 
+	///   CollationAttributeFactory to generate index terms, do not use
+	///   ICUCollationAttributeFactory on the query side, or vice versa.
+	/// </para>
+	/// </summary>
+	public class CollationAttributeFactory : AttributeSource.AttributeFactory
+	{
+	  private readonly Collator collator;
+	  private readonly AttributeSource.AttributeFactory @delegate;
+
+	  /// <summary>
+	  /// Create a CollationAttributeFactory, using 
+	  /// <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory#DEFAULT_ATTRIBUTE_FACTORY"/> as the
+	  /// factory for all other attributes. </summary>
+	  /// <param name="collator"> CollationKey generator </param>
+	  public CollationAttributeFactory(Collator collator) : this(AttributeSource.AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, collator)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Create a CollationAttributeFactory, using the supplied Attribute Factory 
+	  /// as the factory for all other attributes. </summary>
+	  /// <param name="delegate"> Attribute Factory </param>
+	  /// <param name="collator"> CollationKey generator </param>
+	  public CollationAttributeFactory(AttributeSource.AttributeFactory @delegate, Collator collator)
+	  {
+		this.@delegate = @delegate;
+		this.collator = collator;
+	  }
+
+	  /// <summary>
+	  /// Returns a CollatedTermAttributeImpl for any attribute type it can
+	  /// satisfy; all other attribute requests are forwarded to the wrapped
+	  /// factory.
+	  /// </summary>
+	  public override AttributeImpl CreateAttributeInstance(Type attClass)
+	  {
+		// Java's attClass.isAssignableFrom(CollatedTermAttributeImpl.class) maps to
+		// attClass.IsAssignableFrom(typeof(CollatedTermAttributeImpl)) in .NET.
+		// The raw-ported typeof(...).IsSubclassOf(attClass) reversed the arguments,
+		// excluded the exact-type case, and never matches interface attribute types,
+		// so the check always fell through to the delegate.
+		return attClass.IsAssignableFrom(typeof(CollatedTermAttributeImpl)) ? new CollatedTermAttributeImpl(collator) : @delegate.CreateAttributeInstance(attClass);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Collation/CollationKeyAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Collation/CollationKeyAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Collation/CollationKeyAnalyzer.cs
new file mode 100644
index 0000000..06fb9e0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Collation/CollationKeyAnalyzer.cs
@@ -0,0 +1,129 @@
+using System;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Collation;
+
+namespace org.apache.lucene.collation
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using Analyzer = org.apache.lucene.analysis.Analyzer;
+	using KeywordTokenizer = KeywordTokenizer;
+	using IndexableBinaryStringTools = org.apache.lucene.util.IndexableBinaryStringTools; // javadoc @link
+	using Version = org.apache.lucene.util.Version;
+
+
+	/// <summary>
+	/// <para>
+	///   Configures <seealso cref="KeywordTokenizer"/> with <seealso cref="CollationAttributeFactory"/>.
+	/// </para>
+	/// <para>
+	///   Converts the token into its <seealso cref="java.text.CollationKey"/>, and then
+	///   encodes the CollationKey either directly or with 
+	///   <seealso cref="IndexableBinaryStringTools"/> (see <a href="#version">below</a>), to allow 
+	///   it to be stored as an index term.
+	/// </para>
+	/// <para>
+	///   <strong>WARNING:</strong> Make sure you use exactly the same Collator at
+	///   index and query time -- CollationKeys are only comparable when produced by
+	///   the same Collator.  Since <seealso cref="java.text.RuleBasedCollator"/>s are not
+	///   independently versioned, it is unsafe to search against stored
+	///   CollationKeys unless the following are exactly the same (best practice is
+	///   to store this information with the index and check that they remain the
+	///   same at query time):
+	/// </para>
+	/// <ol>
+	///   <li>JVM vendor</li>
+	///   <li>JVM version, including patch version</li>
+	///   <li>
+	///     The language (and country and variant, if specified) of the Locale
+	///     used when constructing the collator via
+	///     <seealso cref="Collator#getInstance(java.util.Locale)"/>.
+	///   </li>
+	///   <li>
+	///     The collation strength used - see <seealso cref="Collator#setStrength(int)"/>
+	///   </li>
+	/// </ol> 
+	/// <para>
+	///   The <code>ICUCollationKeyAnalyzer</code> in the analysis-icu package 
+	///   uses ICU4J's Collator, which makes its
+	///   its version available, thus allowing collation to be versioned
+	///   independently from the JVM.  ICUCollationKeyAnalyzer is also significantly
+	///   faster and generates significantly shorter keys than CollationKeyAnalyzer.
+	///   See <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
+	///   >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
+	///   generation timing and key length comparisons between ICU4J and
+	///   java.text.Collator over several languages.
+	/// </para>
+	/// <para>
+	///   CollationKeys generated by java.text.Collators are not compatible
+	///   with those generated by ICU Collators.  Specifically, if you use 
+	///   CollationKeyAnalyzer to generate index terms, do not use
+	///   ICUCollationKeyAnalyzer on the query side, or vice versa.
+	/// </para>
+	/// <a name="version"/>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating CollationKeyAnalyzer:
+	/// <ul>
+	///   <li> As of 4.0, Collation Keys are directly encoded as bytes. Previous
+	///   versions will encode the bytes with <seealso cref="IndexableBinaryStringTools"/>.
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public sealed class CollationKeyAnalyzer : Analyzer
+	{
+	  private readonly Collator collator;
+	  private readonly CollationAttributeFactory factory;
+	  private readonly Version matchVersion;
+
+	  /// <summary>
+	  /// Create a new CollationKeyAnalyzer, using the specified collator.
+	  /// </summary>
+	  /// <param name="matchVersion"> See <a href="#version">above</a> </param>
+	  /// <param name="collator"> CollationKey generator </param>
+	  public CollationKeyAnalyzer(Version matchVersion, Collator collator)
+	  {
+		this.matchVersion = matchVersion;
+		this.collator = collator;
+		this.factory = new CollationAttributeFactory(collator);
+	  }
+
+	  /// @deprecated Use <seealso cref="CollationKeyAnalyzer#CollationKeyAnalyzer(Version, Collator)"/>
+	  ///   and specify a version instead. This ctor will be removed in Lucene 5.0 
+	  // NOTE: the raw-ported attribute text embedded unescaped quotes
+	  // (cref="...") inside the string literal, which does not compile.
+	  [Obsolete("Use CollationKeyAnalyzer(Version, Collator) and specify a version instead. This ctor will be removed in Lucene 5.0.")]
+	  public CollationKeyAnalyzer(Collator collator) : this(Version.LUCENE_31, collator)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds the per-field components: as of 4.0, a KeywordTokenizer created
+	  /// with the collation attribute factory (keys encoded directly as bytes);
+	  /// for earlier versions, a plain KeywordTokenizer wrapped in the deprecated
+	  /// CollationKeyFilter.
+	  /// </summary>
+	  protected internal override TokenStreamComponents createComponents(string fieldName, Reader reader)
+	  {
+		if (matchVersion.onOrAfter(Version.LUCENE_40))
+		{
+		  KeywordTokenizer tokenizer = new KeywordTokenizer(factory, reader, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+		  return new TokenStreamComponents(tokenizer, tokenizer);
+		}
+		else
+		{
+		  KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+		  return new TokenStreamComponents(tokenizer, new CollationKeyFilter(tokenizer, collator));
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilter.cs b/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilter.cs
new file mode 100644
index 0000000..a098632
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilter.cs
@@ -0,0 +1,112 @@
+using System;
+
+namespace org.apache.lucene.collation
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using TokenFilter = org.apache.lucene.analysis.TokenFilter;
+	using TokenStream = org.apache.lucene.analysis.TokenStream;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using IndexableBinaryStringTools = org.apache.lucene.util.IndexableBinaryStringTools;
+
+
+
+	/// <summary>
+	/// <para>
+	///   Converts each token into its <seealso cref="java.text.CollationKey"/>, and then
+	///   encodes the CollationKey with <seealso cref="IndexableBinaryStringTools"/>, to allow 
+	///   it to be stored as an index term.
+	/// </para>
+	/// <para>
+	///   <strong>WARNING:</strong> Make sure you use exactly the same Collator at
+	///   index and query time -- CollationKeys are only comparable when produced by
+	///   the same Collator.  Since <seealso cref="java.text.RuleBasedCollator"/>s are not
+	///   independently versioned, it is unsafe to search against stored
+	///   CollationKeys unless the following are exactly the same (best practice is
+	///   to store this information with the index and check that they remain the
+	///   same at query time):
+	/// </para>
+	/// <ol>
+	///   <li>JVM vendor</li>
+	///   <li>JVM version, including patch version</li>
+	///   <li>
+	///     The language (and country and variant, if specified) of the Locale
+	///     used when constructing the collator via
+	///     <seealso cref="Collator#getInstance(java.util.Locale)"/>.
+	///   </li>
+	///   <li>
+	///     The collation strength used - see <seealso cref="Collator#setStrength(int)"/>
+	///   </li>
+	/// </ol> 
+	/// <para>
+	///   The <code>ICUCollationKeyFilter</code> in the analysis-icu package 
+	///   uses ICU4J's Collator, which makes its
+	///   version available, thus allowing collation to be versioned independently
+	///   from the JVM.  ICUCollationKeyFilter is also significantly faster and
+	///   generates significantly shorter keys than CollationKeyFilter.  See
+	///   <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
+	///   >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
+	///   generation timing and key length comparisons between ICU4J and
+	///   java.text.Collator over several languages.
+	/// </para>
+	/// <para>
+	///   CollationKeys generated by java.text.Collators are not compatible
+	///   with those generated by ICU Collators.  Specifically, if you use 
+	///   CollationKeyFilter to generate index terms, do not use
+	///   ICUCollationKeyFilter on the query side, or vice versa.
+	/// </para> </summary>
+	/// @deprecated Use <seealso cref="CollationAttributeFactory"/> instead, which encodes
+	///  terms directly as bytes. This filter will be removed in Lucene 5.0 
+	// NOTE: the raw-ported attribute text was truncated mid-sentence and embedded
+	// unescaped quotes (cref="...") inside the string literal, which does not compile.
+	[Obsolete("Use CollationAttributeFactory instead, which encodes terms directly as bytes. This filter will be removed in Lucene 5.0.")]
+	public sealed class CollationKeyFilter : TokenFilter
+	{
+	  private readonly Collator collator;
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+
+	  /// <param name="input"> Source token stream </param>
+	  /// <param name="collator"> CollationKey generator </param>
+	  public CollationKeyFilter(TokenStream input, Collator collator) : base(input)
+	  {
+		// clone in case JRE doesnt properly sync,
+		// or to reduce contention in case they do
+		this.collator = (Collator) collator.clone();
+	  }
+
+	  /// <summary>
+	  /// Replaces the term text of each incoming token with its collation key,
+	  /// encoded via IndexableBinaryStringTools so it can be stored as an index
+	  /// term.
+	  /// </summary>
+	  /// <returns> true if a token was produced, false at end of stream </returns>
+	  public override bool incrementToken()
+	  {
+		if (input.incrementToken())
+		{
+		  sbyte[] collationKey = collator.getCollationKey(termAtt.ToString()).toByteArray();
+		  int encodedLength = IndexableBinaryStringTools.getEncodedLength(collationKey, 0, collationKey.Length);
+		  termAtt.resizeBuffer(encodedLength);
+		  termAtt.Length = encodedLength;
+		  IndexableBinaryStringTools.encode(collationKey, 0, collationKey.Length, termAtt.buffer(), 0, encodedLength);
+		  return true;
+		}
+		else
+		{
+		  return false;
+		}
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilterFactory.cs
new file mode 100644
index 0000000..7396e1f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Collation/CollationKeyFilterFactory.cs
@@ -0,0 +1,254 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Lucene.Net.Analysis.Util;
+
+namespace org.apache.lucene.collation
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using TokenStream = org.apache.lucene.analysis.TokenStream;
+	using org.apache.lucene.analysis.util;
+	using IOUtils = org.apache.lucene.util.IOUtils;
+
+	/// <summary>
+	/// Factory for <seealso cref="CollationKeyFilter"/>.
+	/// <para>
+	/// This factory can be created in two ways: 
+	/// <ul>
+	///  <li>Based upon a system collator associated with a Locale.
+	///  <li>Based upon a tailored ruleset.
+	/// </ul>
+	/// </para>
+	/// <para>
+	/// Using a System collator:
+	/// <ul>
+	///  <li>language: ISO-639 language code (mandatory)
+	///  <li>country: ISO-3166 country code (optional)
+	///  <li>variant: vendor or browser-specific code (optional)
+	///  <li>strength: 'primary','secondary','tertiary', or 'identical' (optional)
+	///  <li>decomposition: 'no','canonical', or 'full' (optional)
+	/// </ul>
+	/// </para>
+	/// <para>
+	/// Using a Tailored ruleset:
+	/// <ul>
+	///  <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
+	///  <li>strength: 'primary','secondary','tertiary', or 'identical' (optional)
+	///  <li>decomposition: 'no','canonical', or 'full' (optional)
+	/// </ul>
+	/// 
+	/// <pre class="prettyprint" >
+	/// &lt;fieldType name="text_clltnky" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.KeywordTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.CollationKeyFilterFactory" language="ja" country="JP"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// 
+	/// </para>
+	/// </summary>
+	/// <seealso cref= Collator </seealso>
+	/// <seealso cref= Locale </seealso>
+	/// <seealso cref= RuleBasedCollator
+	/// @since solr 3.1 </seealso>
+	/// @deprecated use <seealso cref="CollationKeyAnalyzer"/> instead. 
+	// NOTE: the raw-ported attribute text embedded unescaped quotes (cref="...")
+	// inside the string literal, which does not compile.
+	[Obsolete("use CollationKeyAnalyzer instead.")]
+	public class CollationKeyFilterFactory : TokenFilterFactory, MultiTermAwareComponent, ResourceLoaderAware
+	{
+	  private Collator collator;
+	  private readonly string custom;
+	  private readonly string language;
+	  private readonly string country;
+	  private readonly string variant;
+	  private readonly string strength;
+	  private readonly string decomposition;
+
+	  /// <summary>
+	  /// Creates a new CollationKeyFilterFactory. Exactly one of the "custom"
+	  /// (tailored ruleset file) or "language" (system locale) parameters is
+	  /// required; country/variant/strength/decomposition are optional refinements.
+	  /// </summary>
+	  /// <param name="args"> configuration parameters; recognized keys are consumed </param>
+	  /// <exception cref="System.ArgumentException"> if neither or both creation styles
+	  /// are specified, or if unknown parameters remain </exception>
+	  public CollationKeyFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		// IDictionary<K,V>.Remove returns bool in .NET, unlike java.util.Map.remove
+		// which returns the removed value, so the raw-ported
+		// "custom = args.Remove("custom")" assignments did not compile and lost the
+		// configured values. Fetch the value, then remove the key.
+		custom = TakeArg(args, "custom");
+		language = TakeArg(args, "language");
+		country = TakeArg(args, "country");
+		variant = TakeArg(args, "variant");
+		strength = TakeArg(args, "strength");
+		decomposition = TakeArg(args, "decomposition");
+
+		if (custom == null && language == null)
+		{
+		  throw new System.ArgumentException("Either custom or language is required.");
+		}
+
+		if (custom != null && (language != null || country != null || variant != null))
+		{
+		  throw new System.ArgumentException("Cannot specify both language and custom. " + "To tailor rules for a built-in language, see the javadocs for RuleBasedCollator. " + "Then save the entire customized ruleset to a file, and use with the custom parameter");
+		}
+
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  /// <summary>
+	  /// Removes <paramref name="key"/> from <paramref name="args"/> and returns
+	  /// the removed value, or null when the key was absent (the
+	  /// java.util.Map.remove contract the original code relied on).
+	  /// </summary>
+	  private static string TakeArg(IDictionary<string, string> args, string key)
+	  {
+		string value;
+		if (args.TryGetValue(key, out value))
+		{
+		  args.Remove(key);
+		}
+		return value;
+	  }
+
+	  /// <summary>
+	  /// Builds the collator — from the configured locale, or from the custom
+	  /// ruleset resource — then applies the optional strength and decomposition
+	  /// settings. Invoked once the ResourceLoader is available.
+	  /// </summary>
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		if (language != null)
+		{
+		  // create from a system collator, based on Locale.
+		  collator = createFromLocale(language, country, variant);
+		}
+		else
+		{
+		  // create from a custom ruleset
+		  collator = createFromRules(custom, loader);
+		}
+
+		// set the strength flag, otherwise it will be the default.
+		if (strength != null)
+		{
+		  if (strength.Equals("primary", StringComparison.CurrentCultureIgnoreCase))
+		  {
+			collator.Strength = Collator.PRIMARY;
+		  }
+		  else if (strength.Equals("secondary", StringComparison.CurrentCultureIgnoreCase))
+		  {
+			collator.Strength = Collator.SECONDARY;
+		  }
+		  else if (strength.Equals("tertiary", StringComparison.CurrentCultureIgnoreCase))
+		  {
+			collator.Strength = Collator.TERTIARY;
+		  }
+		  else if (strength.Equals("identical", StringComparison.CurrentCultureIgnoreCase))
+		  {
+			collator.Strength = Collator.IDENTICAL;
+		  }
+		  else
+		  {
+			throw new System.ArgumentException("Invalid strength: " + strength);
+		  }
+		}
+
+		// set the decomposition flag, otherwise it will be the default.
+		if (decomposition != null)
+		{
+		  if (decomposition.Equals("no", StringComparison.CurrentCultureIgnoreCase))
+		  {
+			collator.Decomposition = Collator.NO_DECOMPOSITION;
+		  }
+		  else if (decomposition.Equals("canonical", StringComparison.CurrentCultureIgnoreCase))
+		  {
+			collator.Decomposition = Collator.CANONICAL_DECOMPOSITION;
+		  }
+		  else if (decomposition.Equals("full", StringComparison.CurrentCultureIgnoreCase))
+		  {
+			collator.Decomposition = Collator.FULL_DECOMPOSITION;
+		  }
+		  else
+		  {
+			throw new System.ArgumentException("Invalid decomposition: " + decomposition);
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps the input stream in a CollationKeyFilter using the collator built
+	  /// by <seealso cref="inform"/>.
+	  /// </summary>
+	  public override TokenStream create(TokenStream input)
+	  {
+		return new CollationKeyFilter(input, collator);
+	  }
+
+	  /*
+	   * Create a locale from language, with optional country and variant.
+	   * Then return the appropriate collator for the locale.
+	   * NOTE(review): Locale/Collator are raw-ported Java types — confirm the
+	   * CultureInfo-based replacement when the port is cleaned up.
+	   */
+	  private Collator createFromLocale(string language, string country, string variant)
+	  {
+		Locale locale;
+
+		if (language != null && country == null && variant != null)
+		{
+		  throw new System.ArgumentException("To specify variant, country is required");
+		}
+		else if (language != null && country != null && variant != null)
+		{
+		  locale = new Locale(language, country, variant);
+		}
+		else if (language != null && country != null)
+		{
+		  locale = new Locale(language, country);
+		}
+		else
+		{
+		  locale = new Locale(language);
+		}
+
+		return Collator.getInstance(locale);
+	  }
+
+	  /*
+	   * Read custom rules from a file, and create a RuleBasedCollator
+	   * The file cannot support comments, as # might be in the rules!
+	   * NOTE(review): InputStream/ParseException are raw-ported Java types.
+	   */
+	  private Collator createFromRules(string fileName, ResourceLoader loader)
+	  {
+		InputStream input = null;
+		try
+		{
+		 input = loader.openResource(fileName);
+		 string rules = toUTF8String(input);
+		 return new RuleBasedCollator(rules);
+		}
+		catch (ParseException e)
+		{
+		  // invalid rules
+		  throw new IOException("ParseException thrown while parsing rules", e);
+		}
+		finally
+		{
+		  // always release the resource stream, even when rule parsing fails
+		  IOUtils.closeWhileHandlingException(input);
+		}
+	  }
+
+	  /// <summary>
+	  /// This factory is its own multi-term component: collation applies equally
+	  /// to multi-term queries.
+	  /// </summary>
+	  public virtual AbstractAnalysisFactory MultiTermComponent
+	  {
+		  get
+		  {
+			return this;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Decodes the entire stream as UTF-8 text and returns it as a string.
+	  /// </summary>
+	  private string toUTF8String(InputStream @in)
+	  {
+		StringBuilder sb = new StringBuilder();
+		char[] buffer = new char[1024];
+		Reader r = IOUtils.getDecodingReader(@in, StandardCharsets.UTF_8);
+		int len = 0;
+		while ((len = r.read(buffer)) > 0)
+		{
+		  sb.Append(buffer, 0, len);
+		}
+		return sb.ToString();
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Collation/TokenAttributes/CollatedTermAttributeImpl.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Collation/TokenAttributes/CollatedTermAttributeImpl.cs b/src/Lucene.Net.Analysis.Common/Collation/TokenAttributes/CollatedTermAttributeImpl.cs
new file mode 100644
index 0000000..89b57c5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Collation/TokenAttributes/CollatedTermAttributeImpl.cs
@@ -0,0 +1,52 @@
+namespace org.apache.lucene.collation.tokenattributes
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttributeImpl = org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;
+	using BytesRef = org.apache.lucene.util.BytesRef;
+
+	/// <summary>
+	/// Extension of <seealso cref="CharTermAttributeImpl"/> that encodes the term
+	/// text as a binary Unicode collation key instead of as UTF-8 bytes.
+	/// </summary>
+	public class CollatedTermAttributeImpl : CharTermAttributeImpl
+	{
+	  private readonly Collator collator;
+
+	  /// <summary>
+	  /// Create a new CollatedTermAttributeImpl </summary>
+	  /// <param name="collator"> Collation key generator </param>
+	  public CollatedTermAttributeImpl(Collator collator)
+	  {
+		// clone in case JRE doesn't properly sync,
+		// or to reduce contention in case they do
+		this.collator = (Collator) collator.clone();
+	  }
+
+	  /// <summary>
+	  /// Fills the BytesRef with the collation key of the current term text
+	  /// instead of its UTF-8 bytes.
+	  /// </summary>
+	  public override void fillBytesRef()
+	  {
+		BytesRef bytes = BytesRef;
+		bytes.bytes = collator.getCollationKey(ToString()).toByteArray();
+		bytes.offset = 0;
+		// .NET arrays expose the Length property; the raw-ported Java field
+		// access "bytes.bytes.length" does not compile in C#.
+		bytes.length = bytes.bytes.Length;
+	  }
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
new file mode 100644
index 0000000..5ff0050
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj
@@ -0,0 +1,244 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net</RootNamespace>
+    <AssemblyName>Lucene.Net.Analysis.Common</AssemblyName>
+    <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="System" />
+    <Reference Include="System.Core" />
+    <Reference Include="Microsoft.CSharp" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Analysis\CharFilter\BaseCharFilter.cs" />
+    <Compile Include="Analysis\CharFilter\HTMLStripCharFilterFactory.cs" />
+    <Compile Include="Analysis\CharFilter\MappingCharFilter.cs" />
+    <Compile Include="Analysis\CharFilter\MappingCharFilterFactory.cs" />
+    <Compile Include="Analysis\CharFilter\NormalizeCharMap.cs" />
+    <Compile Include="Analysis\CommonGrams\CommonGramsFilter.cs" />
+    <Compile Include="Analysis\CommonGrams\CommonGramsFilterFactory.cs" />
+    <Compile Include="Analysis\CommonGrams\CommonGramsQueryFilter.cs" />
+    <Compile Include="Analysis\CommonGrams\CommonGramsQueryFilterFactory.cs" />
+    <Compile Include="Analysis\Compound\CompoundWordTokenFilterBase.cs" />
+    <Compile Include="Analysis\Compound\DictionaryCompoundWordTokenFilter.cs" />
+    <Compile Include="Analysis\Compound\DictionaryCompoundWordTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Compound\HyphenationCompoundWordTokenFilter.cs" />
+    <Compile Include="Analysis\Compound\HyphenationCompoundWordTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\ByteVector.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\CharVector.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\Hyphen.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\Hyphenation.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\HyphenationTree.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\PatternConsumer.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\PatternParser.cs" />
+    <Compile Include="Analysis\Compound\hyphenation\TernaryTree.cs" />
+    <Compile Include="Analysis\Core\KeywordAnalyzer.cs" />
+    <Compile Include="Analysis\Core\KeywordTokenizer.cs" />
+    <Compile Include="Analysis\Core\KeywordTokenizerFactory.cs" />
+    <Compile Include="Analysis\Core\LetterTokenizer.cs" />
+    <Compile Include="Analysis\Core\LetterTokenizerFactory.cs" />
+    <Compile Include="Analysis\Core\LowerCaseFilter.cs" />
+    <Compile Include="Analysis\Core\LowerCaseFilterFactory.cs" />
+    <Compile Include="Analysis\Core\LowerCaseTokenizer.cs" />
+    <Compile Include="Analysis\Core\LowerCaseTokenizerFactory.cs" />
+    <Compile Include="Analysis\Core\SimpleAnalyzer.cs" />
+    <Compile Include="Analysis\Core\StopAnalyzer.cs" />
+    <Compile Include="Analysis\Core\StopFilter.cs" />
+    <Compile Include="Analysis\Core\StopFilterFactory.cs" />
+    <Compile Include="Analysis\Core\TypeTokenFilter.cs" />
+    <Compile Include="Analysis\Core\TypeTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Core\UpperCaseFilter.cs" />
+    <Compile Include="Analysis\Core\UpperCaseFilterFactory.cs" />
+    <Compile Include="Analysis\Core\WhitespaceAnalyzer.cs" />
+    <Compile Include="Analysis\Core\WhitespaceTokenizer.cs" />
+    <Compile Include="Analysis\Core\WhitespaceTokenizerFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\ASCIIFoldingFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\ASCIIFoldingFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\CapitalizationFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\CapitalizationFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\CodepointCountFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\CodepointCountFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\EmptyTokenStream.cs" />
+    <Compile Include="Analysis\Miscellaneous\HyphenatedWordsFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\HyphenatedWordsFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\KeepWordFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\KeepWordFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\KeywordMarkerFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\KeywordMarkerFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\KeywordRepeatFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\KeywordRepeatFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\LengthFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\LengthFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\LimitTokenCountAnalyzer.cs" />
+    <Compile Include="Analysis\Miscellaneous\LimitTokenCountFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\LimitTokenCountFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\LimitTokenPositionFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\LimitTokenPositionFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\Lucene47WordDelimiterFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\PatternAnalyzer.cs" />
+    <Compile Include="Analysis\Miscellaneous\PatternKeywordMarkerFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\PerFieldAnalyzerWrapper.cs" />
+    <Compile Include="Analysis\Miscellaneous\PrefixAndSuffixAwareTokenFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\PrefixAwareTokenFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\RemoveDuplicatesTokenFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\RemoveDuplicatesTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\ScandinavianFoldingFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\ScandinavianFoldingFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\ScandinavianNormalizationFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\ScandinavianNormalizationFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\SetKeywordMarkerFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\SingleTokenTokenStream.cs" />
+    <Compile Include="Analysis\Miscellaneous\StemmerOverrideFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\StemmerOverrideFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\TrimFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\TrimFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\TruncateTokenFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\TruncateTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\WordDelimiterFilter.cs" />
+    <Compile Include="Analysis\Miscellaneous\WordDelimiterFilterFactory.cs" />
+    <Compile Include="Analysis\Miscellaneous\WordDelimiterIterator.cs" />
+    <Compile Include="Analysis\Ngram\EdgeNGramFilterFactory.cs" />
+    <Compile Include="Analysis\Ngram\EdgeNGramTokenFilter.cs" />
+    <Compile Include="Analysis\Ngram\EdgeNGramTokenizer.cs" />
+    <Compile Include="Analysis\Ngram\EdgeNGramTokenizerFactory.cs" />
+    <Compile Include="Analysis\Ngram\Lucene43EdgeNGramTokenizer.cs" />
+    <Compile Include="Analysis\Ngram\Lucene43NGramTokenizer.cs" />
+    <Compile Include="Analysis\Ngram\NGramFilterFactory.cs" />
+    <Compile Include="Analysis\Ngram\NGramTokenFilter.cs" />
+    <Compile Include="Analysis\Ngram\NGramTokenizer.cs" />
+    <Compile Include="Analysis\Ngram\NGramTokenizerFactory.cs" />
+    <Compile Include="Analysis\Path\PathHierarchyTokenizer.cs" />
+    <Compile Include="Analysis\Path\PathHierarchyTokenizerFactory.cs" />
+    <Compile Include="Analysis\Path\ReversePathHierarchyTokenizer.cs" />
+    <Compile Include="Analysis\Pattern\PatternCaptureGroupFilterFactory.cs" />
+    <Compile Include="Analysis\Pattern\PatternCaptureGroupTokenFilter.cs" />
+    <Compile Include="Analysis\Pattern\PatternReplaceCharFilter.cs" />
+    <Compile Include="Analysis\Pattern\PatternReplaceCharFilterFactory.cs" />
+    <Compile Include="Analysis\Pattern\PatternReplaceFilter.cs" />
+    <Compile Include="Analysis\Pattern\PatternReplaceFilterFactory.cs" />
+    <Compile Include="Analysis\Pattern\PatternTokenizer.cs" />
+    <Compile Include="Analysis\Pattern\PatternTokenizerFactory.cs" />
+    <Compile Include="Analysis\Payloads\AbstractEncoder.cs" />
+    <Compile Include="Analysis\Payloads\DelimitedPayloadTokenFilter.cs" />
+    <Compile Include="Analysis\Payloads\DelimitedPayloadTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Payloads\FloatEncoder.cs" />
+    <Compile Include="Analysis\Payloads\IdentityEncoder.cs" />
+    <Compile Include="Analysis\Payloads\IntegerEncoder.cs" />
+    <Compile Include="Analysis\Payloads\NumericPayloadTokenFilter.cs" />
+    <Compile Include="Analysis\Payloads\NumericPayloadTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Payloads\PayloadEncoder.cs" />
+    <Compile Include="Analysis\Payloads\PayloadHelper.cs" />
+    <Compile Include="Analysis\Payloads\TokenOffsetPayloadTokenFilter.cs" />
+    <Compile Include="Analysis\Payloads\TokenOffsetPayloadTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Payloads\TypeAsPayloadTokenFilter.cs" />
+    <Compile Include="Analysis\Payloads\TypeAsPayloadTokenFilterFactory.cs" />
+    <Compile Include="Analysis\Position\PositionFilter.cs" />
+    <Compile Include="Analysis\Position\PositionFilterFactory.cs" />
+    <Compile Include="Analysis\Query\QueryAutoStopWordAnalyzer.cs" />
+    <Compile Include="Analysis\Reverse\ReverseStringFilter.cs" />
+    <Compile Include="Analysis\Reverse\ReverseStringFilterFactory.cs" />
+    <Compile Include="Analysis\Shingle\ShingleAnalyzerWrapper.cs" />
+    <Compile Include="Analysis\Shingle\ShingleFilter.cs" />
+    <Compile Include="Analysis\Shingle\ShingleFilterFactory.cs" />
+    <Compile Include="Analysis\Sinks\DateRecognizerSinkFilter.cs" />
+    <Compile Include="Analysis\Sinks\TeeSinkTokenFilter.cs" />
+    <Compile Include="Analysis\Sinks\TokenRangeSinkFilter.cs" />
+    <Compile Include="Analysis\Sinks\TokenTypeSinkFilter.cs" />
+    <Compile Include="Analysis\Standard\ClassicAnalyzer.cs" />
+    <Compile Include="Analysis\Standard\ClassicFilter.cs" />
+    <Compile Include="Analysis\Standard\ClassicFilterFactory.cs" />
+    <Compile Include="Analysis\Standard\ClassicTokenizer.cs" />
+    <Compile Include="Analysis\Standard\ClassicTokenizerFactory.cs" />
+    <Compile Include="Analysis\Standard\ClassicTokenizerImpl.cs" />
+    <Compile Include="Analysis\Standard\StandardAnalyzer.cs" />
+    <Compile Include="Analysis\Standard\StandardFilter.cs" />
+    <Compile Include="Analysis\Standard\StandardFilterFactory.cs" />
+    <Compile Include="Analysis\Standard\StandardTokenizer.cs" />
+    <Compile Include="Analysis\Standard\StandardTokenizerFactory.cs" />
+    <Compile Include="Analysis\Standard\StandardTokenizerImpl.cs" />
+    <Compile Include="Analysis\Standard\StandardTokenizerInterface.cs" />
+    <Compile Include="Analysis\Standard\UAX29URLEmailAnalyzer.cs" />
+    <Compile Include="Analysis\Standard\UAX29URLEmailTokenizer.cs" />
+    <Compile Include="Analysis\Standard\UAX29URLEmailTokenizerFactory.cs" />
+    <Compile Include="Analysis\Synonym\FSTSynonymFilterFactory.cs" />
+    <Compile Include="Analysis\Synonym\SlowSynonymFilter.cs" />
+    <Compile Include="Analysis\Synonym\SlowSynonymFilterFactory.cs" />
+    <Compile Include="Analysis\Synonym\SlowSynonymMap.cs" />
+    <Compile Include="Analysis\Synonym\SolrSynonymParser.cs" />
+    <Compile Include="Analysis\Synonym\SynonymFilter.cs" />
+    <Compile Include="Analysis\Synonym\SynonymFilterFactory.cs" />
+    <Compile Include="Analysis\Synonym\SynonymMap.cs" />
+    <Compile Include="Analysis\Synonym\WordnetSynonymParser.cs" />
+    <Compile Include="Analysis\Util\AbstractAnalysisFactory.cs" />
+    <Compile Include="Analysis\Util\AnalysisSPILoader.cs" />
+    <Compile Include="Analysis\Util\CharacterUtils.cs" />
+    <Compile Include="Analysis\Util\CharArrayIterator.cs" />
+    <Compile Include="Analysis\Util\CharArrayMap.cs" />
+    <Compile Include="Analysis\Util\CharArraySet.cs" />
+    <Compile Include="Analysis\Util\CharFilterFactory.cs" />
+    <Compile Include="Analysis\Util\CharTokenizer.cs" />
+    <Compile Include="Analysis\Util\ClasspathResourceLoader.cs" />
+    <Compile Include="Analysis\Util\ElisionFilter.cs" />
+    <Compile Include="Analysis\Util\ElisionFilterFactory.cs" />
+    <Compile Include="Analysis\Util\FilesystemResourceLoader.cs" />
+    <Compile Include="Analysis\Util\FilteringTokenFilter.cs" />
+    <Compile Include="Analysis\Util\MultiTermAwareComponent.cs" />
+    <Compile Include="Analysis\Util\OpenStringBuilder.cs" />
+    <Compile Include="Analysis\Util\ResourceLoader.cs" />
+    <Compile Include="Analysis\Util\ResourceLoaderAware.cs" />
+    <Compile Include="Analysis\Util\RollingCharBuffer.cs" />
+    <Compile Include="Analysis\Util\SegmentingTokenizerBase.cs" />
+    <Compile Include="Analysis\Util\StemmerUtil.cs" />
+    <Compile Include="Analysis\Util\StopwordAnalyzerBase.cs" />
+    <Compile Include="Analysis\Util\TokenFilterFactory.cs" />
+    <Compile Include="Analysis\Util\TokenizerFactory.cs" />
+    <Compile Include="Analysis\Util\WordlistLoader.cs" />
+    <Compile Include="Analysis\Wikipedia\WikipediaTokenizer.cs" />
+    <Compile Include="Analysis\Wikipedia\WikipediaTokenizerFactory.cs" />
+    <Compile Include="Collation\CollationAttributeFactory.cs" />
+    <Compile Include="Collation\CollationKeyAnalyzer.cs" />
+    <Compile Include="Collation\CollationKeyFilter.cs" />
+    <Compile Include="Collation\CollationKeyFilterFactory.cs" />
+    <Compile Include="Collation\TokenAttributes\CollatedTermAttributeImpl.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">
+      <Project>{5d4ad9be-1ffb-41ab-9943-25737971bf57}</Project>
+      <Name>Lucene.Net</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.Common/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..83220d7
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following 
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Analysis.Common")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Lucene.Net.Analysis.Common")]
+[assembly: AssemblyCopyright("Copyright ©  2014")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible 
+// to COM components.  If you need to access a type in this assembly from 
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("f57314a7-e71f-4b3c-860f-564046ca398b")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version 
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers 
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")]
+[assembly: AssemblyFileVersion("1.0.0.0")]

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Core/Analysis/Analyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Analysis/Analyzer.cs b/src/Lucene.Net.Core/Analysis/Analyzer.cs
index bf2fcf1..0ae0c8e 100644
--- a/src/Lucene.Net.Core/Analysis/Analyzer.cs
+++ b/src/Lucene.Net.Core/Analysis/Analyzer.cs
@@ -104,7 +104,7 @@ namespace Lucene.Net.Analysis
         /// <param name="reader">
         ///          the reader passed to the <seealso cref="Tokenizer"/> constructor </param>
         /// <returns> the <seealso cref="TokenStreamComponents"/> for this analyzer. </returns>
-        protected internal abstract TokenStreamComponents CreateComponents(string fieldName, TextReader reader);
+        protected abstract TokenStreamComponents CreateComponents(string fieldName, TextReader reader);
 
         /// <summary>
         /// Returns a TokenStream suitable for <code>fieldName</code>, tokenizing

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Core/Analysis/AnalyzerWrapper.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Analysis/AnalyzerWrapper.cs b/src/Lucene.Net.Core/Analysis/AnalyzerWrapper.cs
index b18e17a..f58467b 100644
--- a/src/Lucene.Net.Core/Analysis/AnalyzerWrapper.cs
+++ b/src/Lucene.Net.Core/Analysis/AnalyzerWrapper.cs
@@ -95,7 +95,7 @@ namespace Lucene.Net.Analysis
             return reader;
         }
 
-        protected internal override sealed TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
+        protected override sealed TokenStreamComponents CreateComponents(string fieldName, TextReader aReader)
         {
             return WrapComponents(fieldName, GetWrappedAnalyzer(fieldName).CreateComponents(fieldName, aReader));
         }

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Core/Analysis/TokenStream.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Analysis/TokenStream.cs b/src/Lucene.Net.Core/Analysis/TokenStream.cs
index 40cb92e..ccaed6b 100644
--- a/src/Lucene.Net.Core/Analysis/TokenStream.cs
+++ b/src/Lucene.Net.Core/Analysis/TokenStream.cs
@@ -1,5 +1,7 @@
 using Lucene.Net.Analysis.Tokenattributes;
 using System;
+using Lucene.Net.Documents;
+using Lucene.Net.Util;
 
 namespace Lucene.Net.Analysis
 {


Mime
View raw message