lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [30/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:34 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs
new file mode 100644
index 0000000..cbbd16a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Compound/hyphenation/TernaryTree.cs
@@ -0,0 +1,780 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.compound.hyphenation
+{
+
+
+	/// <summary>
+	/// <h2>Ternary Search Tree.</h2>
+	/// 
+	/// <para>
+	/// A ternary search tree is a hybrid between a binary tree and a digital search
+	/// tree (trie). Keys are limited to strings. A data value of type char is stored
+	/// in each leaf node. It can be used as an index (or pointer) to the data.
+	/// Branches that only contain one key are compressed to one node by storing a
+	/// pointer to the trailer substring of the key. This class is intended to serve
+	/// as base class or helper class to implement Dictionary collections or the
+	/// like. Ternary trees have some nice properties as the following: the tree can
+	/// be traversed in sorted order, partial matches (wildcard) can be implemented,
+	/// retrieval of all keys within a given distance from the target, etc. The
+	/// storage requirements are higher than a binary tree but a lot less than a
+	/// trie. Performance is comparable with a hash table, sometimes it outperforms a
+	/// hash function (most of the time can determine a miss faster than a hash).
+	/// </para>
+	/// 
+	/// <para>
+	/// The main purpose of this java port is to serve as a base for implementing
+	/// TeX's hyphenation algorithm (see The TeXBook, appendix H). Each language
+	/// requires from 5000 to 15000 hyphenation patterns which will be keys in this
+	/// tree. The strings patterns are usually small (from 2 to 5 characters), but
+	/// each char in the tree is stored in a node. Thus memory usage is the main
+	/// concern. We will sacrifice 'elegance' to keep memory requirements to the
+	/// minimum. Using java's char type as pointer (yes, I know pointer it is a
+	/// forbidden word in java) we can keep the size of the node to be just 8 bytes
+	/// (3 pointers and the data char). This gives room for about 65000 nodes. In my
+	/// tests the english patterns took 7694 nodes and the german patterns 10055
+	/// nodes, so I think we are safe.
+	/// </para>
+	/// 
+	/// <para>
+	/// All said, this is a map with strings as keys and char as value. Pretty
+	/// limited! It can be extended to a general map by using the string
+	/// representation of an object and using the char value as an index to an array
+	/// that contains the object values.
+	/// </para>
+	/// 
+	/// This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
+	/// </summary>
+
+	public class TernaryTree : ICloneable
+	{
+
+	  /// <summary>
+	  /// We use 4 arrays to represent a node. I guess I should have created a proper
+	  /// node class, but somehow Knuth's pascal code made me forget we now have a
+	  /// portable language with virtual memory management and automatic garbage
+	  /// collection! And now is kind of late, furthermore, if it ain't broken, don't
+	  /// fix it.
+	  /// </summary>
+
+	  /// <summary>
+	  /// Pointer to low branch and to rest of the key when it is stored directly in
+	  /// this node, we don't have unions in java!
+	  /// </summary>
+	  protected internal char[] lo;
+
+	  /// <summary>
+	  /// Pointer to high branch.
+	  /// </summary>
+	  protected internal char[] hi;
+
+	  /// <summary>
+	  /// Pointer to equal branch and to data when this node is a string terminator.
+	  /// </summary>
+	  protected internal char[] eq;
+
+	  /// <summary>
+	  /// <P>
+	  /// The character stored in this node: splitchar. Two special values are
+	  /// reserved:
+	  /// </P>
+	  /// <ul>
+	  /// <li>0x0000 as string terminator</li>
+	  /// <li>0xFFFF to indicate that the branch starting at this node is compressed</li>
+	  /// </ul>
+	  /// <para>
+	  /// This shouldn't be a problem if we give the usual semantics to strings since
+	  /// 0xFFFF is guaranteed not to be an Unicode character.
+	  /// </para>
+	  /// </summary>
+	  protected internal char[] sc;
+
+	  /// <summary>
+	  /// This vector holds the trailing of the keys when the branch is compressed.
+	  /// </summary>
+	  protected internal CharVector kv;
+
+	  protected internal char root;
+
+	  protected internal char freenode;
+
+	  protected internal int length; // number of items in tree
+
+	  protected internal const int BLOCK_SIZE = 2048; // allocation size for arrays
+
+	  /// <summary>
+	  /// Creates an empty tree. All storage is set up by <c>init()</c>.
+	  /// </summary>
+	  internal TernaryTree()
+	  {
+		this.init();
+	  }
+
+	  /// <summary>
+	  /// Puts the tree into its empty state: four fresh node arrays of
+	  /// <c>BLOCK_SIZE</c> entries, a new key vector, no root and a key count of
+	  /// zero. Node index 0 means "no node", so the first free node is index 1.
+	  /// </summary>
+	  protected internal virtual void init()
+	  {
+		// storage first ...
+		kv = new CharVector();
+		sc = new char[BLOCK_SIZE];
+		eq = new char[BLOCK_SIZE];
+		hi = new char[BLOCK_SIZE];
+		lo = new char[BLOCK_SIZE];
+
+		// ... then the bookkeeping
+		length = 0;
+		freenode = (char)1;
+		root = (char)0;
+	  }
+
+	  /// <summary>
+	  /// Stores <paramref name="val"/> under <paramref name="key"/>.
+	  /// <para>
+	  /// A new branch starts out compressed: a single node plus the key text kept
+	  /// in the key vector. It is only expanded into one node per character when a
+	  /// later key sharing the same prefix is inserted, which saves a lot of
+	  /// space for long keys.
+	  /// </para>
+	  /// </summary>
+	  /// <param name="key"> the key to insert </param>
+	  /// <param name="val"> the data value associated with the key </param>
+	  public virtual void insert(string key, char val)
+	  {
+		// upper bound on the nodes this insertion may create
+		int nodesNeeded = key.Length + 1;
+		if (freenode + nodesNeeded > eq.Length)
+		{
+		  redimNodeArrays(eq.Length + BLOCK_SIZE);
+		}
+
+		// new arrays are zero filled, so the trailing slot already is the terminator
+		char[] strkey = new char[nodesNeeded];
+		key.CopyTo(0, strkey, 0, key.Length);
+
+		root = insert(root, strkey, 0, val);
+	  }
+
+	  /// <summary>
+	  /// Stores <paramref name="val"/> under the null terminated key that begins
+	  /// at index <paramref name="start"/> of <paramref name="key"/>.
+	  /// </summary>
+	  /// <param name="key"> buffer holding the key; the key ends at the first 0 char
+	  ///          at or after <paramref name="start"/> (or at the end of the buffer) </param>
+	  /// <param name="start"> index of the first character of the key </param>
+	  /// <param name="val"> the data value associated with the key </param>
+	  public virtual void insert(char[] key, int start, char val)
+	  {
+		// Measure the key from 'start'. Measuring from index 0 under-estimates the
+		// number of nodes needed whenever the buffer holds other strings (or a
+		// terminator) before 'start', and the node arrays could then be overrun.
+		int len = strlen(key, start) + 1;
+		if (freenode + len > eq.Length)
+		{
+		  redimNodeArrays(eq.Length + BLOCK_SIZE);
+		}
+		root = insert(root, key, start, val);
+	  }
+
+	  /// <summary>
+	  /// The actual insertion function, recursive version.
+	  /// <para>
+	  /// Inserts the null terminated key starting at <paramref name="start"/> into
+	  /// the subtree rooted at node <paramref name="p"/> and returns the index of
+	  /// the node that is the root of that subtree afterwards (a newly allocated
+	  /// node when <paramref name="p"/> is 0, otherwise <paramref name="p"/> itself).
+	  /// </para>
+	  /// </summary>
+	  /// <param name="p"> index of the subtree root, 0 meaning "no node" </param>
+	  /// <param name="key"> buffer holding the null terminated key </param>
+	  /// <param name="start"> index in <paramref name="key"/> of the first character still to be inserted </param>
+	  /// <param name="val"> data value to store for the key </param>
+	  private char insert(char p, char[] key, int start, char val)
+	  {
+		// number of key characters that remain to be inserted
+		int len = strlen(key, start);
+		if (p == 0)
+		{
+		  // this means there is no branch, this node will start a new branch.
+		  // Instead of doing that, we store the key somewhere else and create
+		  // only one node with a pointer to the key
+		  p = freenode++;
+		  eq[p] = val; // holds data
+		  length++;
+		  hi[p] = (char)0;
+		  if (len > 0)
+		  {
+			sc[p] = (char)0xFFFF; // indicates branch is compressed
+			lo[p] = (char) kv.alloc(len + 1); // use 'lo' to hold pointer to key
+			strcpy(kv.Array, lo[p], key, start);
+		  }
+		  else
+		  {
+			// nothing left of the key: plain terminator node
+			sc[p] = (char)0;
+			lo[p] = (char)0;
+		  }
+		  return p;
+		}
+
+		if (sc[p] == 0xFFFF)
+		{
+		  // branch is compressed: need to decompress
+		  // this will generate garbage in the external key array
+		  // but we can do some garbage collection later
+		  char pp = freenode++;
+		  lo[pp] = lo[p]; // previous pointer to key
+		  eq[pp] = eq[p]; // previous pointer to data
+		  lo[p] = (char)0;
+		  if (len > 0)
+		  {
+			// p becomes a regular node for the first character of the stored key;
+			// pp takes over the remainder of that key and hangs off p's equal branch
+			sc[p] = kv.get(lo[pp]);
+			eq[p] = pp;
+			lo[pp]++;
+			if (kv.get(lo[pp]) == 0)
+			{
+			  // key completely decompressed leaving garbage in key array
+			  lo[pp] = (char)0;
+			  sc[pp] = (char)0;
+			  hi[pp] = (char)0;
+			}
+			else
+			{
+			  // we only got first char of key, rest is still there
+			  sc[pp] = (char)0xFFFF;
+			}
+			// no return here: the new key is now inserted below against the
+			// freshly expanded node p
+		  }
+		  else
+		  {
+			// In this case we can save a node by swapping the new node
+			// with the compressed node
+			sc[pp] = (char)0xFFFF;
+			hi[p] = pp;
+			sc[p] = (char)0;
+			eq[p] = val;
+			length++;
+			return p;
+		  }
+		}
+		// regular node: descend into the low, equal or high branch depending on how
+		// the current key character compares with the node's split character
+		char s = key[start];
+		if (s < sc[p])
+		{
+		  lo[p] = insert(lo[p], key, start, val);
+		}
+		else if (s == sc[p])
+		{
+		  if (s != 0)
+		  {
+			// character matched: continue with the next key character
+			eq[p] = insert(eq[p], key, start + 1, val);
+		  }
+		  else
+		  {
+			// key already in tree, overwrite data
+			eq[p] = val;
+		  }
+		}
+		else
+		{
+		  hi[p] = insert(hi[p], key, start, val);
+		}
+		return p;
+	  }
+
+	  /// <summary>
+	  /// Compares two null terminated character sequences, one starting at
+	  /// <paramref name="startA"/> in <paramref name="a"/> and the other at
+	  /// <paramref name="startB"/> in <paramref name="b"/>.
+	  /// </summary>
+	  /// <returns> 0 when both sequences are equal up to and including their
+	  ///          terminator, otherwise the difference between the first pair of
+	  ///          characters that differ </returns>
+	  public static int strcmp(char[] a, int startA, char[] b, int startB)
+	  {
+		int i = startA;
+		int j = startB;
+		while (a[i] == b[j])
+		{
+		  if (a[i] == 0)
+		  {
+			// both sequences ended at the same position
+			return 0;
+		  }
+		  i++;
+		  j++;
+		}
+		return a[i] - b[j];
+	  }
+
+	  /// <summary>
+	  /// Compares a string with the null terminated character sequence that
+	  /// starts at <paramref name="start"/> in <paramref name="a"/>.
+	  /// </summary>
+	  /// <returns> 0 when both are equal; the difference of the first differing
+	  ///          pair of characters otherwise; when the string is a proper prefix
+	  ///          of the sequence, the negated next character of the sequence </returns>
+	  public static int strcmp(string str, char[] a, int start)
+	  {
+		int pos = start;
+		for (int i = 0; i < str.Length; i++, pos++)
+		{
+		  char left = str[i];
+		  char right = a[pos];
+		  int diff = left - right;
+		  // stop on the first mismatch, or when both sides hold a 0 char
+		  if (diff != 0 || right == 0)
+		  {
+			return diff;
+		  }
+		}
+		// the string is exhausted: equal only if the sequence ends here as well
+		return -a[pos];
+	  }
+
+	  /// <summary>
+	  /// Copies the null terminated sequence starting at <paramref name="si"/> in
+	  /// <paramref name="src"/> to <paramref name="dst"/> starting at
+	  /// <paramref name="di"/>, and writes a terminating 0 char after it.
+	  /// </summary>
+	  public static void strcpy(char[] dst, int di, char[] src, int si)
+	  {
+		int read = si;
+		int write = di;
+		char c;
+		while ((c = src[read]) != 0)
+		{
+		  dst[write] = c;
+		  write++;
+		  read++;
+		}
+		dst[write] = (char)0;
+	  }
+
+	  /// <summary>
+	  /// Returns the number of characters from <paramref name="start"/> up to (not
+	  /// including) the first 0 char, or up to the end of the array when there is
+	  /// no 0 char.
+	  /// </summary>
+	  public static int strlen(char[] a, int start)
+	  {
+		int end = start;
+		while (end < a.Length && a[end] != 0)
+		{
+		  end++;
+		}
+		return end - start;
+	  }
+
+	  /// <summary>
+	  /// Returns the number of characters before the first 0 char in
+	  /// <paramref name="a"/>, or the array length when it holds no 0 char.
+	  /// </summary>
+	  public static int strlen(char[] a)
+	  {
+		int count = 0;
+		while (count < a.Length && a[count] != 0)
+		{
+		  count++;
+		}
+		return count;
+	  }
+
+	  /// <summary>
+	  /// Looks up <paramref name="key"/>.
+	  /// </summary>
+	  /// <returns> the data value stored for the key, or -1 when the key is not in the tree </returns>
+	  public virtual int find(string key)
+	  {
+		// one extra slot: new arrays are zero filled, so it serves as the terminator
+		char[] strkey = new char[key.Length + 1];
+		key.CopyTo(0, strkey, 0, key.Length);
+		return find(strkey, 0);
+	  }
+
+	  /// <summary>
+	  /// Looks up the null terminated key that starts at <paramref name="start"/>
+	  /// in <paramref name="key"/>.
+	  /// </summary>
+	  /// <returns> the data value stored for the key, or -1 when the key is not in the tree </returns>
+	  public virtual int find(char[] key, int start)
+	  {
+		int pos = start;
+		char node = root;
+
+		while (node != 0)
+		{
+		  char split = sc[node];
+		  if (split == 0xFFFF)
+		  {
+			// compressed branch: the rest of the key must match the stored tail
+			if (strcmp(key, pos, kv.Array, lo[node]) != 0)
+			{
+			  return -1;
+			}
+			return eq[node];
+		  }
+
+		  char c = key[pos];
+		  if (c == split)
+		  {
+			if (c == 0)
+			{
+			  // matched the terminator: this node carries the data
+			  return eq[node];
+			}
+			pos++;
+			node = eq[node];
+		  }
+		  else if (c < split)
+		  {
+			node = lo[node];
+		  }
+		  else
+		  {
+			node = hi[node];
+		  }
+		}
+		return -1;
+	  }
+
+	  /// <summary>
+	  /// Tells whether <paramref name="key"/> is present, i.e. whether
+	  /// <c>find(key)</c> returns a non-negative value.
+	  /// </summary>
+	  public virtual bool knows(string key)
+	  {
+		int found = find(key);
+		return found >= 0;
+	  }
+
+	  // Replaces the four node arrays with new arrays of 'newsize' entries,
+	  // carrying over as many leading entries as fit.
+	  private void redimNodeArrays(int newsize)
+	  {
+		// the number of entries kept is derived from 'lo' and applied to all arrays
+		int keep = Math.Min(newsize, lo.Length);
+		lo = resized(lo, newsize, keep);
+		hi = resized(hi, newsize, keep);
+		eq = resized(eq, newsize, keep);
+		sc = resized(sc, newsize, keep);
+	  }
+
+	  // Returns a new array of 'newsize' entries holding the first 'keep' entries of 'source'.
+	  private static char[] resized(char[] source, int newsize, int keep)
+	  {
+		char[] target = new char[newsize];
+		Array.Copy(source, 0, target, 0, keep);
+		return target;
+	  }
+
+	  /// <summary>
+	  /// Returns the number of items in the tree.
+	  /// </summary>
+	  public virtual int size()
+	  {
+		return this.length;
+	  }
+
+	  /// <summary>
+	  /// Creates a copy of this tree. The node arrays and the key vector are
+	  /// copied, so the copy does not share them with this instance.
+	  /// </summary>
+	  /// <returns> the new tree </returns>
+	  public virtual TernaryTree clone()
+	  {
+		// There is no base 'clone' member to override, hence 'virtual'. Array.Clone()
+		// returns object and has to be cast back to char[].
+		TernaryTree t = new TernaryTree();
+		t.lo = (char[]) this.lo.Clone();
+		t.hi = (char[]) this.hi.Clone();
+		t.eq = (char[]) this.eq.Clone();
+		t.sc = (char[]) this.sc.Clone();
+		t.kv = this.kv.clone();
+		t.root = this.root;
+		t.freenode = this.freenode;
+		t.length = this.length;
+
+		return t;
+	  }
+
+	  /// <summary>
+	  /// <see cref="ICloneable"/> implementation; delegates to <c>clone()</c>.
+	  /// </summary>
+	  object ICloneable.Clone()
+	  {
+		return clone();
+	  }
+
+	  /// <summary>
+	  /// Inserts the <paramref name="n"/> keys starting at <paramref name="offset"/>
+	  /// so that the resulting tree is balanced: the median key goes in first,
+	  /// followed recursively by the lower half and then the upper half. The keys
+	  /// are assumed to be sorted in ascending order.
+	  /// </summary>
+	  /// <param name="k"> the keys </param>
+	  /// <param name="v"> the values, parallel to <paramref name="k"/> </param>
+	  /// <param name="offset"> index of the first key of the range </param>
+	  /// <param name="n"> number of keys in the range </param>
+	  protected internal virtual void insertBalanced(string[] k, char[] v, int offset, int n)
+	  {
+		if (n < 1)
+		{
+		  return;
+		}
+
+		int half = n >> 1;
+		int median = half + offset;
+
+		insert(k[median], v[median]);
+		insertBalanced(k, v, offset, half); // lower half
+		insertBalanced(k, v, median + 1, n - half - 1); // upper half
+	  }
+
+	  /// <summary>
+	  /// Balances the tree for best search performance: all keys and values are
+	  /// read out through an <c>Iterator</c>, the tree is re-initialized, and the
+	  /// entries are put back with <c>insertBalanced</c>.
+	  /// </summary>
+	  public virtual void balance()
+	  {
+		int n = length;
+		string[] allKeys = new string[n];
+		char[] allValues = new char[n];
+
+		Iterator iter = new Iterator(this);
+		for (int i = 0; iter.hasMoreElements(); i++)
+		{
+		  // the value has to be read before nextElement() moves the iterator on
+		  allValues[i] = iter.Value;
+		  allKeys[i] = iter.nextElement();
+		}
+
+		init();
+		insertBalanced(allKeys, allValues, 0, n);
+
+		// With uniform letter distribution sc[root] should now be around 'm'
+	  }
+
+	  /// <summary>
+	  /// Balances the tree, shrinks the node arrays to the number of nodes in use
+	  /// and rebuilds the key vector so that it only holds key tails that are
+	  /// still referenced.
+	  /// <para>
+	  /// In a compressed branch (one that holds a single key) the part of the key
+	  /// that is not stored in nodes lives in the kv array. As branches get
+	  /// decompressed those tails shrink or become unused altogether. The tree is
+	  /// walked to collect the tails actually in use, and duplicates are stored
+	  /// only once with the help of a map (implemented with a TernaryTree!).
+	  /// </para>
+	  /// </summary>
+	  public virtual void trimToSize()
+	  {
+		// best search performance first
+		balance();
+
+		// drop the unused end of the node arrays
+		redimNodeArrays(freenode);
+
+		// rebuild the key vector; slot 0 is reserved up front
+		CharVector compacted = new CharVector();
+		compacted.alloc(1);
+		TernaryTree seen = new TernaryTree();
+		compact(compacted, seen, root);
+
+		kv = compacted;
+		kv.trimToSize();
+	  }
+
+	  // Walks the subtree rooted at 'p' and moves the key tail of every compressed
+	  // node from 'kv' into 'kx'. 'map' records the tails already copied (tail ->
+	  // offset in 'kx') so that identical tails share one copy.
+	  private void compact(CharVector kx, TernaryTree map, char p)
+	  {
+		if (p == 0)
+		{
+		  return;
+		}
+
+		if (sc[p] != 0xFFFF)
+		{
+		  // regular node: visit the branches; a terminator node keeps data, not
+		  // a child, in 'eq'
+		  compact(kx, map, lo[p]);
+		  if (sc[p] != 0)
+		  {
+			compact(kx, map, eq[p]);
+		  }
+		  compact(kx, map, hi[p]);
+		  return;
+		}
+
+		// compressed node: 'lo' points at the key tail inside 'kv'
+		int tail = lo[p];
+		int k = map.find(kv.Array, tail);
+		if (k < 0)
+		{
+		  // first time this tail is seen: copy it over and remember where it went
+		  k = kx.alloc(strlen(kv.Array, tail) + 1);
+		  strcpy(kx.Array, k, kv.Array, tail);
+		  map.insert(kx.Array, k, (char) k);
+		}
+		lo[p] = (char) k;
+	  }
+
+	  /// <summary>
+	  /// Returns a new <c>Iterator</c> over the keys of this tree.
+	  /// </summary>
+	  public virtual IEnumerator<string> keys()
+	  {
+		Iterator iterator = new Iterator(this);
+		return iterator;
+	  }
+
+	  /// <summary>
+	  /// Walks the keys of a <see cref="TernaryTree"/>, visiting for every node its
+	  /// low branch first, then its equal branch, then its high branch.
+	  /// <para>
+	  /// Two ways of iterating are offered. The original enumeration style API
+	  /// (<c>hasMoreElements()</c>, <c>nextElement()</c>, <c>Value</c>, <c>rewind()</c>)
+	  /// is positioned ON the upcoming element: <c>Value</c> is the data of the key
+	  /// the next <c>nextElement()</c> call returns. The <see cref="IEnumerator{T}"/>
+	  /// members are layered on top of it; after <c>MoveNext()</c> the element is
+	  /// available through <c>Current</c>, while <c>Value</c> already refers to the
+	  /// element after it.
+	  /// </para>
+	  /// </summary>
+	  public class Iterator : IEnumerator<string>
+	  {
+		private readonly TernaryTree outerInstance;
+
+		/// <summary>
+		/// current node index, -1 once the iteration is exhausted
+		/// </summary>
+		internal int cur;
+
+		/// <summary>
+		/// current key, i.e. the key the next nextElement() call returns
+		/// </summary>
+		internal string curkey;
+
+		/// <summary>
+		/// element most recently produced by MoveNext()
+		/// </summary>
+		private string current;
+
+		/// <summary>
+		/// Entry of the node stack: a node together with the number of the branch
+		/// of that node that is being visited (0 = low, 1 = equal, 2 = high).
+		/// Declared internal because the internal field <c>ns</c> exposes it.
+		/// </summary>
+		internal class Item : ICloneable
+		{
+		  internal char parent;
+
+		  internal char child;
+
+		  public Item()
+		  {
+			parent = (char)0;
+			child = (char)0;
+		  }
+
+		  public Item(char p, char c)
+		  {
+			parent = p;
+			child = c;
+		  }
+
+		  public Item clone()
+		  {
+			return new Item(parent, child);
+		  }
+
+		  object ICloneable.Clone()
+		  {
+			return clone();
+		  }
+		}
+
+		/// <summary>
+		/// Node stack
+		/// </summary>
+		internal Stack<Item> ns;
+
+		/// <summary>
+		/// key stack implemented with a StringBuilder
+		/// </summary>
+		internal StringBuilder ks;
+
+		/// <summary>
+		/// Creates an iterator over <paramref name="outerInstance"/> positioned on
+		/// its first key.
+		/// </summary>
+		public Iterator(TernaryTree outerInstance)
+		{
+		  this.outerInstance = outerInstance;
+		  cur = -1;
+		  ns = new Stack<Item>();
+		  ks = new StringBuilder();
+		  rewind();
+		}
+
+		/// <summary>
+		/// Restarts the iteration from the root of the tree.
+		/// </summary>
+		public virtual void rewind()
+		{
+		  ns.Clear();
+		  ks.Length = 0;
+		  cur = outerInstance.root;
+		  run();
+		}
+
+		/// <summary>
+		/// Returns the current key and moves on to the next one.
+		/// </summary>
+		public virtual string nextElement()
+		{
+		  string res = curkey;
+		  cur = up();
+		  run();
+		  return res;
+		}
+
+		/// <summary>
+		/// The data value of the key the next nextElement() call returns, or a 0
+		/// char when the iteration is exhausted.
+		/// </summary>
+		public virtual char Value
+		{
+			get
+			{
+			  if (cur >= 0)
+			  {
+				return outerInstance.eq[cur];
+			  }
+			  return (char)0;
+			}
+		}
+
+		/// <summary>
+		/// Tells whether nextElement() has another key to return.
+		/// </summary>
+		public virtual bool hasMoreElements()
+		{
+		  return (cur != -1);
+		}
+
+		/// <summary>
+		/// The key produced by the last successful MoveNext() call.
+		/// </summary>
+		public string Current
+		{
+			get
+			{
+			  return current;
+			}
+		}
+
+		object System.Collections.IEnumerator.Current
+		{
+			get
+			{
+			  return current;
+			}
+		}
+
+		/// <summary>
+		/// Advances to the next key; returns false when there is none left.
+		/// </summary>
+		public bool MoveNext()
+		{
+		  if (!hasMoreElements())
+		  {
+			return false;
+		  }
+		  current = nextElement();
+		  return true;
+		}
+
+		/// <summary>
+		/// Restarts the iteration; MoveNext() has to be called again before
+		/// Current is meaningful.
+		/// </summary>
+		public void Reset()
+		{
+		  rewind();
+		  current = null;
+		}
+
+		/// <summary>
+		/// Nothing to release.
+		/// </summary>
+		public void Dispose()
+		{
+		}
+
+		/// <summary>
+		/// traverse upwards
+		/// </summary>
+		/// <returns> the index of the node to continue from (0 for an empty
+		///          branch), or -1 when the node stack is exhausted </returns>
+		internal virtual int up()
+		{
+		  int res = 0;
+
+		  if (ns.Count == 0)
+		  {
+			return -1;
+		  }
+
+		  if (cur != 0 && outerInstance.sc[cur] == 0)
+		  {
+			return outerInstance.lo[cur];
+		  }
+
+		  bool climb = true;
+
+		  while (climb)
+		  {
+			Item i = ns.Pop();
+			i.child++;
+			// 'child' is a char, so the case labels need explicit char constants
+			switch (i.child)
+			{
+			  case (char)1:
+				if (outerInstance.sc[i.parent] != 0)
+				{
+				  // enter the equal branch; its split char becomes part of the key
+				  res = outerInstance.eq[i.parent];
+				  ns.Push(i.clone());
+				  ks.Append(outerInstance.sc[i.parent]);
+				}
+				else
+				{
+				  // terminator node has no equal branch: go straight to high
+				  i.child++;
+				  ns.Push(i.clone());
+				  res = outerInstance.hi[i.parent];
+				}
+				climb = false;
+				break;
+
+			  case (char)2:
+				res = outerInstance.hi[i.parent];
+				ns.Push(i.clone());
+				if (ks.Length > 0)
+				{
+				  ks.Length = ks.Length - 1; // pop
+				}
+				climb = false;
+				break;
+
+			  default:
+				// all branches of this node are done: keep climbing
+				if (ns.Count == 0)
+				{
+				  return -1;
+				}
+				climb = true;
+				break;
+			}
+		  }
+		  return res;
+		}
+
+		/// <summary>
+		/// traverse the tree to find next key
+		/// </summary>
+		/// <returns> 0 when a key was found (it is then in <c>curkey</c>), -1 when
+		///          there is no further key </returns>
+		internal virtual int run()
+		{
+		  if (cur == -1)
+		  {
+			return -1;
+		  }
+
+		  bool leaf = false;
+		  while (true)
+		  {
+			// first go down on low branch until leaf or compressed branch
+			while (cur != 0)
+			{
+			  if (outerInstance.sc[cur] == 0xFFFF)
+			  {
+				leaf = true;
+				break;
+			  }
+			  ns.Push(new Item((char) cur, '\u0000'));
+			  if (outerInstance.sc[cur] == 0)
+			  {
+				leaf = true;
+				break;
+			  }
+			  cur = outerInstance.lo[cur];
+			}
+			if (leaf)
+			{
+			  break;
+			}
+			// nothing found, go up one node and try again
+			cur = up();
+			if (cur == -1)
+			{
+			  return -1;
+			}
+		  }
+		  // The current node should be a data node and
+		  // the key should be in the key stack (at least partially)
+		  StringBuilder buf = new StringBuilder(ks.ToString());
+		  if (outerInstance.sc[cur] == 0xFFFF)
+		  {
+			// compressed branch: append the tail kept in the key vector
+			int p = outerInstance.lo[cur];
+			while (outerInstance.kv.get(p) != 0)
+			{
+			  buf.Append(outerInstance.kv.get(p++));
+			}
+		  }
+		  curkey = buf.ToString();
+		  return 0;
+		}
+
+	  }
+
+	  /// <summary>
+	  /// Writes the number of keys, the node count and the length of the key
+	  /// array to <paramref name="out"/>, one per line.
+	  /// </summary>
+	  /// <param name="out"> destination of the statistics </param>
+	  public virtual void printStats(PrintStream @out)
+	  {
+		// NOTE(review): PrintStream is a Java type that is not declared in this file;
+		// confirm the project provides it.
+		@out.println("Number of keys = " + Convert.ToString(length));
+		// freenode is a char: without the cast Convert.ToString would yield the
+		// character with that code instead of the number
+		@out.println("Node count = " + Convert.ToString((int) freenode));
+		// System.out.println("Array length = " + Integer.toString(eq.length));
+		@out.println("Key Array length = " + Convert.ToString(kv.length()));
+
+		/*
+		 * for(int i=0; i<kv.length(); i++) if ( kv.get(i) != 0 )
+		 * System.out.print(kv.get(i)); else System.out.println("");
+		 * System.out.println("Keys:"); for(Enumeration enum = keys();
+		 * enum.hasMoreElements(); ) System.out.println(enum.nextElement());
+		 */
+
+	  }
+	/*
+	  public static void main(String[] args) {
+	    TernaryTree tt = new TernaryTree();
+	    tt.insert("Carlos", 'C');
+	    tt.insert("Car", 'r');
+	    tt.insert("palos", 'l');
+	    tt.insert("pa", 'p');
+	    tt.trimToSize();
+	    System.out.println((char) tt.find("Car"));
+	    System.out.println((char) tt.find("Carlos"));
+	    System.out.println((char) tt.find("alto"));
+	    tt.printStats(System.out);
+	  }
+	  */
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordAnalyzer.cs
new file mode 100644
index 0000000..1f1a42b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordAnalyzer.cs
@@ -0,0 +1,40 @@
+using System.IO;
+using org.apache.lucene.analysis.core;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+
+    /// <summary>
+    /// Analyzer that "tokenizes" the entire stream as a single token. Useful for
+    /// data such as zip codes, ids, and some product names.
+    /// </summary>
+    public sealed class KeywordAnalyzer : Analyzer
+    {
+        /// <summary>
+        /// Creates a new KeywordAnalyzer.
+        /// </summary>
+        public KeywordAnalyzer()
+            : base()
+        {
+        }
+
+        /// <summary>
+        /// Builds the analysis chain: a single <see cref="KeywordTokenizer"/> reading
+        /// from <paramref name="reader"/>.
+        /// </summary>
+        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+        {
+            KeywordTokenizer tokenizer = new KeywordTokenizer(reader);
+            return new TokenStreamComponents(tokenizer);
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizer.cs
new file mode 100644
index 0000000..6d2cbde
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizer.cs
@@ -0,0 +1,106 @@
+using System.IO;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using Reader = System.IO.TextReader;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Emits the entire input as a single token.
+    /// </summary>
+    public sealed class KeywordTokenizer : Tokenizer
+    {
+        /// <summary>
+        /// Default read buffer size </summary>
+        public const int DEFAULT_BUFFER_SIZE = 256;
+
+        // true once the single token has been produced
+        private bool done = false;
+        // corrected end offset of the token, reported again by End()
+        private int finalOffset;
+        // NOTE(review): addAttribute is not declared in this file. If it is an instance
+        // member of the base class, C# rejects its use in a field initializer (CS0236)
+        // and these two calls have to move into the constructors - confirm.
+        private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+        private OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+
+        /// <summary>
+        /// Creates a tokenizer reading from <paramref name="input"/> with the
+        /// default buffer size.
+        /// </summary>
+        public KeywordTokenizer(TextReader input)
+            : this(input, DEFAULT_BUFFER_SIZE)
+        {
+        }
+
+        /// <summary>
+        /// Creates a tokenizer reading from <paramref name="input"/>.
+        /// </summary>
+        /// <param name="input"> the text to emit as one token </param>
+        /// <param name="bufferSize"> initial size of the term buffer, must be &gt; 0 </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="bufferSize"/> is not positive </exception>
+        public KeywordTokenizer(TextReader input, int bufferSize)
+            : base(input)
+        {
+            if (bufferSize <= 0)
+            {
+                throw new System.ArgumentException("bufferSize must be > 0");
+            }
+            termAtt.ResizeBuffer(bufferSize);
+        }
+
+        /// <summary>
+        /// Creates a tokenizer reading from <paramref name="input"/> that uses the
+        /// given attribute factory.
+        /// </summary>
+        /// <param name="factory"> the attribute factory passed on to the base class </param>
+        /// <param name="input"> the text to emit as one token </param>
+        /// <param name="bufferSize"> initial size of the term buffer, must be &gt; 0 </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="bufferSize"/> is not positive </exception>
+        public KeywordTokenizer(AttributeSource.AttributeFactory factory, Reader input, int bufferSize)
+            : base(factory, input)
+        {
+            if (bufferSize <= 0)
+            {
+                throw new System.ArgumentException("bufferSize must be > 0");
+            }
+            termAtt.ResizeBuffer(bufferSize);
+        }
+
+        /// <summary>
+        /// On the first call reads the whole input into the term attribute and
+        /// returns true; every later call (until <see cref="Reset"/>) returns false.
+        /// </summary>
+        public override bool IncrementToken()
+        {
+            if (!done)
+            {
+                ClearAttributes();
+                done = true;
+                int upto = 0;
+                char[] buffer = termAtt.Buffer();
+                while (true)
+                {
+                    int length = input.Read(buffer, upto, buffer.Length - upto);
+                    // TextReader.Read returns 0 at the end of the input (Java's Reader
+                    // returns -1). The buffer is grown as soon as it is full, so the
+                    // requested count is never 0 and a 0 result always means "end".
+                    if (length <= 0)
+                    {
+                        break;
+                    }
+                    upto += length;
+                    if (upto == buffer.Length)
+                    {
+                        buffer = termAtt.ResizeBuffer(1 + buffer.Length);
+                    }
+                }
+                termAtt.Length = upto;
+                finalOffset = CorrectOffset(upto);
+                offsetAtt.SetOffset(CorrectOffset(0), finalOffset);
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// Sets both offsets to the final offset computed by <see cref="IncrementToken"/>.
+        /// </summary>
+        public override void End()
+        {
+            base.End();
+            // set final offset 
+            offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        /// <summary>
+        /// Makes the tokenizer ready to emit a token again.
+        /// </summary>
+        public override void Reset()
+        {
+            base.Reset();
+            this.done = false;
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizerFactory.cs
new file mode 100644
index 0000000..8c3929f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/KeywordTokenizerFactory.cs
@@ -0,0 +1,53 @@
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// Factory for <seealso cref="KeywordTokenizer"/>. 
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.KeywordTokenizerFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre> 
+	/// </summary>
+	public class KeywordTokenizerFactory : TokenizerFactory
+	{
+
+	  /// <summary>
+	  /// Creates a new KeywordTokenizerFactory </summary>
+	  /// <param name="args"> the factory arguments; any entry still present after the
+	  ///          base constructor has run is reported as unknown </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty
+	  ///          after the base constructor has run </exception>
+	  public KeywordTokenizerFactory(IDictionary<string, string> args) : base(args)
+	  {
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + Describe(args));
+		}
+	  }
+
+	  /// <summary>
+	  /// Creates a <see cref="KeywordTokenizer"/> with the default buffer size.
+	  /// </summary>
+	  public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+	  {
+		return new KeywordTokenizer(factory, input, KeywordTokenizer.DEFAULT_BUFFER_SIZE);
+	  }
+
+	  // Renders the entries as "{key=value, key=value}". Concatenating the dictionary
+	  // itself would only print its type name.
+	  private static string Describe(IDictionary<string, string> args)
+	  {
+		List<string> pairs = new List<string>(args.Count);
+		foreach (KeyValuePair<string, string> entry in args)
+		{
+		  pairs.Add(entry.Key + "=" + entry.Value);
+		}
+		return "{" + string.Join(", ", pairs.ToArray()) + "}";
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
new file mode 100644
index 0000000..3a85d5d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
@@ -0,0 +1,84 @@
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// A LetterTokenizer is a tokenizer that divides text at non-letters. That's to
+	/// say, it defines tokens as maximal strings of adjacent letters, as defined by
+	/// the <c>char.IsLetter</c> predicate.
+	/// <para>
+	/// Note: this does a decent job for most European languages, but does a terrible
+	/// job for some Asian languages, where words are not separated by spaces.
+	/// </para>
+	/// <para>
+	/// You must specify the required <c>Version</c> compatibility when creating
+	/// <see cref="LetterTokenizer"/>:
+	/// <ul>
+	/// <li>As of 3.1, <c>CharTokenizer</c> uses an int based API to normalize and
+	/// detect token characters. See <c>CharTokenizer.isTokenChar(int)</c> and
+	/// <c>CharTokenizer.normalize(int)</c> for details.</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+
+	public class LetterTokenizer : CharTokenizer
+	{
+
+	  /// <summary>
+	  /// Construct a new LetterTokenizer.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version to match, see the version note in the class summary </param>
+	  /// <param name="in">
+	  ///          the input to split up into tokens </param>
+	  public LetterTokenizer(Version matchVersion, TextReader @in) : base(matchVersion, @in)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Construct a new LetterTokenizer using a given
+	  /// <c>AttributeSource.AttributeFactory</c>.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version to match, see the version note in the class summary </param>
+	  /// <param name="factory">
+	  ///          the attribute factory to use for this tokenizer </param>
+	  /// <param name="in">
+	  ///          the input to split up into tokens </param>
+	  public LetterTokenizer(Version matchVersion, AttributeSource.AttributeFactory factory, TextReader @in) : base(matchVersion, factory, @in)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Collects only code points that are letters according to
+	  /// <c>char.IsLetter</c>.
+	  /// </summary>
+	  /// <param name="c"> the Unicode code point to test </param>
+	  /// <returns> true if <paramref name="c"/> is a letter; false otherwise,
+	  ///          including for values outside the Unicode range </returns>
+	  protected internal override bool isTokenChar(int c)
+	  {
+		if (c < 0 || c > 0x10FFFF)
+		{
+		  // not a code point at all
+		  return false;
+		}
+		if (c <= char.MaxValue)
+		{
+		  // fits in one UTF-16 unit (a lone surrogate is not a letter)
+		  return char.IsLetter((char) c);
+		}
+		// supplementary code point: test it through its surrogate pair
+		return char.IsLetter(char.ConvertFromUtf32(c), 0);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizerFactory.cs
new file mode 100644
index 0000000..8909bb3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizerFactory.cs
@@ -0,0 +1,54 @@
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Factory for <seealso cref="LetterTokenizer"/>. 
+    /// <pre class="prettyprint">
+    /// &lt;fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.LetterTokenizerFactory"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</pre>
+    /// </summary>
+    public class LetterTokenizerFactory : TokenizerFactory
+    {
+
+        /// <summary>
+        /// Creates a new LetterTokenizerFactory. </summary>
+        /// <param name="args"> factory arguments; the constructor throws if any entries remain
+        /// in it after the base constructor and <c>assureMatchVersion()</c> have run </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty at that point </exception>
+        public LetterTokenizerFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            assureMatchVersion();
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name in .NET,
+                // so the leftover keys are listed explicitly.
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Creates a <see cref="LetterTokenizer"/> for the given input. </summary>
+        /// <param name="factory"> attribute factory passed through to the tokenizer </param>
+        /// <param name="input"> the text to tokenize </param>
+        /// <returns> a new <see cref="LetterTokenizer"/> built with this factory's <c>luceneMatchVersion</c> </returns>
+        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+        {
+            return new LetterTokenizer(luceneMatchVersion, factory, input);
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilter.cs
new file mode 100644
index 0000000..097bc4b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilter.cs
@@ -0,0 +1,62 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// Normalizes token text to lower case.
+	/// <a name="version"/>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating LowerCaseFilter:
+	/// <ul>
+	///   <li> As of 3.1, supplementary characters are properly lowercased.</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public sealed class LowerCaseFilter : TokenFilter
+	{
+	  private readonly CharacterUtils charUtils;
+	  private readonly CharTermAttribute termAtt;
+
+	  /// <summary>
+	  /// Create a new LowerCaseFilter, that normalizes token text to lower case.
+	  /// </summary>
+	  /// <param name="matchVersion"> See the version notes above; used to select the <c>CharacterUtils</c> instance </param>
+	  /// <param name="in"> TokenStream to filter </param>
+	  public LowerCaseFilter(Version matchVersion, TokenStream @in) : base(@in)
+	  {
+		// Assigned here rather than at the declaration: a C# field initializer
+		// cannot call an instance method.
+		termAtt = addAttribute(typeof(CharTermAttribute));
+		charUtils = CharacterUtils.getInstance(matchVersion);
+	  }
+
+	  /// <summary>
+	  /// Advances the wrapped stream by one token and passes the first <c>Length</c>
+	  /// chars of that token's term buffer to <c>CharacterUtils.ToLower</c>.
+	  /// </summary>
+	  /// <returns> <c>true</c> if the wrapped stream produced a token, <c>false</c> otherwise </returns>
+	  public override bool IncrementToken()
+	  {
+		// Same casing as the IncrementToken member this class overrides.
+		if (!input.IncrementToken())
+		{
+		  return false;
+		}
+
+		charUtils.ToLower(termAtt.Buffer(), 0, termAtt.Length);
+		return true;
+	  }
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilterFactory.cs
new file mode 100644
index 0000000..c2efbd1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseFilterFactory.cs
@@ -0,0 +1,62 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Factory for <seealso cref="LowerCaseFilter"/>. 
+    /// <pre class="prettyprint">
+    /// &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</pre>
+    /// </summary>
+    public class LowerCaseFilterFactory : TokenFilterFactory, MultiTermAwareComponent
+    {
+
+        /// <summary>
+        /// Creates a new LowerCaseFilterFactory. </summary>
+        /// <param name="args"> factory arguments; the constructor throws if any entries remain
+        /// in it after the base constructor and <c>assureMatchVersion()</c> have run </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty at that point </exception>
+        public LowerCaseFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            assureMatchVersion();
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name in .NET,
+                // so the leftover keys are listed explicitly.
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <see cref="LowerCaseFilter"/>. </summary>
+        /// <param name="input"> the token stream to filter </param>
+        /// <returns> a new <see cref="LowerCaseFilter"/> built with this factory's <c>luceneMatchVersion</c> </returns>
+        public override TokenStream Create(TokenStream input)
+        {
+            return new LowerCaseFilter(luceneMatchVersion, input);
+        }
+
+        /// <summary>
+        /// The multi-term component for this factory: the factory itself. </summary>
+        public virtual AbstractAnalysisFactory MultiTermComponent
+        {
+            get
+            {
+                return this;
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
new file mode 100644
index 0000000..659f9f3
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizer.cs
@@ -0,0 +1,84 @@
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// LowerCaseTokenizer performs the function of LetterTokenizer
+	/// and LowerCaseFilter together.  It divides text at non-letters and converts
+	/// them to lower case.  While it is functionally equivalent to the combination
+	/// of LetterTokenizer and LowerCaseFilter, there is a performance advantage
+	/// to doing the two tasks at once, hence this (redundant) implementation.
+	/// <p>
+	/// Note: this does a decent job for most European languages, but does a terrible
+	/// job for some Asian languages, where words are not separated by spaces.
+	/// </p>
+	/// <para>
+	/// <a name="version"/>
+	/// You must specify the required <seealso cref="Version"/> compatibility when creating
+	/// <seealso cref="LowerCaseTokenizer"/>:
+	/// <ul>
+	/// <li>As of 3.1, <seealso cref="CharTokenizer"/> uses an int based API to normalize and
+	/// detect token characters. See <seealso cref="CharTokenizer#isTokenChar(int)"/> and
+	/// <seealso cref="CharTokenizer#normalize(int)"/> for details.</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public sealed class LowerCaseTokenizer : LetterTokenizer
+	{
+
+	  /// <summary>
+	  /// Construct a new LowerCaseTokenizer.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version to match. See the version notes above.
+	  /// </param>
+	  /// <param name="in">
+	  ///          the input to split up into tokens </param>
+	  public LowerCaseTokenizer(Version matchVersion, TextReader @in) : base(matchVersion, @in)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Construct a new LowerCaseTokenizer using a given attribute factory.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version to match. See the version notes above. </param>
+	  /// <param name="factory">
+	  ///          the attribute factory to use for this <see cref="Tokenizer"/> </param>
+	  /// <param name="in">
+	  ///          the input to split up into tokens </param>
+	  public LowerCaseTokenizer(Version matchVersion, AttributeFactory factory, TextReader @in) : base(matchVersion, factory, @in)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Converts a code point to lower case (the counterpart of Java's
+	  /// <c>Character.toLowerCase(int)</c>), using culture-invariant casing rules.
+	  /// </summary>
+	  /// <param name="c"> the Unicode code point to convert </param>
+	  /// <returns> the lower-case code point; <paramref name="c"/> unchanged when it is
+	  ///          outside the Unicode range </returns>
+	  protected override int Normalize(int c)
+	  {
+		// Values outside U+0000..U+10FFFF are not code points: hand them back untouched.
+		if (c < 0 || c > 0x10FFFF)
+		{
+		  return c;
+		}
+
+		// char.ToLower has no int overload. The invariant variant keeps the result
+		// independent of the current thread culture.
+		if (c <= char.MaxValue)
+		{
+		  return char.ToLowerInvariant((char)c);
+		}
+
+		// Supplementary code points: lower-case the surrogate pair, then decode it again.
+		return char.ConvertToUtf32(char.ConvertFromUtf32(c).ToLowerInvariant(), 0);
+	  }
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizerFactory.cs
new file mode 100644
index 0000000..3d9b2e2
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LowerCaseTokenizerFactory.cs
@@ -0,0 +1,63 @@
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Factory for <seealso cref="LowerCaseTokenizer"/>. 
+    /// <pre class="prettyprint">
+    /// &lt;fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.LowerCaseTokenizerFactory"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</pre>
+    /// </summary>
+    public class LowerCaseTokenizerFactory : TokenizerFactory, MultiTermAwareComponent
+    {
+
+        /// <summary>
+        /// Creates a new LowerCaseTokenizerFactory. </summary>
+        /// <param name="args"> factory arguments; the constructor throws if any entries remain
+        /// in it after the base constructor and <c>assureMatchVersion()</c> have run </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty at that point </exception>
+        public LowerCaseTokenizerFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            assureMatchVersion();
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would only print its type name in .NET,
+                // so the leftover keys are listed explicitly.
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Creates a <see cref="LowerCaseTokenizer"/> for the given input. </summary>
+        /// <param name="factory"> attribute factory passed through to the tokenizer </param>
+        /// <param name="input"> the text to tokenize </param>
+        /// <returns> a new <see cref="LowerCaseTokenizer"/> built with this factory's <c>luceneMatchVersion</c> </returns>
+        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+        {
+            return new LowerCaseTokenizer(luceneMatchVersion, factory, input);
+        }
+
+        /// <summary>
+        /// The multi-term component for this factory: a new <see cref="LowerCaseFilterFactory"/>
+        /// constructed from a copy of <c>OriginalArgs</c>. </summary>
+        public virtual AbstractAnalysisFactory MultiTermComponent
+        {
+            get
+            {
+                // C# has no diamond operator: the type arguments must be spelled out.
+                return new LowerCaseFilterFactory(new Dictionary<string, string>(OriginalArgs));
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/SimpleAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/SimpleAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/SimpleAnalyzer.cs
new file mode 100644
index 0000000..56c9133
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/SimpleAnalyzer.cs
@@ -0,0 +1,58 @@
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// An <seealso cref="Analyzer"/> that filters <seealso cref="LetterTokenizer"/> 
+    ///  with <seealso cref="LowerCaseFilter"/> 
+    /// <para>
+    /// <a name="version"/>You must specify the required <seealso cref="Version"/> compatibility
+    /// when creating <seealso cref="CharTokenizer"/>:
+    /// <ul>
+    /// <li>As of 3.1, <seealso cref="LowerCaseTokenizer"/> uses an int based API to normalize and
+    /// detect token codepoints. See <seealso cref="CharTokenizer#isTokenChar(int)"/> and
+    /// <seealso cref="CharTokenizer#normalize(int)"/> for details.</li>
+    /// </ul>
+    /// </para>
+    /// <para>
+    /// 
+    /// </para>
+    /// </summary>
+    public sealed class SimpleAnalyzer : Analyzer
+    {
+
+        // Version handed to every LowerCaseTokenizer this analyzer creates.
+        private readonly Version matchVersion;
+
+        /// <summary>
+        /// Creates a new <seealso cref="SimpleAnalyzer"/> </summary>
+        /// <param name="matchVersion"> Lucene version to match. See the version notes above. </param>
+        public SimpleAnalyzer(Version matchVersion)
+        {
+            this.matchVersion = matchVersion;
+        }
+
+        // NOTE(review): StopAnalyzer in this namespace overrides this member as "CreateComponents"
+        // (PascalCase) - confirm which casing the base Analyzer actually declares.
+        /// <summary>
+        /// Builds the analysis chain: a single <see cref="LowerCaseTokenizer"/> reading from <paramref name="reader"/>. </summary>
+        /// <param name="fieldName"> field name; not used by this implementation </param>
+        /// <param name="reader"> the text to tokenize </param>
+        /// <returns> components wrapping the newly created tokenizer </returns>
+        protected internal override TokenStreamComponents createComponents(string fieldName, TextReader reader)
+        {
+            return new TokenStreamComponents(new LowerCaseTokenizer(matchVersion, reader));
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/StopAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/StopAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/StopAnalyzer.cs
new file mode 100644
index 0000000..cc5a39e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/StopAnalyzer.cs
@@ -0,0 +1,104 @@
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// Filters <seealso cref="LetterTokenizer"/> with <seealso cref="LowerCaseFilter"/> and <seealso cref="StopFilter"/>.
+	/// 
+	/// <a name="version"/>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating StopAnalyzer:
+	/// <ul>
+	///    <li> As of 3.1, StopFilter correctly handles Unicode 4.0
+	///         supplementary characters in stopwords</li>
+	///   <li> As of 2.9, position increments are preserved</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+
+	public sealed class StopAnalyzer : StopwordAnalyzerBase
+	{
+
+	  /// <summary>
+	  /// An unmodifiable set containing some common English words that are not usually useful
+	  /// for searching.
+	  /// </summary>
+	  public static readonly CharArraySet ENGLISH_STOP_WORDS_SET;
+
+	  /// <summary>
+	  /// Builds <see cref="ENGLISH_STOP_WORDS_SET"/> once, as a case-sensitive set wrapped
+	  /// by <c>CharArraySet.unmodifiableSet</c>.
+	  /// </summary>
+	  static StopAnalyzer()
+	  {
+		IList<string> stopWords = Arrays.AsList("a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with");
+		var stopSet = new CharArraySet(Version.LUCENE_CURRENT, stopWords, false);
+		ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet);
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer which removes words in
+	  ///  <see cref="ENGLISH_STOP_WORDS_SET"/>. </summary>
+	  /// <param name="matchVersion"> See the version notes above </param>
+	  public StopAnalyzer(Version matchVersion) : this(matchVersion, ENGLISH_STOP_WORDS_SET)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the stop words from the given set. </summary>
+	  /// <param name="matchVersion"> See the version notes above </param>
+	  /// <param name="stopWords"> Set of stop words  </param>
+	  public StopAnalyzer(Version matchVersion, CharArraySet stopWords) : base(matchVersion, stopWords)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the stop words from the given file. </summary>
+	  /// <remarks>
+	  /// The parameter is a <see cref="FileInfo"/> because <c>System.IO.File</c> is a static
+	  /// class and cannot be used as a parameter type. The words are loaded through
+	  /// <c>loadStopwordSet</c>.
+	  /// </remarks>
+	  /// <param name="matchVersion"> See the version notes above </param>
+	  /// <param name="stopwordsFile"> File to load stop words from  </param>
+	  public StopAnalyzer(Version matchVersion, FileInfo stopwordsFile) : this(matchVersion, loadStopwordSet(stopwordsFile, matchVersion))
+	  {
+	  }
+
+	  /// <summary>
+	  /// Builds an analyzer with the stop words from the given reader. </summary>
+	  /// <remarks> The words are loaded through <c>loadStopwordSet</c>. </remarks>
+	  /// <param name="matchVersion"> See the version notes above </param>
+	  /// <param name="stopwords"> Reader to load stop words from  </param>
+	  public StopAnalyzer(Version matchVersion, TextReader stopwords) : this(matchVersion, loadStopwordSet(stopwords, matchVersion))
+	  {
+	  }
+
+	  /// <summary>
+	  /// Creates
+	  /// <see cref="Analyzer.TokenStreamComponents"/>
+	  /// used to tokenize all the text in the provided <see cref="TextReader"/>.
+	  /// </summary>
+	  /// <param name="fieldName"> field name; not used by this implementation </param>
+	  /// <param name="reader"> the text to tokenize </param>
+	  /// <returns> <see cref="Analyzer.TokenStreamComponents"/>
+	  ///         built from a <see cref="LowerCaseTokenizer"/> filtered with
+	  ///         <see cref="StopFilter"/> </returns>
+	  protected internal override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+	  {
+		Tokenizer source = new LowerCaseTokenizer(matchVersion, reader);
+		return new Analyzer.TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+	  }
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilter.cs
new file mode 100644
index 0000000..aeaf324
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilter.cs
@@ -0,0 +1,129 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// Removes stop words from a token stream.
+	/// 
+	/// <a name="version"/>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating StopFilter:
+	/// <ul>
+	///   <li> As of 3.1, StopFilter correctly handles Unicode 4.0
+	///         supplementary characters in stopwords and position
+	///         increments are preserved</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public sealed class StopFilter : FilteringTokenFilter
+	{
+
+	  private readonly CharArraySet stopWords;
+	  private readonly CharTermAttribute termAtt;
+
+	  /// <summary>
+	  /// Constructs a filter which removes words from the input TokenStream that are
+	  /// named in the Set.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version to enable correct Unicode 4.0 behavior in the stop
+	  ///          set if Version > 3.0.  See the version notes above for details. </param>
+	  /// <param name="in">
+	  ///          Input stream </param>
+	  /// <param name="stopWords">
+	  ///          A <see cref="CharArraySet"/> representing the stopwords. </param>
+	  public StopFilter(Version matchVersion, TokenStream @in, CharArraySet stopWords) : base(matchVersion, @in)
+	  {
+		// Assigned here rather than at the declaration: a C# field initializer
+		// cannot call an instance method.
+		termAtt = addAttribute(typeof(CharTermAttribute));
+		this.stopWords = stopWords;
+	  }
+
+	  /// <summary>
+	  /// Builds a Set from an array of stop words,
+	  /// appropriate for passing into the StopFilter constructor.
+	  /// This permits this stopWords construction to be cached once when
+	  /// an Analyzer is constructed.
+	  /// </summary>
+	  /// <remarks> Delegates to the three-argument overload, passing <c>false</c> for <c>ignoreCase</c>. </remarks>
+	  /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
+	  /// <param name="stopWords"> An array of stopwords </param>
+	  /// <returns> A Set (<see cref="CharArraySet"/>) containing the words </returns>
+	  public static CharArraySet makeStopSet(Version matchVersion, params string[] stopWords)
+	  {
+		return makeStopSet(matchVersion, stopWords, false);
+	  }
+
+	  /// <summary>
+	  /// Builds a Set from an array of stop words,
+	  /// appropriate for passing into the StopFilter constructor.
+	  /// This permits this stopWords construction to be cached once when
+	  /// an Analyzer is constructed.
+	  /// </summary>
+	  /// <remarks> Delegates to the three-argument overload, passing <c>false</c> for <c>ignoreCase</c>. </remarks>
+	  /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
+	  /// <param name="stopWords"> A List of Strings or char[] or any other toString()-able list representing the stopwords </param>
+	  /// <returns> A Set (<see cref="CharArraySet"/>) containing the words </returns>
+	  public static CharArraySet MakeStopSet<T1>(Version matchVersion, IList<T1> stopWords)
+	  {
+		return makeStopSet(matchVersion, stopWords, false);
+	  }
+
+	  /// <summary>
+	  /// Creates a stopword set from the given stopword array.
+	  /// </summary>
+	  /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
+	  /// <param name="stopWords"> An array of stopwords </param>
+	  /// <param name="ignoreCase"> If true, all words are lower cased first. </param>
+	  /// <returns> a Set containing the words </returns>
+	  public static CharArraySet MakeStopSet(Version matchVersion, string[] stopWords, bool ignoreCase)
+	  {
+		CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.Length, ignoreCase);
+		stopSet.AddAll(Arrays.AsList(stopWords));
+		return stopSet;
+	  }
+
+	  /// <summary>
+	  /// Creates a stopword set from the given stopword list. </summary>
+	  /// <param name="matchVersion"> Lucene version to enable correct Unicode 4.0 behavior in the returned set if Version > 3.0 </param>
+	  /// <param name="stopWords"> A List of Strings or char[] or any other toString()-able list representing the stopwords </param>
+	  /// <param name="ignoreCase"> if true, all words are lower cased first </param>
+	  /// <returns> A Set (<see cref="CharArraySet"/>) containing the words </returns>
+	  public static CharArraySet makeStopSet<T1>(Version matchVersion, IList<T1> stopWords, bool ignoreCase)
+	  {
+		CharArraySet stopSet = new CharArraySet(matchVersion, stopWords.Count, ignoreCase);
+		stopSet.addAll(stopWords);
+		return stopSet;
+	  }
+
+	  /// <summary>
+	  /// Accepts the current token only when its term text is not in the stop set.
+	  /// </summary>
+	  /// <returns> <c>false</c> if the stop set contains the first <c>Length</c> chars of the
+	  ///          term buffer, <c>true</c> otherwise </returns>
+	  protected internal override bool Accept()
+	  {
+		// Buffer() / Length match the term-attribute members LowerCaseFilter uses.
+		return !stopWords.contains(termAtt.Buffer(), 0, termAtt.Length);
+	  }
+
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilterFactory.cs
new file mode 100644
index 0000000..c74874d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/StopFilterFactory.cs
@@ -0,0 +1,162 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Core;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+	using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+	using TokenFilterFactory = TokenFilterFactory;
+	using WordlistLoader = org.apache.lucene.analysis.util.WordlistLoader; // jdocs
+
+
+	/// <summary>
+	/// Factory for <seealso cref="StopFilter"/>.
+	/// 
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_stop" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.StopFilterFactory" ignoreCase="true"
+	///             words="stopwords.txt" format="wordset" /&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// 
+	/// <para>
+	/// All attributes are optional:
+	/// </para>
+	/// <ul>
+	///  <li><code>ignoreCase</code> defaults to <code>false</code></li>
+	///  <li><code>words</code> should be the name of a stopwords file to parse, if not 
+	///      specified the factory will use <seealso cref="StopAnalyzer#ENGLISH_STOP_WORDS_SET"/>
+	///  </li>
+	///  <li><code>format</code> defines how the <code>words</code> file will be parsed, 
+	///      and defaults to <code>wordset</code>.  If <code>words</code> is not specified, 
+	///      then <code>format</code> must not be specified.
+	///  </li>
+	/// </ul>
+	/// <para>
+	/// The valid values for the <code>format</code> option are:
+	/// </para>
+	/// <ul>
+	///  <li><code>wordset</code> - This is the default format, which supports one word per 
+	///      line (including any intra-word whitespace) and allows whole line comments 
+	///      beginning with the "#" character.  Blank lines are ignored.  See 
+	///      <seealso cref="WordlistLoader#getLines WordlistLoader.getLines"/> for details.
+	///  </li>
+	///  <li><code>snowball</code> - This format allows for multiple words specified on each 
+	///      line, and trailing comments may be specified using the vertical line ("&#124;"). 
+	///      Blank lines are ignored.  See 
+	///      <seealso cref="WordlistLoader#getSnowballWordSet WordlistLoader.getSnowballWordSet"/> 
+	///      for details.
+	///  </li>
+	/// </ul>
+	/// </summary>
+	public class StopFilterFactory : TokenFilterFactory, ResourceLoaderAware
+	{
+	  /// <summary> Value of the <c>format</c> argument that selects the word-set loader. </summary>
+	  public const string FORMAT_WORDSET = "wordset";
+	  /// <summary> Value of the <c>format</c> argument that selects the snowball loader. </summary>
+	  public const string FORMAT_SNOWBALL = "snowball";
+
+	  // Populated by inform(); stays null until inform() has been called.
+	  private CharArraySet stopWords;
+	  private readonly string stopWordFiles;
+	  private readonly string format;
+	  private readonly bool ignoreCase;
+	  private readonly bool enablePositionIncrements;
+
+	  /// <summary>
+	  /// Creates a new StopFilterFactory </summary>
+	  /// <param name="args"> factory arguments; recognised keys are <c>words</c>, <c>format</c>,
+	  ///        <c>ignoreCase</c> and <c>enablePositionIncrements</c> </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty after the recognised keys were read </exception>
+	  public StopFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		assureMatchVersion();
+		stopWordFiles = get(args, "words");
+		// 'format' only gets a default when a words file was given; inform() rejects a format without one.
+		format = get(args, "format", (null == stopWordFiles ? null : FORMAT_WORDSET));
+		ignoreCase = getBoolean(args, "ignoreCase", false);
+		enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would print its type name, so list the leftover keys instead.
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+		}
+	  }
+
+	  /// <summary>
+	  /// Builds the stop word set: from the configured <c>words</c> file(s) using the configured
+	  /// <c>format</c>, or from <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c> when no file was configured.
+	  /// (The Java original declared <c>throws java.io.IOException</c>.)
+	  /// </summary>
+	  /// <param name="loader"> resource loader handed to the word list readers </param>
+	  /// <exception cref="System.ArgumentException"> if the format is unknown, or a format was given without a words file </exception>
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		if (stopWordFiles != null)
+		{
+		  // The format keywords are fixed identifiers, not linguistic text, so compare them
+		  // ordinally; a culture-sensitive comparison could vary with the current culture.
+		  if (FORMAT_WORDSET.Equals(format, StringComparison.OrdinalIgnoreCase))
+		  {
+			stopWords = getWordSet(loader, stopWordFiles, ignoreCase);
+		  }
+		  else if (FORMAT_SNOWBALL.Equals(format, StringComparison.OrdinalIgnoreCase))
+		  {
+			stopWords = getSnowballWordSet(loader, stopWordFiles, ignoreCase);
+		  }
+		  else
+		  {
+			throw new System.ArgumentException("Unknown 'format' specified for 'words' file: " + format);
+		  }
+		}
+		else
+		{
+		  if (null != format)
+		  {
+			throw new System.ArgumentException("'format' can not be specified w/o an explicit 'words' file: " + format);
+		  }
+		  stopWords = new CharArraySet(luceneMatchVersion, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
+		}
+	  }
+
+	  /// <summary> Value of the <c>enablePositionIncrements</c> argument (defaults to <c>true</c>). </summary>
+	  public virtual bool EnablePositionIncrements
+	  {
+		  get
+		  {
+			return enablePositionIncrements;
+		  }
+	  }
+
+	  /// <summary> Value of the <c>ignoreCase</c> argument (defaults to <c>false</c>). </summary>
+	  public virtual bool IgnoreCase
+	  {
+		  get
+		  {
+			return ignoreCase;
+		  }
+	  }
+
+	  /// <summary> The stop word set built by <c>inform</c>; <c>null</c> before <c>inform</c> has run. </summary>
+	  public virtual CharArraySet StopWords
+	  {
+		  get
+		  {
+			return stopWords;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in a <seealso cref="StopFilter"/> that uses this factory's
+	  /// stop word set and position increment setting.
+	  /// </summary>
+	  public override TokenStream create(TokenStream input)
+	  {
+		StopFilter stopFilter = new StopFilter(luceneMatchVersion,input,stopWords);
+		stopFilter.EnablePositionIncrements = enablePositionIncrements;
+		return stopFilter;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilter.cs
new file mode 100644
index 0000000..c546f3a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilter.cs
@@ -0,0 +1,83 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Removes tokens whose types appear in a set of blocked types from a token stream.
+    /// </summary>
+    public sealed class TypeTokenFilter : FilteringTokenFilter
+    {
+
+        private readonly HashSet<string> stopTypes;
+        // Assigned in the constructors: a C# field initializer may not call an instance
+        // method such as addAttribute (compiler error CS0236).
+        private readonly TypeAttribute typeAttribute;
+        private readonly bool useWhiteList;
+
+        /// <summary>
+        /// Create a new <seealso cref="TypeTokenFilter"/> with an explicit position increment setting.
+        /// Deprecated: enablePositionIncrements=false is not supported anymore as of Lucene 4.4.
+        /// </summary>
+        /// <param name="version">                  the Lucene match version </param>
+        /// <param name="enablePositionIncrements"> forwarded to the base filter </param>
+        /// <param name="input">                    the <seealso cref="TokenStream"/> to consume </param>
+        /// <param name="stopTypes">                the types to filter </param>
+        /// <param name="useWhiteList">             if true, then tokens whose type is in stopTypes will
+        ///                                 be kept, otherwise they will be filtered out </param>
+        [Obsolete("enablePositionIncrements=false is not supported anymore as of Lucene 4.4.")]
+        public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
+            : base(version, enablePositionIncrements, input)
+        {
+            this.typeAttribute = addAttribute(typeof(TypeAttribute));
+            this.stopTypes = stopTypes;
+            this.useWhiteList = useWhiteList;
+        }
+
+        /// <summary>
+        /// Create a new <seealso cref="TypeTokenFilter"/> that filters tokens out (useWhiteList=false)
+        /// with an explicit position increment setting.
+        /// Deprecated: enablePositionIncrements=false is not supported anymore as of Lucene 4.4.
+        /// </summary>
+        [Obsolete("enablePositionIncrements=false is not supported anymore as of Lucene 4.4.")]
+        public TypeTokenFilter(Version version, bool enablePositionIncrements, TokenStream input, HashSet<string> stopTypes)
+            : this(version, enablePositionIncrements, input, stopTypes, false)
+        {
+        }
+
+        /// <summary>
+        /// Create a new <seealso cref="TypeTokenFilter"/>. </summary>
+        /// <param name="version">      the Lucene match version </param>
+        /// <param name="input">        the <seealso cref="TokenStream"/> to consume </param>
+        /// <param name="stopTypes">    the types to filter </param>
+        /// <param name="useWhiteList"> if true, then tokens whose type is in stopTypes will
+        ///                     be kept, otherwise they will be filtered out </param>
+        public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes, bool useWhiteList)
+            : base(version, input)
+        {
+            this.typeAttribute = addAttribute(typeof(TypeAttribute));
+            this.stopTypes = stopTypes;
+            this.useWhiteList = useWhiteList;
+        }
+
+        /// <summary>
+        /// Create a new <seealso cref="TypeTokenFilter"/> that filters tokens out
+        /// (useWhiteList=false). </summary>
+        /// <seealso cref="TypeTokenFilter(Version, TokenStream, HashSet{string}, bool)"/>
+        public TypeTokenFilter(Version version, TokenStream input, HashSet<string> stopTypes)
+            : this(version, input, stopTypes, false)
+        {
+        }
+
+        /// <summary>
+        /// By default accept the token if its type is not a stop type.
+        /// When the useWhiteList parameter is set to true then accept the token if its type is contained in the stopTypes
+        /// </summary>
+        protected internal override bool Accept()
+        {
+            // Black-list mode (false) keeps tokens that are NOT in the set,
+            // white-list mode (true) keeps tokens that ARE in it.
+            return useWhiteList == stopTypes.Contains(typeAttribute.Type);
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilterFactory.cs
new file mode 100644
index 0000000..42e82d2
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/TypeTokenFilterFactory.cs
@@ -0,0 +1,94 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using org.apache.lucene.analysis.util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Factory class for <seealso cref="TypeTokenFilter"/>.
+    /// <pre class="prettyprint">
+    /// &lt;fieldType name="chars" class="solr.TextField" positionIncrementGap="100"&gt;
+    ///   &lt;analyzer&gt;
+    ///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+    ///     &lt;filter class="solr.TypeTokenFilterFactory" types="stoptypes.txt"
+    ///                   useWhitelist="false"/&gt;
+    ///   &lt;/analyzer&gt;
+    /// &lt;/fieldType&gt;</pre>
+    /// </summary>
+    public class TypeTokenFilterFactory : TokenFilterFactory, ResourceLoaderAware
+    {
+        private readonly bool useWhitelist;
+        private readonly bool enablePositionIncrements;
+        private readonly string stopTypesFiles;
+        // Populated by inform(); stays null until inform() has loaded at least one file.
+        private HashSet<string> stopTypes;
+
+        /// <summary>
+        /// Creates a new TypeTokenFilterFactory </summary>
+        /// <param name="args"> factory arguments; <c>types</c> is required, <c>enablePositionIncrements</c>
+        ///        (default <c>true</c>) and <c>useWhitelist</c> (default <c>false</c>) are optional </param>
+        /// <exception cref="System.ArgumentException"> if <paramref name="args"/> is not empty after the recognised keys were read </exception>
+        public TypeTokenFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            stopTypesFiles = require(args, "types");
+            enablePositionIncrements = getBoolean(args, "enablePositionIncrements", true);
+            useWhitelist = getBoolean(args, "useWhitelist", false);
+            if (args.Count > 0)
+            {
+                // Concatenating the dictionary itself would print its type name, so list the leftover keys instead.
+                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+        /// <summary>
+        /// Reads every file named by the <c>types</c> argument and collects their lines
+        /// into the set of stop types.
+        /// </summary>
+        /// <param name="loader"> resource loader used to read the files </param>
+        public virtual void inform(ResourceLoader loader)
+        {
+            IList<string> files = splitFileNames(stopTypesFiles);
+            if (files.Count > 0)
+            {
+                stopTypes = new HashSet<string>();
+                foreach (string file in files)
+                {
+                    IList<string> typesLines = getLines(loader, file.Trim());
+                    stopTypes.AddAll(typesLines);
+                }
+            }
+        }
+
+        /// <summary> Value of the <c>enablePositionIncrements</c> argument (defaults to <c>true</c>). </summary>
+        public virtual bool EnablePositionIncrements
+        {
+            get
+            {
+                return enablePositionIncrements;
+            }
+        }
+
+        /// <summary> The stop types collected by <c>inform</c>; <c>null</c> if none were loaded. </summary>
+        public virtual HashSet<string> StopTypes
+        {
+            get
+            {
+                return stopTypes;
+            }
+        }
+
+        /// <summary>
+        /// Wraps <paramref name="input"/> in a <seealso cref="TypeTokenFilter"/> configured with this
+        /// factory's stop types, white-list flag and position increment setting.
+        /// </summary>
+        public override TokenStream Create(TokenStream input)
+        {
+            TokenStream filter = new TypeTokenFilter(luceneMatchVersion, enablePositionIncrements, input, stopTypes, useWhitelist);
+            return filter;
+        }
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
new file mode 100644
index 0000000..d5b7f10
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilter.cs
@@ -0,0 +1,71 @@
+using Lucene.Net.Analysis.Core;
+
+namespace org.apache.lucene.analysis.core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using CharacterUtils = org.apache.lucene.analysis.util.CharacterUtils;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Normalizes token text to UPPER CASE.
+	/// <a name="version"/>
+	/// <para>You must specify the required <seealso cref="Version"/>
+	/// compatibility when creating UpperCaseFilter
+	/// 
+	/// </para>
+	/// <para><b>NOTE:</b> In Unicode, this transformation may lose information when the
+	/// upper case character represents more than one lower case character. Use this filter
+	/// when you require uppercase tokens.  Use the <seealso cref="LowerCaseFilter"/> for 
+	/// general search matching
+	/// </para>
+	/// </summary>
+	public sealed class UpperCaseFilter : TokenFilter
+	{
+	  private readonly CharacterUtils charUtils;
+	  // Assigned in the constructor: a C# field initializer may not call an instance
+	  // method such as addAttribute (compiler error CS0236).
+	  private readonly CharTermAttribute termAtt;
+
+	  /// <summary>
+	  /// Create a new UpperCaseFilter, that normalizes token text to upper case.
+	  /// </summary>
+	  /// <param name="matchVersion"> See <a href="#version">above</a> </param>
+	  /// <param name="in"> TokenStream to filter </param>
+	  public UpperCaseFilter(Version matchVersion, TokenStream @in) : base(@in)
+	  {
+		termAtt = addAttribute(typeof(CharTermAttribute));
+		charUtils = CharacterUtils.getInstance(matchVersion);
+	  }
+
+	  /// <summary>
+	  /// Advances the wrapped stream by one token and upper-cases that token's term buffer in place.
+	  /// (The Java original declared <c>throws java.io.IOException</c>.)
+	  /// </summary>
+	  /// <returns> <c>true</c> if a token was produced, <c>false</c> once the input is exhausted </returns>
+	  public override bool incrementToken()
+	  {
+		if (!input.incrementToken())
+		{
+		  return false;
+		}
+
+		// Only the first termAtt.length() chars of the buffer belong to the current term.
+		charUtils.ToUpper(termAtt.buffer(), 0, termAtt.length());
+		return true;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilterFactory.cs
new file mode 100644
index 0000000..df3580f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/UpperCaseFilterFactory.cs
@@ -0,0 +1,74 @@
+using System.Collections.Generic;
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Util;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using AbstractAnalysisFactory = AbstractAnalysisFactory;
+	using MultiTermAwareComponent = org.apache.lucene.analysis.util.MultiTermAwareComponent;
+	using TokenFilterFactory = TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="UpperCaseFilter"/>. 
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_uppercase" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.UpperCaseFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// 
+	/// <para><b>NOTE:</b> In Unicode, this transformation may lose information when the
+	/// upper case character represents more than one lower case character. Use this filter
+	/// when you require uppercase tokens.  Use the <seealso cref="LowerCaseFilterFactory"/> for 
+	/// general search matching
+	/// </para>
+	/// </summary>
+	public class UpperCaseFilterFactory : TokenFilterFactory, MultiTermAwareComponent
+	{
+
+	  /// <summary>
+	  /// Creates a new UpperCaseFilterFactory </summary>
+	  /// <param name="args"> factory arguments; no keys are read here, so the dictionary must be empty once the base constructor has run </param>
+	  /// <exception cref="System.ArgumentException"> if <paramref name="args"/> still contains entries </exception>
+	  public UpperCaseFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		assureMatchVersion();
+		if (args.Count > 0)
+		{
+		  // Concatenating the dictionary itself would print its type name, so list the leftover keys instead.
+		  throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in an <seealso cref="UpperCaseFilter"/> using this factory's match version.
+	  /// </summary>
+	  public override UpperCaseFilter create(TokenStream input)
+	  {
+		return new UpperCaseFilter(luceneMatchVersion,input);
+	  }
+
+	  /// <summary> Returns this factory itself as the multi-term component. </summary>
+	  public virtual AbstractAnalysisFactory MultiTermComponent
+	  {
+		  get
+		  {
+			return this;
+		  }
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceAnalyzer.cs
new file mode 100644
index 0000000..7e77c8d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceAnalyzer.cs
@@ -0,0 +1,58 @@
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using org.apache.lucene.analysis.core;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// An Analyzer that uses <seealso cref="WhitespaceTokenizer"/>.
+    /// <para>
+    /// <a name="version"/>You must specify the required <seealso cref="Version"/> compatibility
+    /// when creating <seealso cref="CharTokenizer"/>:
+    /// <ul>
+    /// <li>As of 3.1, <seealso cref="WhitespaceTokenizer"/> uses an int based API to normalize and
+    /// detect token codepoints. See <seealso cref="CharTokenizer#isTokenChar(int)"/> and
+    /// <seealso cref="CharTokenizer#normalize(int)"/> for details.</li>
+    /// </ul>
+    /// </para>
+    /// <para>
+    /// 
+    /// </para>
+    /// </summary>
+    public sealed class WhitespaceAnalyzer : Analyzer
+    {
+
+        // Version handed to every tokenizer this analyzer creates.
+        private readonly Version matchVersion;
+
+        /// <summary>
+        /// Builds a <seealso cref="WhitespaceAnalyzer"/> for the given compatibility version. </summary>
+        /// <param name="matchVersion"> Lucene version to match; see the version note in the class summary </param>
+        public WhitespaceAnalyzer(Version matchVersion)
+        {
+            this.matchVersion = matchVersion;
+        }
+
+        /// <summary>
+        /// Creates the analysis components for a field: a single
+        /// <seealso cref="WhitespaceTokenizer"/> reading from <paramref name="reader"/>.
+        /// </summary>
+        /// <param name="fieldName"> name of the field being analyzed (not used here) </param>
+        /// <param name="reader"> source of the text to tokenize </param>
+        protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+        {
+            WhitespaceTokenizer source = new WhitespaceTokenizer(matchVersion, reader);
+            return new TokenStreamComponents(source);
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
new file mode 100644
index 0000000..1ee9e69
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
@@ -0,0 +1,75 @@
+using System.IO;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Core
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+    /// <summary>
+	/// A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
+	/// Adjacent sequences of non-Whitespace characters form tokens. <a
+	/// name="version"/>
+	/// <para>
+	/// You must specify the required <seealso cref="Version"/> compatibility when creating
+	/// <seealso cref="WhitespaceTokenizer"/>:
+	/// <ul>
+	/// <li>As of 3.1, <seealso cref="CharTokenizer"/> uses an int based API to normalize and
+	/// detect token characters. See <seealso cref="CharTokenizer#isTokenChar(int)"/> and
+	/// <seealso cref="CharTokenizer#normalize(int)"/> for details.</li>
+	/// </ul>
+	/// </para>
+	/// </summary>
+	public sealed class WhitespaceTokenizer : CharTokenizer
+	{
+
+	  /// <summary>
+	  /// Construct a new WhitespaceTokenizer.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version to match; see the version note in the class summary </param>
+	  /// <param name="in">
+	  ///          the input to split up into tokens </param>
+	  public WhitespaceTokenizer(Version matchVersion, TextReader @in) : base(matchVersion, @in)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Construct a new WhitespaceTokenizer using a given
+	  /// <seealso cref="org.apache.lucene.util.AttributeSource.AttributeFactory"/>.
+	  /// </summary>
+	  /// <param name="matchVersion">
+	  ///          Lucene version to match; see the version note in the class summary </param>
+	  /// <param name="factory">
+	  ///          the attribute factory to use for this <seealso cref="Tokenizer"/> </param>
+	  /// <param name="in">
+	  ///          the input to split up into tokens </param>
+	  public WhitespaceTokenizer(Version matchVersion, AttributeFactory factory, TextReader @in) : base(matchVersion, factory, @in)
+	  {
+	  }
+
+	  /// <summary>
+	  /// Reports whether <paramref name="c"/> belongs to a token: every character for which
+	  /// <c>char.IsWhiteSpace</c> returns false is a token character.
+	  /// </summary>
+	  protected internal override bool IsTokenChar(char c)
+	  {
+		bool whitespace = char.IsWhiteSpace(c);
+		return !whitespace;
+	  }
+	}
+}
\ No newline at end of file


Mime
View raw message