lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [14/34] lucenenet git commit: Raw porting of Lucene.Net.Analysis.Common
Date Fri, 07 Nov 2014 23:12:18 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/SingleTokenTokenStream.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/SingleTokenTokenStream.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/SingleTokenTokenStream.cs
new file mode 100644
index 0000000..435247c
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/SingleTokenTokenStream.cs
@@ -0,0 +1,79 @@
+using System.Diagnostics;
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using AttributeImpl = org.apache.lucene.util.AttributeImpl;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+	/// <summary>
+	/// A <seealso cref="TokenStream"/> containing a single token.
+	/// </summary>
+	public sealed class SingleTokenTokenStream : TokenStream
+	{
+
+	  // Set once the single token has been emitted; reset() re-arms the stream.
+	  private bool exhausted = false;
+
+	  // The token needs to be immutable, so work with clones!
+	  private Token singleToken;
+	  private readonly AttributeImpl tokenAtt;
+
+	  /// <summary>
+	  /// Creates a stream that returns <paramref name="token"/> exactly once. </summary>
+	  /// <param name="token"> the token to emit; cloned, so later caller mutations are not visible </param>
+	  public SingleTokenTokenStream(Token token) : base(Token.TOKEN_ATTRIBUTE_FACTORY)
+	  {
+
+		Debug.Assert(token != null);
+		this.singleToken = token.clone();
+
+		tokenAtt = (AttributeImpl) addAttribute(typeof(CharTermAttribute));
+		// BUGFIX: the raw port kept Java's 'assert(...)' statement, which is not
+		// valid C#; use Debug.Assert like the null check above. With
+		// TOKEN_ATTRIBUTE_FACTORY the attribute implementation is expected to be
+		// a Token instance so copyTo() below can transfer the full token state.
+		Debug.Assert(tokenAtt is Token);
+	  }
+
+	  public override bool incrementToken()
+	  {
+		if (exhausted)
+		{
+		  return false;
+		}
+		else
+		{
+		  // Copy the stored token's state into this stream's attribute, then
+		  // mark the stream exhausted so the next call returns false.
+		  clearAttributes();
+		  singleToken.copyTo(tokenAtt);
+		  exhausted = true;
+		  return true;
+		}
+	  }
+
+	  public override void reset()
+	  {
+		exhausted = false;
+	  }
+
+	  /// <summary>
+	  /// Returns a clone of the token this stream emits. </summary>
+	  public Token getToken()
+	  {
+		return singleToken.clone();
+	  }
+
+	  /// <summary>
+	  /// Replaces the token this stream emits (stored as a clone). </summary>
+	  public void setToken(Token token)
+	  {
+		this.singleToken = token.clone();
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
new file mode 100644
index 0000000..078ff66
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs
@@ -0,0 +1,265 @@
+using System.Collections.Generic;
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+	using BytesRef = org.apache.lucene.util.BytesRef;
+	using BytesRefHash = org.apache.lucene.util.BytesRefHash;
+	using CharsRef = org.apache.lucene.util.CharsRef;
+	using IntsRef = org.apache.lucene.util.IntsRef;
+	using UnicodeUtil = org.apache.lucene.util.UnicodeUtil;
+	using ByteSequenceOutputs = org.apache.lucene.util.fst.ByteSequenceOutputs;
+	using FST = org.apache.lucene.util.fst.FST;
+	using Arc = org.apache.lucene.util.fst.FST.Arc;
+	using BytesReader = org.apache.lucene.util.fst.FST.BytesReader;
+
+
+	/// <summary>
+	/// Provides the ability to override any <seealso cref="KeywordAttribute"/> aware stemmer
+	/// with custom dictionary-based stemming.
+	/// </summary>
+	public sealed class StemmerOverrideFilter : TokenFilter
+	{
+	  private readonly StemmerOverrideMap stemmerOverrideMap;
+
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly KeywordAttribute keywordAtt = addAttribute(typeof(KeywordAttribute));
+	  // null when the map has no FST, i.e. no overrides were defined (see BytesReader getter below)
+	  private readonly FST.BytesReader fstReader;
+	  // scratch objects reused across tokens to avoid per-token allocation
+	  private readonly FST.Arc<BytesRef> scratchArc = new FST.Arc<BytesRef>();
+	  private readonly CharsRef spare = new CharsRef();
+
+	  /// <summary>
+	  /// Create a new StemmerOverrideFilter, performing dictionary-based stemming
+	  /// with the provided <code>dictionary</code>.
+	  /// <para>
+	  /// Any dictionary-stemmed terms will be marked with <seealso cref="KeywordAttribute"/>
+	  /// so that they will not be stemmed with stemmers down the chain.
+	  /// </para>
+	  /// </summary>
+//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
+//ORIGINAL LINE: public StemmerOverrideFilter(final org.apache.lucene.analysis.TokenStream input, final StemmerOverrideMap stemmerOverrideMap)
+	  public StemmerOverrideFilter(TokenStream input, StemmerOverrideMap stemmerOverrideMap) : base(input)
+	  {
+		this.stemmerOverrideMap = stemmerOverrideMap;
+		fstReader = stemmerOverrideMap.BytesReader;
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		if (input.incrementToken())
+		{
+		  if (fstReader == null)
+		  {
+			// No overrides
+			return true;
+		  }
+		  if (!keywordAtt.Keyword) // don't muck with already-keyworded terms
+		  {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final org.apache.lucene.util.BytesRef stem = stemmerOverrideMap.get(termAtt.buffer(), termAtt.length(), scratchArc, fstReader);
+			// Look the current term up in the FST; null means no override for it.
+			BytesRef stem = stemmerOverrideMap.get(termAtt.buffer(), termAtt.length(), scratchArc, fstReader);
+			if (stem != null)
+			{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final char[] buffer = spare.chars = termAtt.buffer();
+			  // Decode the UTF-8 override bytes into 'spare'; if the conversion
+			  // had to grow past the term's own buffer, copy the result back.
+			  char[] buffer = spare.chars = termAtt.buffer();
+			  UnicodeUtil.UTF8toUTF16(stem.bytes, stem.offset, stem.length, spare);
+			  if (spare.chars != buffer)
+			  {
+				termAtt.copyBuffer(spare.chars, spare.offset, spare.length);
+			  }
+			  termAtt.Length = spare.length;
+			  // mark as keyword so downstream stemmers leave the override alone
+			  keywordAtt.Keyword = true;
+			}
+		  }
+		  return true;
+		}
+		else
+		{
+		  return false;
+		}
+	  }
+
+	  /// <summary>
+	  /// A read-only 4-byte FST backed map that allows fast case-insensitive key
+	  /// value lookups for <seealso cref="StemmerOverrideFilter"/>
+	  /// </summary>
+	  // TODO maybe we can generalize this and reuse this map somehow?
+	  public sealed class StemmerOverrideMap
+	  {
+		internal readonly FST<BytesRef> fst;
+		internal readonly bool ignoreCase;
+
+		/// <summary>
+		/// Creates a new <seealso cref="StemmerOverrideMap"/> </summary>
+		/// <param name="fst"> the fst to lookup the overrides </param>
+		/// <param name="ignoreCase"> if the keys case should be ingored </param>
+		public StemmerOverrideMap(FST<BytesRef> fst, bool ignoreCase)
+		{
+		  this.fst = fst;
+		  this.ignoreCase = ignoreCase;
+		}
+
+		/// <summary>
+		/// Returns a <seealso cref="BytesReader"/> to pass to the <seealso cref="#get(char[], int, FST.Arc, FST.BytesReader)"/> method.
+		/// Returns <code>null</code> when the map is empty (no FST was built).
+		/// </summary>
+		public FST.BytesReader BytesReader
+		{
+			get
+			{
+			  if (fst == null)
+			  {
+				return null;
+			  }
+			  else
+			  {
+				return fst.BytesReader;
+			  }
+			}
+		}
+
+		/// <summary>
+		/// Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary.
+		/// </summary>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public org.apache.lucene.util.BytesRef get(char[] buffer, int bufferLen, org.apache.lucene.util.fst.FST.Arc<org.apache.lucene.util.BytesRef> scratchArc, org.apache.lucene.util.fst.FST.BytesReader fstReader) throws java.io.IOException
+		public BytesRef get(char[] buffer, int bufferLen, FST.Arc<BytesRef> scratchArc, FST.BytesReader fstReader)
+		{
+		  // Walk the FST one code point at a time, accumulating the output;
+		  // bail out with null as soon as a code point has no matching arc.
+		  BytesRef pendingOutput = fst.outputs.NoOutput;
+		  BytesRef matchOutput = null;
+		  int bufUpto = 0;
+		  fst.getFirstArc(scratchArc);
+		  while (bufUpto < bufferLen)
+		  {
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
+			int codePoint = char.codePointAt(buffer, bufUpto, bufferLen);
+			if (fst.findTargetArc(ignoreCase ? char.ToLower(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null)
+			{
+			  return null;
+			}
+			pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
+			// advance by 1 or 2 chars depending on whether this was a surrogate pair
+			bufUpto += char.charCount(codePoint);
+		  }
+		  // Only a key that ends on a final arc is an actual dictionary entry.
+		  if (scratchArc.Final)
+		  {
+			matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
+		  }
+		  return matchOutput;
+		}
+
+	  }
+	  /// <summary>
+	  /// This builder builds an <seealso cref="FST"/> for the <seealso cref="StemmerOverrideFilter"/>
+	  /// </summary>
+	  public class Builder
+	  {
+		// hash de-duplicates input keys; outputValues[i] holds the override for hash id i
+		internal readonly BytesRefHash hash = new BytesRefHash();
+		internal readonly BytesRef spare = new BytesRef();
+		internal readonly List<CharSequence> outputValues = new List<CharSequence>();
+		internal readonly bool ignoreCase;
+		internal readonly CharsRef charsSpare = new CharsRef();
+
+		/// <summary>
+		/// Creates a new <seealso cref="Builder"/> with ignoreCase set to <code>false</code> 
+		/// </summary>
+		public Builder() : this(false)
+		{
+		}
+
+		/// <summary>
+		/// Creates a new <seealso cref="Builder"/> </summary>
+		/// <param name="ignoreCase"> if the input case should be ignored. </param>
+		public Builder(bool ignoreCase)
+		{
+		  this.ignoreCase = ignoreCase;
+		}
+
+		/// <summary>
+		/// Adds an input string and it's stemmer override output to this builder.
+		/// </summary>
+		/// <param name="input"> the input char sequence </param>
+		/// <param name="output"> the stemmer override output char sequence </param>
+		/// <returns> <code>false</code> iff the input has already been added to this builder otherwise <code>true</code>. </returns>
+		public virtual bool add(CharSequence input, CharSequence output)
+		{
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int length = input.length();
+		  int length = input.length();
+		  if (ignoreCase)
+		  {
+			// convert on the fly to lowercase
+			charsSpare.grow(length);
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final char[] buffer = charsSpare.chars;
+			char[] buffer = charsSpare.chars;
+			for (int i = 0; i < length;)
+			{
+				// i advances by the code point's char count (1 or 2)
+				i += char.toChars(char.ToLower(char.codePointAt(input, i)), buffer, i);
+			}
+			UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare);
+		  }
+		  else
+		  {
+			UnicodeUtil.UTF16toUTF8(input, 0, length, spare);
+		  }
+		  // BytesRefHash.add returns a negative id for an already-present key
+		  if (hash.add(spare) >= 0)
+		  {
+			outputValues.Add(output);
+			return true;
+		  }
+		  return false;
+		}
+
+		/// <summary>
+		/// Returns an <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </summary>
+		/// <returns> an <seealso cref="StemmerOverrideMap"/> to be used with the <seealso cref="StemmerOverrideFilter"/> </returns>
+		/// <exception cref="IOException"> if an <seealso cref="IOException"/> occurs; </exception>
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: public StemmerOverrideMap build() throws java.io.IOException
+		public virtual StemmerOverrideMap build()
+		{
+		  ByteSequenceOutputs outputs = ByteSequenceOutputs.Singleton;
+		  org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int[] sort = hash.sort(org.apache.lucene.util.BytesRef.getUTF8SortedAsUnicodeComparator());
+		  // FST construction requires keys in sorted (UTF-8/Unicode) order
+		  int[] sort = hash.sort(BytesRef.UTF8SortedAsUnicodeComparator);
+		  IntsRef intsSpare = new IntsRef();
+//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+//ORIGINAL LINE: final int size = hash.size();
+		  int size = hash.size();
+		  for (int i = 0; i < size; i++)
+		  {
+			int id = sort[i];
+			BytesRef bytesRef = hash.get(id, spare);
+			// keys are fed to the FST as UTF-32 code points (INPUT_TYPE.BYTE4)
+			UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
+			builder.add(intsSpare, new BytesRef(outputValues[id]));
+		  }
+		  return new StemmerOverrideMap(builder.finish(), ignoreCase);
+		}
+
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilterFactory.cs
new file mode 100644
index 0000000..f755c1f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilterFactory.cs
@@ -0,0 +1,97 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+
+	using StemmerOverrideMap = org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;
+	using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+	using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+	using TokenFilterFactory = TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="StemmerOverrideFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_dicstem" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class StemmerOverrideFilterFactory : TokenFilterFactory, ResourceLoaderAware
+	{
+	  // built from the dictionary file(s) in inform(); null when no dictionary was configured
+	  private StemmerOverrideMap dictionary;
+	  private readonly string dictionaryFiles;
+	  private readonly bool ignoreCase;
+
+	  /// <summary>
+	  /// Creates a new StemmerOverrideFilterFactory </summary>
+	  public StemmerOverrideFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		dictionaryFiles = get(args, "dictionary");
+		ignoreCase = getBoolean(args, "ignoreCase", false);
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  /// <summary>
+	  /// Loads the configured dictionary file(s) and builds the override map.
+	  /// Each non-empty line is expected to be "input&lt;TAB&gt;output". </summary>
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		if (dictionaryFiles != null)
+		{
+		  assureMatchVersion();
+		  IList<string> files = splitFileNames(dictionaryFiles);
+		  if (files.Count > 0)
+		  {
+			StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
+			foreach (string file in files)
+			{
+			  IList<string> list = getLines(loader, file.Trim());
+			  foreach (string line in list)
+			  {
+				// Split on the first tab only (Java: line.split("\t", 2)).
+				// BUGFIX: string.Split has no (string, int) overload in the
+				// targeted framework, so the raw port's line.Split("\t", 2)
+				// would not compile; use the (char[], int) overload instead.
+				string[] mapping = line.Split(new[] { '\t' }, 2);
+				builder.add(mapping[0], mapping[1]);
+			  }
+			}
+			dictionary = builder.build();
+		  }
+		}
+	  }
+
+	  /// <summary>
+	  /// Whether dictionary key lookups ignore case. </summary>
+	  public virtual bool IgnoreCase
+	  {
+		  get
+		  {
+			return ignoreCase;
+		  }
+	  }
+
+	  /// <summary>
+	  /// Returns the input stream unchanged when no dictionary was loaded. </summary>
+	  public override TokenStream create(TokenStream input)
+	  {
+		return dictionary == null ? input : new StemmerOverrideFilter(input, dictionary);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilter.cs
new file mode 100644
index 0000000..5a511bc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilter.cs
@@ -0,0 +1,114 @@
+using System;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using Version = org.apache.lucene.util.Version;
+
+	/// <summary>
+	/// Trims leading and trailing whitespace from Tokens in the stream.
+	/// <para>As of Lucene 4.4, this filter does not support updateOffsets=true anymore
+	/// as it can lead to broken token streams.
+	/// </para>
+	/// </summary>
+	public sealed class TrimFilter : TokenFilter
+	{
+
+	  // whether to shift the token's offsets to match the trimmed text (pre-4.4 only)
+	  internal readonly bool updateOffsets;
+	  private readonly CharTermAttribute termAtt = addAttribute(typeof(CharTermAttribute));
+	  private readonly OffsetAttribute offsetAtt = addAttribute(typeof(OffsetAttribute));
+
+	  /// <summary>
+	  /// Create a new <seealso cref="TrimFilter"/>. </summary>
+	  /// <param name="version">       the Lucene match version </param>
+	  /// <param name="in">            the stream to consume </param>
+	  /// <param name="updateOffsets"> whether to update offsets </param>
+	  /// @deprecated Offset updates are not supported anymore as of Lucene 4.4. 
+	  [Obsolete("Offset updates are not supported anymore as of Lucene 4.4.")]
+	  public TrimFilter(Version version, TokenStream @in, bool updateOffsets) : base(@in)
+	  {
+		// updateOffsets=true is rejected outright for 4.4+ match versions
+		if (updateOffsets && version.onOrAfter(Version.LUCENE_44))
+		{
+		  throw new System.ArgumentException("updateOffsets=true is not supported anymore as of Lucene 4.4");
+		}
+		this.updateOffsets = updateOffsets;
+	  }
+
+	  /// <summary>
+	  /// Create a new <seealso cref="TrimFilter"/> on top of <code>in</code>. </summary>
+	  public TrimFilter(Version version, TokenStream @in) : this(version, @in, false)
+	  {
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  public override bool incrementToken()
+	  {
+		if (!input.incrementToken())
+		{
+			return false;
+		}
+
+		char[] termBuffer = termAtt.buffer();
+		int len = termAtt.length();
+		//TODO: Is this the right behavior or should we return false?  Currently, "  ", returns true, so I think this should
+		//also return true
+		if (len == 0)
+		{
+		  return true;
+		}
+		// start: index of first non-whitespace char; end: one past the last;
+		// endOff: number of trailing whitespace chars removed (for offset fix-up)
+		int start = 0;
+		int end = 0;
+		int endOff = 0;
+
+		// eat the first characters
+		for (start = 0; start < len && char.IsWhiteSpace(termBuffer[start]); start++)
+		{
+		}
+		// eat the end characters
+		for (end = len; end >= start && char.IsWhiteSpace(termBuffer[end - 1]); end--)
+		{
+		  endOff++;
+		}
+		// only touch the attributes when something was actually trimmed
+		if (start > 0 || end < len)
+		{
+		  if (start < end)
+		  {
+			termAtt.copyBuffer(termBuffer, start, (end - start));
+		  }
+		  else
+		  {
+			// the whole term was whitespace; emit an empty term
+			termAtt.setEmpty();
+		  }
+		  // NOTE(review): offsets are only adjusted when the term length equals the
+		  // offset span -- presumably to avoid corrupting offsets already changed
+		  // by an upstream filter; confirm against the Java original.
+		  if (updateOffsets && len == offsetAtt.endOffset() - offsetAtt.startOffset())
+		  {
+			int newStart = offsetAtt.startOffset() + start;
+			int newEnd = offsetAtt.endOffset() - (start < end ? endOff:0);
+			offsetAtt.setOffset(newStart, newEnd);
+		  }
+		}
+
+		return true;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilterFactory.cs
new file mode 100644
index 0000000..212d555
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TrimFilterFactory.cs
@@ -0,0 +1,63 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="TrimFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.NGramTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.TrimFilterFactory" /&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	/// <seealso cref= TrimFilter </seealso>
+	public class TrimFilterFactory : TokenFilterFactory
+	{
+
+	  protected internal readonly bool updateOffsets;
+
+	  /// <summary>
+	  /// Creates a new TrimFilterFactory </summary>
+	  public TrimFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		updateOffsets = getBoolean(args, "updateOffsets", false);
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  /// <summary>
+	  /// Creates a <seealso cref="TrimFilter"/> over <code>input</code>. </summary>
+	  // BUGFIX: the raw port declared the return type as TrimFilter, but C#
+	  // (before C# 9) does not allow covariant overrides of the base
+	  // TokenFilterFactory.create(TokenStream); return TokenStream instead,
+	  // consistent with the other factories in this commit.
+	  public override TokenStream create(TokenStream input)
+	  {
+		// The (version, input, updateOffsets) constructor is marked obsolete as
+		// of Lucene 4.4 but is still needed while updateOffsets is configurable.
+		TrimFilter filter = new TrimFilter(luceneMatchVersion, input, updateOffsets);
+		return filter;
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilter.cs
new file mode 100644
index 0000000..df01d3a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilter.cs
@@ -0,0 +1,66 @@
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using KeywordAttribute = org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
+
+	/// <summary>
+	/// A token filter for truncating the terms into a specific length.
+	/// Fixed prefix truncation, as a stemming method, produces good results on Turkish language.
+	/// It is reported that F5, using first 5 characters, produced best results in
+	/// <a href="http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf">
+	/// Information Retrieval on Turkish Texts</a>
+	/// </summary>
+	public sealed class TruncateTokenFilter : TokenFilter
+	{
+	  private readonly CharTermAttribute termAttribute = addAttribute(typeof(CharTermAttribute));
+	  private readonly KeywordAttribute keywordAttr = addAttribute(typeof(KeywordAttribute));
+
+	  // maximum number of characters a term keeps; always >= 1
+	  private readonly int length;
+
+	  /// <summary>
+	  /// Creates a filter that clips each term to at most <code>length</code> characters. </summary>
+	  public TruncateTokenFilter(TokenStream input, int length) : base(input)
+	  {
+		if (length < 1)
+		{
+		  throw new System.ArgumentException("length parameter must be a positive number: " + length);
+		}
+		this.length = length;
+	  }
+
+	  public override bool incrementToken()
+	  {
+		// Guard clause: upstream stream is exhausted.
+		if (!input.incrementToken())
+		{
+		  return false;
+		}
+		// Keyword-marked terms are passed through untouched; everything else
+		// is clipped to the configured prefix length.
+		if (!keywordAttr.Keyword && termAttribute.length() > length)
+		{
+		  termAttribute.Length = length;
+		}
+		return true;
+	  }
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilterFactory.cs
new file mode 100644
index 0000000..06fd1d0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/TruncateTokenFilterFactory.cs
@@ -0,0 +1,66 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = TokenFilterFactory;
+
+	/// <summary>
+	/// Factory for <seealso cref="org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter"/>. The following type is recommended for "<i>diacritics-insensitive search</i>" for Turkish.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_tr_ascii_f5" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.ApostropheFilterFactory"/&gt;
+	///     &lt;filter class="solr.TurkishLowerCaseFilterFactory"/&gt;
+	///     &lt;filter class="solr.ASCIIFoldingFilterFactory" preserveOriginal="true"/&gt;
+	///     &lt;filter class="solr.KeywordRepeatFilterFactory"/&gt;
+	///     &lt;filter class="solr.TruncateTokenFilterFactory" prefixLength="5"/&gt;
+	///     &lt;filter class="solr.RemoveDuplicatesTokenFilterFactory"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class TruncateTokenFilterFactory : TokenFilterFactory
+	{
+
+	  public const string PREFIX_LENGTH_KEY = "prefixLength";
+	  // BUGFIX: stored as int rather than the raw port's sbyte (an artifact of
+	  // Java's byte): with sbyte, prefixLength="128" threw OverflowException in
+	  // Parse before the explicit range check below could produce the intended
+	  // ArgumentException. int also matches TruncateTokenFilter's (TokenStream, int) ctor.
+	  private readonly int prefixLength;
+
+	  /// <summary>
+	  /// Creates a new TruncateTokenFilterFactory; reads and validates the
+	  /// <code>prefixLength</code> argument (default 5, must be >= 1). </summary>
+	  public TruncateTokenFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		prefixLength = int.Parse(get(args, PREFIX_LENGTH_KEY, "5"));
+		if (prefixLength < 1)
+		{
+		  throw new System.ArgumentException(PREFIX_LENGTH_KEY + " parameter must be a positive number: " + prefixLength);
+		}
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameter(s): " + args);
+		}
+	  }
+
+	  public override TokenStream create(TokenStream input)
+	  {
+		return new TruncateTokenFilter(input, prefixLength);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilter.cs
new file mode 100644
index 0000000..16575e6
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilter.cs
@@ -0,0 +1,761 @@
+using System;
+using System.Text;
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+using Lucene.Net.Analysis.Core;
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	using WhitespaceTokenizer = WhitespaceTokenizer;
+	using StandardTokenizer = org.apache.lucene.analysis.standard.StandardTokenizer;
+	using OffsetAttribute = org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+	using PositionIncrementAttribute = org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+	using CharTermAttribute = org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+	using TypeAttribute = org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using ArrayUtil = org.apache.lucene.util.ArrayUtil;
+	using AttributeSource = org.apache.lucene.util.AttributeSource;
+	using InPlaceMergeSorter = org.apache.lucene.util.InPlaceMergeSorter;
+	using RamUsageEstimator = org.apache.lucene.util.RamUsageEstimator;
+	using Version = org.apache.lucene.util.Version;
+
+
+	/// <summary>
+	/// Splits words into subwords and performs optional transformations on subword
+	/// groups. Words are split into subwords with the following rules:
+	/// <ul>
+	/// <li>split on intra-word delimiters (by default, all non alpha-numeric
+	/// characters): <code>"Wi-Fi"</code> &#8594; <code>"Wi", "Fi"</code></li>
+	/// <li>split on case transitions: <code>"PowerShot"</code> &#8594;
+	/// <code>"Power", "Shot"</code></li>
+	/// <li>split on letter-number transitions: <code>"SD500"</code> &#8594;
+	/// <code>"SD", "500"</code></li>
+	/// <li>leading and trailing intra-word delimiters on each subword are ignored:
+	/// <code>"//hello---there, 'dude'"</code> &#8594;
+	/// <code>"hello", "there", "dude"</code></li>
+	/// <li>trailing "'s" are removed for each subword: <code>"O'Neil's"</code>
+	/// &#8594; <code>"O", "Neil"</code>
+	/// <ul>
+	/// <li>Note: this step isn't performed in a separate filter because of possible
+	/// subword combinations.</li>
+	/// </ul>
+	/// </li>
+	/// </ul>
+	/// 
+	/// The <b>combinations</b> parameter affects how subwords are combined:
+	/// <ul>
+	/// <li>combinations="0" causes no subword combinations: <code>"PowerShot"</code>
+	/// &#8594; <code>0:"Power", 1:"Shot"</code> (0 and 1 are the token positions)</li>
+	/// <li>combinations="1" means that in addition to the subwords, maximum runs of
+	/// non-numeric subwords are catenated and produced at the same position of the
+	/// last subword in the run:
+	/// <ul>
+	/// <li><code>"PowerShot"</code> &#8594;
+	/// <code>0:"Power", 1:"Shot" 1:"PowerShot"</code></li>
+	/// <li><code>"A's+B's&amp;C's"</code> &#8594; <code>0:"A", 1:"B", 2:"C", 2:"ABC"</code>
+	/// </li>
+	/// <li><code>"Super-Duper-XL500-42-AutoCoder!"</code> &#8594;
+	/// <code>0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"</code>
+	/// </li>
+	/// </ul>
+	/// </li>
+	/// </ul>
+	/// One use for <seealso cref="WordDelimiterFilter"/> is to help match words with different
+	/// subword delimiters. For example, if the source text contained "wi-fi" one may
+	/// want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so
+	/// is to specify combinations="1" in the analyzer used for indexing, and
+	/// combinations="0" (the default) in the analyzer used for querying. Given that
+	/// the current <seealso cref="StandardTokenizer"/> immediately removes many intra-word
+	/// delimiters, it is recommended that this filter be used after a tokenizer that
+	/// does not do this (such as <seealso cref="WhitespaceTokenizer"/>).
+	/// </summary>
+	public sealed class WordDelimiterFilter : TokenFilter
+	{
+		// Guards InitializeInstanceFields: both constructors call it and the
+		// chained constructor would otherwise run it twice (artifact of the
+		// Java-to-C# conversion of Java instance initializers).
+		private bool InstanceFieldsInitialized = false;
+
+		private void InitializeInstanceFields()
+		{
+			concat = new WordDelimiterConcatenation(this);
+			concatAll = new WordDelimiterConcatenation(this);
+			sorter = new OffsetSorter(this);
+		}
+
+
+	  // Character type bits, as produced by WordDelimiterIterator.
+	  public const int LOWER = 0x01;
+	  public const int UPPER = 0x02;
+	  public const int DIGIT = 0x04;
+	  public const int SUBWORD_DELIM = 0x08;
+
+	  // combinations: for testing, not for setting bits
+	  public const int ALPHA = 0x03;
+	  public const int ALPHANUM = 0x07;
+
+	  /// <summary>
+	  /// Causes parts of words to be generated:
+	  /// <p/>
+	  /// "PowerShot" => "Power" "Shot"
+	  /// </summary>
+	  public const int GENERATE_WORD_PARTS = 1;
+
+	  /// <summary>
+	  /// Causes number subwords to be generated:
+	  /// <p/>
+	  /// "500-42" => "500" "42"
+	  /// </summary>
+	  public const int GENERATE_NUMBER_PARTS = 2;
+
+	  /// <summary>
+	  /// Causes maximum runs of word parts to be catenated:
+	  /// <p/>
+	  /// "wi-fi" => "wifi"
+	  /// </summary>
+	  public const int CATENATE_WORDS = 4;
+
+	  /// <summary>
+	  /// Causes maximum runs of number parts to be catenated:
+	  /// <p/>
+	  /// "500-42" => "50042"
+	  /// </summary>
+	  public const int CATENATE_NUMBERS = 8;
+
+	  /// <summary>
+	  /// Causes all subword parts to be catenated:
+	  /// <p/>
+	  /// "wi-fi-4000" => "wifi4000"
+	  /// </summary>
+	  public const int CATENATE_ALL = 16;
+
+	  /// <summary>
+	  /// Causes original words to be preserved and added to the subword list (Defaults to false)
+	  /// <p/>
+	  /// "500-42" => "500" "42" "500-42"
+	  /// </summary>
+	  public const int PRESERVE_ORIGINAL = 32;
+
+	  /// <summary>
+	  /// If not set, causes case changes to be ignored (subwords will only be generated
+	  /// given SUBWORD_DELIM tokens)
+	  /// </summary>
+	  public const int SPLIT_ON_CASE_CHANGE = 64;
+
+	  /// <summary>
+	  /// If not set, causes numeric changes to be ignored (subwords will only be generated
+	  /// given SUBWORD_DELIM tokens).
+	  /// </summary>
+	  public const int SPLIT_ON_NUMERICS = 128;
+
+	  /// <summary>
+	  /// Causes trailing "'s" to be removed for each subword
+	  /// <p/>
+	  /// "O'Neil's" => "O", "Neil"
+	  /// </summary>
+	  public const int STEM_ENGLISH_POSSESSIVE = 256;
+
+	  /// <summary>
+	  /// If not null is the set of tokens to protect from being delimited
+	  /// 
+	  /// </summary>
+	  internal readonly CharArraySet protWords;
+
+	  // bitwise OR of the configuration constants above; queried via has()
+	  private readonly int flags;
+
+	  // attributes of the token currently being produced
+	  private readonly CharTermAttribute termAttribute = addAttribute(typeof(CharTermAttribute));
+	  private readonly OffsetAttribute offsetAttribute = addAttribute(typeof(OffsetAttribute));
+	  private readonly PositionIncrementAttribute posIncAttribute = addAttribute(typeof(PositionIncrementAttribute));
+	  private readonly TypeAttribute typeAttribute = addAttribute(typeof(TypeAttribute));
+
+	  // used for iterating word delimiter breaks
+	  private readonly WordDelimiterIterator iterator;
+
+	  // used for concatenating runs of similar typed subwords (word,number)
+	  private WordDelimiterConcatenation concat;
+	  // number of subwords last output by concat.
+	  private int lastConcatCount = 0;
+
+	  // used for catenate all
+	  private WordDelimiterConcatenation concatAll;
+
+	  // used for accumulating position increment gaps
+	  private int accumPosInc = 0;
+
+	  // snapshot of the current input token, taken by saveState(), so that
+	  // multiple subwords can be emitted from it across incrementToken() calls
+	  private char[] savedBuffer = new char[1024];
+	  private int savedStartOffset;
+	  private int savedEndOffset;
+	  private string savedType;
+	  private bool hasSavedState = false;
+	  // if length by start + end offsets doesn't match the term text then assume
+	  // this is a synonym and don't adjust the offsets.
+	  private bool hasIllegalOffsets = false;
+
+	  // for a run of the same subword type within a word, have we output anything?
+	  private bool hasOutputToken = false;
+	  // when preserve original is on, have we output any token following it?
+	  // this token must have posInc=0!
+	  private bool hasOutputFollowingOriginal = false;
+
+	  /// <summary>
+	  /// Creates a new WordDelimiterFilter
+	  /// </summary>
+	  /// <param name="in"> TokenStream to be filtered </param>
+	  /// <param name="charTypeTable"> table containing character types </param>
+	  /// <param name="configurationFlags"> Flags configuring the filter </param>
+	  /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
+	  public WordDelimiterFilter(Version matchVersion, TokenStream @in, sbyte[] charTypeTable, int configurationFlags, CharArraySet protWords) : base(@in)
+	  {
+		  if (!InstanceFieldsInitialized)
+		  {
+			  InitializeInstanceFields();
+			  InstanceFieldsInitialized = true;
+		  }
+		if (!matchVersion.onOrAfter(Version.LUCENE_48))
+		{
+		  throw new System.ArgumentException("This class only works with Lucene 4.8+. To emulate the old (broken) behavior of WordDelimiterFilter, use Lucene47WordDelimiterFilter");
+		}
+		this.flags = configurationFlags;
+		this.protWords = protWords;
+		this.iterator = new WordDelimiterIterator(charTypeTable, has(SPLIT_ON_CASE_CHANGE), has(SPLIT_ON_NUMERICS), has(STEM_ENGLISH_POSSESSIVE));
+	  }
+
+	  /// <summary>
+	  /// Creates a new WordDelimiterFilter using <seealso cref="WordDelimiterIterator#DEFAULT_WORD_DELIM_TABLE"/>
+	  /// as its charTypeTable
+	  /// </summary>
+	  /// <param name="in"> TokenStream to be filtered </param>
+	  /// <param name="configurationFlags"> Flags configuring the filter </param>
+	  /// <param name="protWords"> If not null is the set of tokens to protect from being delimited </param>
+	  public WordDelimiterFilter(Version matchVersion, TokenStream @in, int configurationFlags, CharArraySet protWords) : this(matchVersion, @in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, configurationFlags, protWords)
+	  {
+		  // NOTE(review): redundant — the chained constructor already ran
+		  // InitializeInstanceFields; the guard flag makes this a no-op.
+		  if (!InstanceFieldsInitialized)
+		  {
+			  InitializeInstanceFields();
+			  InstanceFieldsInitialized = true;
+		  }
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public boolean incrementToken() throws java.io.IOException
+	  /// <summary>
+	  /// Produces the next output token: passes through words with no
+	  /// delimiters and protected words unchanged, optionally emits the
+	  /// preserved original, then replays generated subwords and
+	  /// concatenations (sorted by start offset) from the saved state.
+	  /// </summary>
+	  public override bool incrementToken()
+	  {
+		while (true)
+		{
+		  if (!hasSavedState)
+		  {
+			// process a new input word
+			if (!input.incrementToken())
+			{
+			  return false;
+			}
+
+			int termLength = termAttribute.length();
+			char[] termBuffer = termAttribute.buffer();
+
+			accumPosInc += posIncAttribute.PositionIncrement;
+
+			iterator.setText(termBuffer, termLength);
+			iterator.next();
+
+			// word of no delimiters, or protected word: just return it
+			if ((iterator.current == 0 && iterator.end == termLength) || (protWords != null && protWords.contains(termBuffer, 0, termLength)))
+			{
+			  posIncAttribute.PositionIncrement = accumPosInc;
+			  accumPosInc = 0;
+			  first = false;
+			  return true;
+			}
+
+			// word consisting solely of delimiters
+			if (iterator.end == WordDelimiterIterator.DONE && !has(PRESERVE_ORIGINAL))
+			{
+			  // if the posInc is 1, simply ignore it in the accumulation
+			  // TODO: proper hole adjustment (FilteringTokenFilter-like) instead of this previous logic!
+			  if (posIncAttribute.PositionIncrement == 1 && !first)
+			  {
+				accumPosInc--;
+			  }
+			  continue;
+			}
+
+			saveState();
+
+			hasOutputToken = false;
+			hasOutputFollowingOriginal = !has(PRESERVE_ORIGINAL);
+			lastConcatCount = 0;
+
+			if (has(PRESERVE_ORIGINAL))
+			{
+			  posIncAttribute.PositionIncrement = accumPosInc;
+			  accumPosInc = 0;
+			  first = false;
+			  return true;
+			}
+		  }
+
+		  // at the end of the string, output any concatenations
+		  if (iterator.end == WordDelimiterIterator.DONE)
+		  {
+			if (!concat.Empty)
+			{
+			  if (flushConcatenation(concat))
+			  {
+				buffer();
+				continue;
+			  }
+			}
+
+			if (!concatAll.Empty)
+			{
+			  // only if we haven't output this same combo above!
+			  if (concatAll.subwordCount > lastConcatCount)
+			  {
+				concatAll.writeAndClear();
+				buffer();
+				continue;
+			  }
+			  concatAll.clear();
+			}
+
+			// replay any buffered states, sorting them once on first access
+			if (bufferedPos < bufferedLen)
+			{
+			  if (bufferedPos == 0)
+			  {
+				sorter.sort(0, bufferedLen);
+			  }
+			  clearAttributes();
+			  restoreState(buffered[bufferedPos++]);
+			  if (first && posIncAttribute.PositionIncrement == 0)
+			  {
+				// can easily happen with strange combinations (e.g. not outputting numbers, but concat-all)
+				posIncAttribute.PositionIncrement = 1;
+			  }
+			  first = false;
+			  return true;
+			}
+
+			// no saved concatenations, on to the next input word
+			bufferedPos = bufferedLen = 0;
+			hasSavedState = false;
+			continue;
+		  }
+
+		  // word surrounded by delimiters: always output
+		  if (iterator.SingleWord)
+		  {
+			generatePart(true);
+			iterator.next();
+			first = false;
+			return true;
+		  }
+
+		  int wordType = iterator.type();
+
+		  // do we already have queued up incompatible concatenations?
+		  if (!concat.Empty && (concat.type & wordType) == 0)
+		  {
+			if (flushConcatenation(concat))
+			{
+			  hasOutputToken = false;
+			  buffer();
+			  continue;
+			}
+			hasOutputToken = false;
+		  }
+
+		  // add subwords depending upon options
+		  if (shouldConcatenate(wordType))
+		  {
+			if (concat.Empty)
+			{
+			  concat.type = wordType;
+			}
+			concatenate(concat);
+		  }
+
+		  // add all subwords (catenateAll)
+		  if (has(CATENATE_ALL))
+		  {
+			concatenate(concatAll);
+		  }
+
+		  // if we should output the word or number part
+		  if (shouldGenerateParts(wordType))
+		  {
+			generatePart(false);
+			buffer();
+		  }
+
+		  iterator.next();
+		}
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void reset() throws java.io.IOException
+	  /// <summary>
+	  /// Resets this filter and all of its accumulated state.
+	  /// </summary>
+	  public override void reset()
+	  {
+		base.reset();
+		hasSavedState = false;
+		concat.clear();
+		concatAll.clear();
+		accumPosInc = bufferedPos = bufferedLen = 0;
+		first = true;
+	  }
+
+	  // ================================================= Helper Methods ================================================
+
+
+	  // Captured token states (with parallel start-offset and position-increment
+	  // arrays) waiting to be replayed, plus the replay cursor.
+	  private AttributeSource.State[] buffered = new AttributeSource.State[8];
+	  private int[] startOff = new int[8];
+	  private int[] posInc = new int[8];
+	  private int bufferedLen = 0;
+	  private int bufferedPos = 0;
+	  // set by reset(); true while no token has been returned since the last reset
+	  private bool first;
+
+	  /// <summary>
+	  /// Sorts buffered states by ascending start offset, ties broken by
+	  /// descending position increment.
+	  /// </summary>
+	  private class OffsetSorter : InPlaceMergeSorter
+	  {
+		  private readonly WordDelimiterFilter outerInstance;
+
+		  public OffsetSorter(WordDelimiterFilter outerInstance)
+		  {
+			  this.outerInstance = outerInstance;
+		  }
+
+		protected internal override int compare(int i, int j)
+		{
+		  // NOTE(review): 'int.compare' is a Java leftover (Integer.compare);
+		  // C# needs e.g. startOff[i].CompareTo(startOff[j]) — fix when compiling.
+		  int cmp = int.compare(outerInstance.startOff[i], outerInstance.startOff[j]);
+		  if (cmp == 0)
+		  {
+			cmp = int.compare(outerInstance.posInc[j], outerInstance.posInc[i]);
+		  }
+		  return cmp;
+		}
+
+		protected internal override void swap(int i, int j)
+		{
+		  AttributeSource.State tmp = outerInstance.buffered[i];
+		  outerInstance.buffered[i] = outerInstance.buffered[j];
+		  outerInstance.buffered[j] = tmp;
+
+		  int tmp2 = outerInstance.startOff[i];
+		  outerInstance.startOff[i] = outerInstance.startOff[j];
+		  outerInstance.startOff[j] = tmp2;
+
+		  tmp2 = outerInstance.posInc[i];
+		  outerInstance.posInc[i] = outerInstance.posInc[j];
+		  outerInstance.posInc[j] = tmp2;
+		}
+	  }
+
+	  // initialized in InitializeInstanceFields
+	  internal OffsetSorter sorter;
+
+	  /// <summary>
+	  /// Captures the current token state into the pending-output buffer,
+	  /// growing the parallel arrays when full.
+	  /// </summary>
+	  private void buffer()
+	  {
+		if (bufferedLen == buffered.Length)
+		{
+		  // NOTE(review): Arrays.copyOf is a Java leftover; C# equivalent is
+		  // Array.Resize — fix when compiling.
+		  int newSize = ArrayUtil.oversize(bufferedLen + 1, 8);
+		  buffered = Arrays.copyOf(buffered, newSize);
+		  startOff = Arrays.copyOf(startOff, newSize);
+		  posInc = Arrays.copyOf(posInc, newSize);
+		}
+		startOff[bufferedLen] = offsetAttribute.startOffset();
+		posInc[bufferedLen] = posIncAttribute.PositionIncrement;
+		buffered[bufferedLen] = captureState();
+		bufferedLen++;
+	  }
+
+	  /// <summary>
+	  /// Saves the existing attribute states
+	  /// </summary>
+	  private void saveState()
+	  {
+		// otherwise, we have delimiters, save state
+		savedStartOffset = offsetAttribute.startOffset();
+		savedEndOffset = offsetAttribute.endOffset();
+		// if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+		hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.length());
+		savedType = typeAttribute.type();
+
+		if (savedBuffer.Length < termAttribute.length())
+		{
+		  savedBuffer = new char[ArrayUtil.oversize(termAttribute.length(), RamUsageEstimator.NUM_BYTES_CHAR)];
+		}
+
+		Array.Copy(termAttribute.buffer(), 0, savedBuffer, 0, termAttribute.length());
+		iterator.text = savedBuffer;
+
+		hasSavedState = true;
+	  }
+
+	  /// <summary>
+	  /// Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
+	  /// </summary>
+	  /// <param name="concatenation"> WordDelimiterConcatenation that will be flushed </param>
+	  /// <returns> {@code true} if the concatenation was written before it was cleared, {@code false} otherwise </returns>
+	  private bool flushConcatenation(WordDelimiterConcatenation concatenation)
+	  {
+		lastConcatCount = concatenation.subwordCount;
+		// a single-subword concatenation whose part will be generated anyway
+		// would be a duplicate, so it is only cleared
+		if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type))
+		{
+		  concatenation.writeAndClear();
+		  return true;
+		}
+		concatenation.clear();
+		return false;
+	  }
+
+	  /// <summary>
+	  /// Determines whether to concatenate a word or number if the current word is the given type
+	  /// </summary>
+	  /// <param name="wordType"> Type of the current word used to determine if it should be concatenated </param>
+	  /// <returns> {@code true} if concatenation should occur, {@code false} otherwise </returns>
+	  private bool shouldConcatenate(int wordType)
+	  {
+		return (has(CATENATE_WORDS) && isAlpha(wordType)) || (has(CATENATE_NUMBERS) && isDigit(wordType));
+	  }
+
+	  /// <summary>
+	  /// Determines whether a word/number part should be generated for a word of the given type
+	  /// </summary>
+	  /// <param name="wordType"> Type of the word used to determine if a word/number part should be generated </param>
+	  /// <returns> {@code true} if a word/number part should be generated, {@code false} otherwise </returns>
+	  private bool shouldGenerateParts(int wordType)
+	  {
+		return (has(GENERATE_WORD_PARTS) && isAlpha(wordType)) || (has(GENERATE_NUMBER_PARTS) && isDigit(wordType));
+	  }
+
+	  /// <summary>
+	  /// Concatenates the saved buffer to the given WordDelimiterConcatenation
+	  /// </summary>
+	  /// <param name="concatenation"> WordDelimiterConcatenation to concatenate the buffer to </param>
+	  private void concatenate(WordDelimiterConcatenation concatenation)
+	  {
+		if (concatenation.Empty)
+		{
+		  concatenation.startOffset = savedStartOffset + iterator.current;
+		}
+		concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
+		concatenation.endOffset = savedStartOffset + iterator.end;
+	  }
+
+	  /// <summary>
+	  /// Generates a word/number part, updating the appropriate attributes
+	  /// </summary>
+	  /// <param name="isSingleWord"> {@code true} if the generation is occurring from a single word, {@code false} otherwise </param>
+	  private void generatePart(bool isSingleWord)
+	  {
+		clearAttributes();
+		termAttribute.copyBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
+
+		int startOffset = savedStartOffset + iterator.current;
+		int endOffset = savedStartOffset + iterator.end;
+
+		if (hasIllegalOffsets)
+		{
+		  // historically this filter did this regardless for 'isSingleWord', 
+		  // but we must do a sanity check:
+		  if (isSingleWord && startOffset <= savedEndOffset)
+		  {
+			offsetAttribute.setOffset(startOffset, savedEndOffset);
+		  }
+		  else
+		  {
+			offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+		  }
+		}
+		else
+		{
+		  offsetAttribute.setOffset(startOffset, endOffset);
+		}
+		posIncAttribute.PositionIncrement = position(false);
+		typeAttribute.Type = savedType;
+	  }
+
+	  /// <summary>
+	  /// Get the position increment gap for a subword or concatenation
+	  /// </summary>
+	  /// <param name="inject"> true if this token wants to be injected </param>
+	  /// <returns> position increment gap </returns>
+	  private int position(bool inject)
+	  {
+		int posInc = accumPosInc;
+
+		if (hasOutputToken)
+		{
+		  accumPosInc = 0;
+		  return inject ? 0 : Math.Max(1, posInc);
+		}
+
+		hasOutputToken = true;
+
+		if (!hasOutputFollowingOriginal)
+		{
+		  // the first token following the original is 0 regardless
+		  hasOutputFollowingOriginal = true;
+		  return 0;
+		}
+		// clear the accumulated position increment
+		accumPosInc = 0;
+		return Math.Max(1, posInc);
+	  }
+
+	  /// <summary>
+	  /// Checks if the given word type includes <seealso cref="#ALPHA"/>
+	  /// </summary>
+	  /// <param name="type"> Word type to check </param>
+	  /// <returns> {@code true} if the type contains ALPHA, {@code false} otherwise </returns>
+	  internal static bool isAlpha(int type)
+	  {
+		return (type & ALPHA) != 0;
+	  }
+
+	  /// <summary>
+	  /// Checks if the given word type includes <seealso cref="#DIGIT"/>
+	  /// </summary>
+	  /// <param name="type"> Word type to check </param>
+	  /// <returns> {@code true} if the type contains DIGIT, {@code false} otherwise </returns>
+	  internal static bool isDigit(int type)
+	  {
+		return (type & DIGIT) != 0;
+	  }
+
+	  /// <summary>
+	  /// Checks if the given word type includes <seealso cref="#SUBWORD_DELIM"/>
+	  /// </summary>
+	  /// <param name="type"> Word type to check </param>
+	  /// <returns> {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise </returns>
+	  internal static bool isSubwordDelim(int type)
+	  {
+		return (type & SUBWORD_DELIM) != 0;
+	  }
+
+	  /// <summary>
+	  /// Checks if the given word type includes <seealso cref="#UPPER"/>
+	  /// </summary>
+	  /// <param name="type"> Word type to check </param>
+	  /// <returns> {@code true} if the type contains UPPER, {@code false} otherwise </returns>
+	  internal static bool isUpper(int type)
+	  {
+		return (type & UPPER) != 0;
+	  }
+
+	  /// <summary>
+	  /// Determines whether the given flag is set
+	  /// </summary>
+	  /// <param name="flag"> Flag to see if set </param>
+	  /// <returns> {@code true} if flag is set </returns>
+	  private bool has(int flag)
+	  {
+		return (flags & flag) != 0;
+	  }
+
+	  // ================================================= Inner Classes =================================================
+
+	  /// <summary>
+	  /// A WDF concatenated 'run'
+	  /// </summary>
+	  internal sealed class WordDelimiterConcatenation
+	  {
+		  private readonly WordDelimiterFilter outerInstance;
+
+		  public WordDelimiterConcatenation(WordDelimiterFilter outerInstance)
+		  {
+			  this.outerInstance = outerInstance;
+		  }
+
+		internal readonly StringBuilder buffer = new StringBuilder();
+		internal int startOffset;
+		internal int endOffset;
+		internal int type;
+		internal int subwordCount;
+
+		/// <summary>
+		/// Appends the given text of the given length, to the concatenation at the given offset
+		/// </summary>
+		/// <param name="text"> Text to append </param>
+		/// <param name="offset"> Offset in the concatenation to add the text </param>
+		/// <param name="length"> Length of the text to append </param>
+		internal void append(char[] text, int offset, int length)
+		{
+		  buffer.Append(text, offset, length);
+		  subwordCount++;
+		}
+
+		/// <summary>
+		/// Writes the concatenation to the attributes
+		/// </summary>
+		internal void write()
+		{
+		  // NOTE(review): in the Java inner class this resolved to the outer
+		  // instance; in C# it must be outerInstance.clearAttributes() — fix
+		  // when compiling.
+		  clearAttributes();
+		  if (outerInstance.termAttribute.length() < buffer.Length)
+		  {
+			outerInstance.termAttribute.resizeBuffer(buffer.Length);
+		  }
+		  char[] termbuffer = outerInstance.termAttribute.buffer();
+
+		  // NOTE(review): StringBuilder.getChars is a Java leftover; C# needs
+		  // CopyTo(0, termbuffer, 0, buffer.Length) — fix when compiling.
+		  buffer.getChars(0, buffer.Length, termbuffer, 0);
+		  outerInstance.termAttribute.Length = buffer.Length;
+
+		  if (outerInstance.hasIllegalOffsets)
+		  {
+			outerInstance.offsetAttribute.setOffset(outerInstance.savedStartOffset, outerInstance.savedEndOffset);
+		  }
+		  else
+		  {
+			outerInstance.offsetAttribute.setOffset(startOffset, endOffset);
+		  }
+		  outerInstance.posIncAttribute.PositionIncrement = outerInstance.position(true);
+		  outerInstance.typeAttribute.Type = outerInstance.savedType;
+		  outerInstance.accumPosInc = 0;
+		}
+
+		/// <summary>
+		/// Determines if the concatenation is empty
+		/// </summary>
+		/// <returns> {@code true} if the concatenation is empty, {@code false} otherwise </returns>
+		internal bool Empty
+		{
+			get
+			{
+			  return buffer.Length == 0;
+			}
+		}
+
+		/// <summary>
+		/// Clears the concatenation and resets its state
+		/// </summary>
+		internal void clear()
+		{
+		  buffer.Length = 0;
+		  startOffset = endOffset = type = subwordCount = 0;
+		}
+
+		/// <summary>
+		/// Convenience method for the common scenario of having to write the concatenation and then clearing its state
+		/// </summary>
+		internal void writeAndClear()
+		{
+		  write();
+		  clear();
+		}
+	  }
+	  // questions:
+	  // negative numbers?  -42 indexed as just 42?
+	  // dollar sign?  $42
+	  // percent sign?  33%
+	  // downsides:  if source text is "powershot" then a query of "PowerShot" won't match!
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilterFactory.cs
new file mode 100644
index 0000000..747ed48
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterFilterFactory.cs
@@ -0,0 +1,270 @@
+using System;
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using CharArraySet = org.apache.lucene.analysis.util.CharArraySet;
+	using ResourceLoader = org.apache.lucene.analysis.util.ResourceLoader;
+	using ResourceLoaderAware = org.apache.lucene.analysis.util.ResourceLoaderAware;
+	using TokenFilterFactory = TokenFilterFactory;
+	using Version = org.apache.lucene.util.Version;
+
+
+	using org.apache.lucene.analysis.miscellaneous;
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+//	import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
+
+	/// <summary>
+	/// Factory for <seealso cref="WordDelimiterFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.WordDelimiterFilterFactory" protected="protectedword.txt"
+	///             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
+	///             catenateWords="0" catenateNumbers="0" catenateAll="0"
+	///             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"
+	///             types="wdfftypes.txt" /&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class WordDelimiterFilterFactory : TokenFilterFactory, ResourceLoaderAware
+	{
+	  public const string PROTECTED_TOKENS = "protected";
+	  public const string TYPES = "types";
+
+	  // raw file-name arguments, resolved later in inform()
+	  private readonly string wordFiles;
+	  private readonly string types;
+	  private readonly int flags;
+	  internal sbyte[] typeTable = null;
+	  private CharArraySet protectedWords = null;
+
+	  /// <summary>
+	  /// Creates a new WordDelimiterFilterFactory </summary>
+	  public WordDelimiterFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		assureMatchVersion();
+		// translate the numeric on/off arguments into WordDelimiterFilter flag bits
+		int flags = 0;
+		if (getInt(args, "generateWordParts", 1) != 0)
+		{
+		  flags |= WordDelimiterFilter.GENERATE_WORD_PARTS;
+		}
+		if (getInt(args, "generateNumberParts", 1) != 0)
+		{
+		  flags |= WordDelimiterFilter.GENERATE_NUMBER_PARTS;
+		}
+		if (getInt(args, "catenateWords", 0) != 0)
+		{
+		  flags |= WordDelimiterFilter.CATENATE_WORDS;
+		}
+		if (getInt(args, "catenateNumbers", 0) != 0)
+		{
+		  flags |= WordDelimiterFilter.CATENATE_NUMBERS;
+		}
+		if (getInt(args, "catenateAll", 0) != 0)
+		{
+		  flags |= WordDelimiterFilter.CATENATE_ALL;
+		}
+		if (getInt(args, "splitOnCaseChange", 1) != 0)
+		{
+		  flags |= WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
+		}
+		if (getInt(args, "splitOnNumerics", 1) != 0)
+		{
+		  flags |= WordDelimiterFilter.SPLIT_ON_NUMERICS;
+		}
+		if (getInt(args, "preserveOriginal", 0) != 0)
+		{
+		  flags |= WordDelimiterFilter.PRESERVE_ORIGINAL;
+		}
+		if (getInt(args, "stemEnglishPossessive", 1) != 0)
+		{
+		  flags |= WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
+		}
+		wordFiles = get(args, PROTECTED_TOKENS);
+		types = get(args, TYPES);
+		this.flags = flags;
+		// anything still present in args was not recognized — reject it
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+//ORIGINAL LINE: @Override public void inform(org.apache.lucene.analysis.util.ResourceLoader loader) throws java.io.IOException
+	  /// <summary>
+	  /// Loads the protected-word set and the optional custom character type
+	  /// table from the configured resource files.
+	  /// </summary>
+	  public virtual void inform(ResourceLoader loader)
+	  {
+		if (wordFiles != null)
+		{
+		  protectedWords = getWordSet(loader, wordFiles, false);
+		}
+		if (types != null)
+		{
+		  // collect the rule lines from every listed file, then parse them
+		  IList<string> files = splitFileNames(types);
+		  IList<string> wlist = new List<string>();
+		  foreach (string file in files)
+		  {
+			IList<string> lines = getLines(loader, file.Trim());
+			wlist.AddRange(lines);
+		  }
+		  typeTable = parseTypes(wlist);
+		}
+	  }
+
+	  /// <summary>
+	  /// Creates the filter; falls back to Lucene47WordDelimiterFilter for
+	  /// pre-4.8 match versions.
+	  /// </summary>
+	  public override TokenFilter create(TokenStream input)
+	  {
+		if (luceneMatchVersion.onOrAfter(Version.LUCENE_48))
+		{
+		  return new WordDelimiterFilter(luceneMatchVersion, input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, flags, protectedWords);
+		}
+		else
+		{
+		  return new Lucene47WordDelimiterFilter(input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable, flags, protectedWords);
+		}
+	  }
+
+	  // source => type
+	  // NOTE(review): Pattern/Matcher are java.util.regex leftovers; this needs
+	  // System.Text.RegularExpressions.Regex — fix when compiling.
+	  private static Pattern typePattern = Pattern.compile("(.*)\\s*=>\\s*(.*)\\s*$");
+
+	  // parses a list of MappingCharFilter style rules into a custom byte[] type table
+	  private sbyte[] parseTypes(IList<string> rules)
+	  {
+		// NOTE(review): SortedMap/put/entrySet/lastKey are Java collection
+		// leftovers; SortedDictionary uses indexer/KeyValuePair — fix when compiling.
+		SortedMap<char?, sbyte?> typeMap = new SortedDictionary<char?, sbyte?>();
+		foreach (string rule in rules)
+		{
+		  Matcher m = typePattern.matcher(rule);
+		  if (!m.find())
+		  {
+			throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]");
+		  }
+		  string lhs = parseString(m.group(1).Trim());
+		  sbyte? rhs = parseType(m.group(2).Trim());
+		  if (lhs.Length != 1)
+		  {
+			throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]. Only a single character is allowed.");
+		  }
+		  if (rhs == null)
+		  {
+			throw new System.ArgumentException("Invalid Mapping Rule : [" + rule + "]. Illegal type.");
+		  }
+		  typeMap.put(lhs[0], rhs);
+		}
+
+		// ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
+		sbyte[] types = new sbyte[Math.Max(typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.Length)];
+		for (int i = 0; i < types.Length; i++)
+		{
+		  types[i] = WordDelimiterIterator.getType(i);
+		}
+		foreach (KeyValuePair<char?, sbyte?> mapping in typeMap.entrySet())
+		{
+		  types[mapping.Key] = mapping.Value;
+		}
+		return types;
+	  }
+
+	  /// <summary>
+	  /// Maps a type name from a rules file to its WordDelimiterFilter
+	  /// character-type constant, or null if the name is unknown.
+	  /// </summary>
+	  private sbyte? parseType(string s)
+	  {
+		if (s.Equals("LOWER"))
+		{
+		  return WordDelimiterFilter.LOWER;
+		}
+		else if (s.Equals("UPPER"))
+		{
+		  return WordDelimiterFilter.UPPER;
+		}
+		else if (s.Equals("ALPHA"))
+		{
+		  return WordDelimiterFilter.ALPHA;
+		}
+		else if (s.Equals("DIGIT"))
+		{
+		  return WordDelimiterFilter.DIGIT;
+		}
+		else if (s.Equals("ALPHANUM"))
+		{
+		  return WordDelimiterFilter.ALPHANUM;
+		}
+		else if (s.Equals("SUBWORD_DELIM"))
+		{
+		  return WordDelimiterFilter.SUBWORD_DELIM;
+		}
+		else
+		{
+		  return null;
+		}
+	  }
+
+	  // scratch buffer for parseString
+	  // NOTE(review): fixed at 256 chars and writePos is unchecked below, so an
+	  // unescaped rule longer than 256 chars overflows — confirm expected rule sizes.
+	  internal char[] @out = new char[256];
+
+	  /// <summary>
+	  /// Unescapes a rules-file string, handling \\, \n, \t, \r, \b, \f and
+	  /// \\uXXXX sequences; throws on a malformed escape.
+	  /// </summary>
+	  private string parseString(string s)
+	  {
+		int readPos = 0;
+		int len = s.Length;
+		int writePos = 0;
+		while (readPos < len)
+		{
+		  char c = s[readPos++];
+		  if (c == '\\')
+		  {
+			if (readPos >= len)
+			{
+			  throw new System.ArgumentException("Invalid escaped char in [" + s + "]");
+			}
+			c = s[readPos++];
+			switch (c)
+			{
+			  case '\\' :
+				  c = '\\';
+				  break;
+			  case 'n' :
+				  c = '\n';
+				  break;
+			  case 't' :
+				  c = '\t';
+				  break;
+			  case 'r' :
+				  c = '\r';
+				  break;
+			  case 'b' :
+				  c = '\b';
+				  break;
+			  case 'f' :
+				  c = '\f';
+				  break;
+			  case 'u' :
+				if (readPos + 3 >= len)
+				{
+				  throw new System.ArgumentException("Invalid escaped char in [" + s + "]");
+				}
+				// NOTE(review): int.Parse has no radix parameter in C#; use
+				// Convert.ToInt32(s.Substring(readPos, 4), 16) — fix when compiling.
+				c = (char)int.Parse(s.Substring(readPos, 4), 16);
+				readPos += 4;
+				break;
+			}
+		  }
+		  @out[writePos++] = c;
+		}
+		return new string(@out, 0, writePos);
+	  }
+	}
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
new file mode 100644
index 0000000..154176b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
@@ -0,0 +1,367 @@
+namespace org.apache.lucene.analysis.miscellaneous
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using org.apache.lucene.analysis.miscellaneous;
+//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to C#:
+//	import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;
+
+	/// <summary>
+	/// A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
+	/// @lucene.internal
+	/// </summary>
+	public sealed class WordDelimiterIterator
+	{
+
+	  /// <summary>
+	  /// Indicates the end of iteration </summary>
+	  public const int DONE = -1;
+
+	  /// <summary>
+	  /// Default character-type table covering the 256 Latin-1 code points;
+	  /// built once in the static constructor. </summary>
+	  public static readonly sbyte[] DEFAULT_WORD_DELIM_TABLE;
+
+	  internal char[] text;
+	  internal int length;
+
+	  /// <summary>
+	  /// start position of text, excluding leading delimiters </summary>
+	  internal int startBounds;
+	  /// <summary>
+	  /// end position of text, excluding trailing delimiters </summary>
+	  internal int endBounds;
+
+	  /// <summary>
+	  /// Beginning of subword </summary>
+	  internal int current;
+	  /// <summary>
+	  /// End of subword </summary>
+	  internal int end;
+
+	  /* does this string end with a possessive such as 's */
+	  private bool hasFinalPossessive = false;
+
+	  /// <summary>
+	  /// If false, causes case changes to be ignored (subwords will only be generated
+	  /// given SUBWORD_DELIM tokens). (Defaults to true)
+	  /// </summary>
+	  internal readonly bool splitOnCaseChange;
+
+	  /// <summary>
+	  /// If false, causes numeric changes to be ignored (subwords will only be generated
+	  /// given SUBWORD_DELIM tokens). (Defaults to true)
+	  /// </summary>
+	  internal readonly bool splitOnNumerics;
+
+	  /// <summary>
+	  /// If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
+	  /// <p/>
+	  /// "O'Neil's" => "O", "Neil"
+	  /// </summary>
+	  internal readonly bool stemEnglishPossessive;
+
+	  /// <summary>
+	  /// Table mapping a character to its WordDelimiterFilter type code;
+	  /// characters beyond its length fall back to <seealso cref="#getType(int)"/>. </summary>
+	  private readonly sbyte[] charTypeTable;
+
+	  /// <summary>
+	  /// if true, need to skip over a possessive found in the last call to next() </summary>
+	  private bool skipPossessive = false;
+
+	  // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
+	  // done if separated by these chars?) "," would be an obvious candidate...
+	  static WordDelimiterIterator()
+	  {
+		sbyte[] tab = new sbyte[256];
+		for (int i = 0; i < 256; i++)
+		{
+		  sbyte code = 0;
+		  // char.IsLower/IsUpper/IsDigit take a char, not an int: cast the code point.
+		  if (char.IsLower((char)i))
+		  {
+			code |= (sbyte)WordDelimiterFilter.LOWER;
+		  }
+		  else if (char.IsUpper((char)i))
+		  {
+			code |= (sbyte)WordDelimiterFilter.UPPER;
+		  }
+		  else if (char.IsDigit((char)i))
+		  {
+			code |= (sbyte)WordDelimiterFilter.DIGIT;
+		  }
+		  if (code == 0)
+		  {
+			code = (sbyte)WordDelimiterFilter.SUBWORD_DELIM;
+		  }
+		  tab[i] = code;
+		}
+		DEFAULT_WORD_DELIM_TABLE = tab;
+	  }
+
+	  /// <summary>
+	  /// Create a new WordDelimiterIterator operating with the supplied rules.
+	  /// </summary>
+	  /// <param name="charTypeTable"> table containing character types </param>
+	  /// <param name="splitOnCaseChange"> if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless) </param>
+	  /// <param name="splitOnNumerics"> if true, causes "j2se" to be three tokens; "j" "2" "se" </param>
+	  /// <param name="stemEnglishPossessive"> if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil" </param>
+	  internal WordDelimiterIterator(sbyte[] charTypeTable, bool splitOnCaseChange, bool splitOnNumerics, bool stemEnglishPossessive)
+	  {
+		this.charTypeTable = charTypeTable;
+		this.splitOnCaseChange = splitOnCaseChange;
+		this.splitOnNumerics = splitOnNumerics;
+		this.stemEnglishPossessive = stemEnglishPossessive;
+	  }
+
+	  /// <summary>
+	  /// Advance to the next subword in the string.
+	  /// </summary>
+	  /// <returns> index of the next subword, or <seealso cref="#DONE"/> if all subwords have been returned </returns>
+	  internal int next()
+	  {
+		current = end;
+		if (current == DONE)
+		{
+		  return DONE;
+		}
+
+		if (skipPossessive)
+		{
+		  // jump over the "'s" detected by the previous call
+		  current += 2;
+		  skipPossessive = false;
+		}
+
+		int lastType = 0;
+
+		// skip delimiters preceding the next subword
+		while (current < endBounds && (WordDelimiterFilter.isSubwordDelim(lastType = charType(text[current]))))
+		{
+		  current++;
+		}
+
+		if (current >= endBounds)
+		{
+		  return end = DONE;
+		}
+
+		// extend the subword until a type transition signals a break
+		for (end = current + 1; end < endBounds; end++)
+		{
+		  int type = charType(text[end]);
+		  if (isBreak(lastType, type))
+		  {
+			break;
+		  }
+		  lastType = type;
+		}
+
+		// note a trailing possessive so the next call can skip it
+		if (end < endBounds - 1 && endsWithPossessive(end + 2))
+		{
+		  skipPossessive = true;
+		}
+
+		return end;
+	  }
+
+
+	  /// <summary>
+	  /// Return the type of the current subword.
+	  /// This currently uses the type of the first character in the subword.
+	  /// </summary>
+	  /// <returns> type of the current word </returns>
+	  internal int type()
+	  {
+		if (end == DONE)
+		{
+		  return 0;
+		}
+
+		int firstType = charType(text[current]);
+		switch (firstType)
+		{
+		  // return ALPHA word type for both lower and upper
+		  case WordDelimiterFilter.LOWER:
+		  case WordDelimiterFilter.UPPER:
+			return WordDelimiterFilter.ALPHA;
+		  default:
+			return firstType;
+		}
+	  }
+
+	  /// <summary>
+	  /// Reset the text to a new value, and reset all state
+	  /// </summary>
+	  /// <param name="text"> New text </param>
+	  /// <param name="length"> length of the text </param>
+	  internal void setText(char[] text, int length)
+	  {
+		this.text = text;
+		this.length = this.endBounds = length;
+		current = startBounds = end = 0;
+		skipPossessive = hasFinalPossessive = false;
+		setBounds();
+	  }
+
+	  // ================================================= Helper Methods ================================================
+
+	  /// <summary>
+	  /// Determines whether the transition from lastType to type indicates a break
+	  /// </summary>
+	  /// <param name="lastType"> Last subword type </param>
+	  /// <param name="type"> Current subword type </param>
+	  /// <returns> {@code true} if the transition indicates a break, {@code false} otherwise </returns>
+	  private bool isBreak(int lastType, int type)
+	  {
+		// type codes are bit flags: overlapping flags (e.g. UPPER within ALPHA) never break
+		if ((type & lastType) != 0)
+		{
+		  return false;
+		}
+
+		if (!splitOnCaseChange && WordDelimiterFilter.isAlpha(lastType) && WordDelimiterFilter.isAlpha(type))
+		{
+		  // ALPHA->ALPHA: always ignore if case isn't considered.
+		  return false;
+		}
+		else if (WordDelimiterFilter.isUpper(lastType) && WordDelimiterFilter.isAlpha(type))
+		{
+		  // UPPER->letter: Don't split
+		  return false;
+		}
+		else if (!splitOnNumerics && ((WordDelimiterFilter.isAlpha(lastType) && WordDelimiterFilter.isDigit(type)) || (WordDelimiterFilter.isDigit(lastType) && WordDelimiterFilter.isAlpha(type))))
+		{
+		  // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
+		  return false;
+		}
+
+		return true;
+	  }
+
+	  /// <summary>
+	  /// Determines if the current word contains only one subword.  Note, it could be potentially surrounded by delimiters
+	  /// </summary>
+	  /// <returns> {@code true} if the current word contains only one subword, {@code false} otherwise </returns>
+	  internal bool SingleWord
+	  {
+		  get
+		  {
+			if (hasFinalPossessive)
+			{
+			  // exclude the trailing "'s" (2 chars) from the comparison
+			  return current == startBounds && end == endBounds - 2;
+			}
+			else
+			{
+			  return current == startBounds && end == endBounds;
+			}
+		  }
+	  }
+
+	  /// <summary>
+	  /// Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
+	  /// it yet, simply note it.
+	  /// </summary>
+	  private void setBounds()
+	  {
+		while (startBounds < length && (WordDelimiterFilter.isSubwordDelim(charType(text[startBounds]))))
+		{
+		  startBounds++;
+		}
+
+		while (endBounds > startBounds && (WordDelimiterFilter.isSubwordDelim(charType(text[endBounds - 1]))))
+		{
+		  endBounds--;
+		}
+		if (endsWithPossessive(endBounds))
+		{
+		  hasFinalPossessive = true;
+		}
+		current = startBounds;
+	  }
+
+	  /// <summary>
+	  /// Determines if the text at the given position indicates an English possessive which should be removed
+	  /// </summary>
+	  /// <param name="pos"> Position in the text to check if it indicates an English possessive </param>
+	  /// <returns> {@code true} if the text at the position indicates an English posessive, {@code false} otherwise </returns>
+	  private bool endsWithPossessive(int pos)
+	  {
+		return (stemEnglishPossessive && pos > 2 && text[pos - 2] == '\'' && (text[pos - 1] == 's' || text[pos - 1] == 'S') && WordDelimiterFilter.isAlpha(charType(text[pos - 3])) && (pos == endBounds || WordDelimiterFilter.isSubwordDelim(charType(text[pos]))));
+	  }
+
+	  /// <summary>
+	  /// Determines the type of the given character
+	  /// </summary>
+	  /// <param name="ch"> Character whose type is to be determined </param>
+	  /// <returns> Type of the character </returns>
+	  private int charType(int ch)
+	  {
+		if (ch < charTypeTable.Length)
+		{
+		  return charTypeTable[ch];
+		}
+		return getType(ch);
+	  }
+
+	  /// <summary>
+	  /// Computes the type of the given character
+	  /// </summary>
+	  /// <param name="ch"> Character whose type is to be determined </param>
+	  /// <returns> Type of the character </returns>
+	  public static sbyte getType(int ch)
+	  {
+		// Java's Character.getType maps to CharUnicodeInfo.GetUnicodeCategory in .NET.
+		// Callers pass single UTF-16 code units (see charType), so the char cast is safe here.
+		switch (System.Globalization.CharUnicodeInfo.GetUnicodeCategory((char)ch))
+		{
+		  case System.Globalization.UnicodeCategory.UppercaseLetter:
+			  return (sbyte)WordDelimiterFilter.UPPER;
+		  case System.Globalization.UnicodeCategory.LowercaseLetter:
+			  return (sbyte)WordDelimiterFilter.LOWER;
+
+		  case System.Globalization.UnicodeCategory.TitlecaseLetter:
+		  case System.Globalization.UnicodeCategory.ModifierLetter:
+		  case System.Globalization.UnicodeCategory.OtherLetter:
+		  case System.Globalization.UnicodeCategory.NonSpacingMark:
+		  case System.Globalization.UnicodeCategory.EnclosingMark: // depends what it encloses?
+		  case System.Globalization.UnicodeCategory.SpacingCombiningMark:
+			return (sbyte)WordDelimiterFilter.ALPHA;
+
+		  case System.Globalization.UnicodeCategory.DecimalDigitNumber:
+		  case System.Globalization.UnicodeCategory.LetterNumber:
+		  case System.Globalization.UnicodeCategory.OtherNumber:
+			return (sbyte)WordDelimiterFilter.DIGIT;
+
+		  // SpaceSeparator, LineSeparator, ParagraphSeparator, Control, Format,
+		  // PrivateUse and all punctuation/symbol categories fall through to SUBWORD_DELIM.
+
+		  case System.Globalization.UnicodeCategory.Surrogate: // prevent splitting
+			return (sbyte)(WordDelimiterFilter.ALPHA | WordDelimiterFilter.DIGIT);
+
+		  default:
+			  return (sbyte)WordDelimiterFilter.SUBWORD_DELIM;
+		}
+	  }
+	}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/99717176/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
new file mode 100644
index 0000000..5b3d94b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramFilterFactory.cs
@@ -0,0 +1,61 @@
+using System.Collections.Generic;
+using TokenFilterFactory = Lucene.Net.Analysis.Util.TokenFilterFactory;
+
+namespace org.apache.lucene.analysis.ngram
+{
+
+	/*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+	using TokenFilterFactory = TokenFilterFactory;
+
+	/// <summary>
+	/// Creates new instances of <seealso cref="EdgeNGramTokenFilter"/>.
+	/// <pre class="prettyprint">
+	/// &lt;fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100"&gt;
+	///   &lt;analyzer&gt;
+	///     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+	///     &lt;filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="1"/&gt;
+	///   &lt;/analyzer&gt;
+	/// &lt;/fieldType&gt;</pre>
+	/// </summary>
+	public class EdgeNGramFilterFactory : TokenFilterFactory
+	{
+	  // Maximum gram size passed through to the EdgeNGramTokenFilter.
+	  private readonly int maxGramSize;
+	  // Minimum gram size passed through to the EdgeNGramTokenFilter.
+	  private readonly int minGramSize;
+	  // Which end of the token grams are taken from; defaults to the label of Side.FRONT.
+	  private readonly string side;
+
+	  /// <summary>
+	  /// Creates a new EdgeNGramFilterFactory </summary>
+	  /// <param name="args"> configuration parameters for this factory; the final
+	  /// Count check implies getInt/get remove the entries they consume, leaving
+	  /// only unrecognized keys — TODO confirm against the base factory's contract </param>
+	  /// <exception cref="System.ArgumentException"> if unknown parameters remain in args </exception>
+	  public EdgeNGramFilterFactory(IDictionary<string, string> args) : base(args)
+	  {
+		minGramSize = getInt(args, "minGramSize", EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE);
+		maxGramSize = getInt(args, "maxGramSize", EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE);
+		side = get(args, "side", EdgeNGramTokenFilter.Side.FRONT.Label);
+		if (args.Count > 0)
+		{
+		  throw new System.ArgumentException("Unknown parameters: " + args);
+		}
+	  }
+
+	  /// <summary>
+	  /// Wraps <paramref name="input"/> in an <seealso cref="EdgeNGramTokenFilter"/>
+	  /// configured with this factory's side and gram-size settings.
+	  /// NOTE(review): the return type is narrower than the base method presumably
+	  /// declares; covariant return overrides are not legal before C# 9.0 — verify
+	  /// the TokenFilterFactory.create signature.
+	  /// </summary>
+	  public override EdgeNGramTokenFilter create(TokenStream input)
+	  {
+		return new EdgeNGramTokenFilter(luceneMatchVersion, input, side, minGramSize, maxGramSize);
+	  }
+	}
+
+}
\ No newline at end of file


Mime
View raw message