lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mhern...@apache.org
Subject [41/50] [abbrv] git commit: Implement Standard and Classic Analyzers
Date Tue, 24 Sep 2013 18:33:17 GMT
Implement Standard and Classic Analyzers


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/7a4b442f
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/7a4b442f
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/7a4b442f

Branch: refs/heads/branch_4x
Commit: 7a4b442f13ad71b5094c6058e1746c04d97ac34c
Parents: 98e877d
Author: Paul Irwin <paulirwin@gmail.com>
Authored: Thu Aug 8 14:58:40 2013 -0400
Committer: Paul Irwin <paulirwin@gmail.com>
Committed: Thu Aug 8 14:58:40 2013 -0400

----------------------------------------------------------------------
 src/contrib/Analyzers/Contrib.Analyzers.csproj  |   15 +
 .../Analyzers/Standard/ClassicAnalyzer.cs       |   70 +
 src/contrib/Analyzers/Standard/ClassicFilter.cs |   59 +
 .../Analyzers/Standard/ClassicFilterFactory.cs  |   25 +
 .../Analyzers/Standard/ClassicTokenizer.cs      |  131 ++
 .../Standard/ClassicTokenizerFactory.cs         |   31 +
 .../Analyzers/Standard/ClassicTokenizerImpl.cs  |  657 ++++++++++
 .../Standard/IStandardTokenizerInterface.cs     |   27 +
 .../Analyzers/Standard/StandardAnalyzer.cs      |   70 +
 .../Analyzers/Standard/StandardFilter.cs        |   73 ++
 .../Analyzers/Standard/StandardFilterFactory.cs |   26 +
 .../Analyzers/Standard/StandardTokenizer.cs     |  167 +++
 .../Standard/StandardTokenizerFactory.cs        |   31 +
 .../Analyzers/Standard/StandardTokenizerImpl.cs | 1241 ++++++++++++++++++
 .../Standard/Std31/StandardTokenizerImpl31.cs   | 1116 ++++++++++++++++
 .../Standard/Std34/StandardTokenizerImpl34.cs   | 1134 ++++++++++++++++
 16 files changed, 4873 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Contrib.Analyzers.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj
index 74b0f63..e13f118 100644
--- a/src/contrib/Analyzers/Contrib.Analyzers.csproj
+++ b/src/contrib/Analyzers/Contrib.Analyzers.csproj
@@ -122,6 +122,21 @@
     <Compile Include="Core\WhitespaceTokenizer.cs" />
     <Compile Include="Core\WhitespaceTokenizerFactory.cs" />
     <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="Standard\ClassicAnalyzer.cs" />
+    <Compile Include="Standard\ClassicFilter.cs" />
+    <Compile Include="Standard\ClassicFilterFactory.cs" />
+    <Compile Include="Standard\ClassicTokenizer.cs" />
+    <Compile Include="Standard\ClassicTokenizerFactory.cs" />
+    <Compile Include="Standard\ClassicTokenizerImpl.cs" />
+    <Compile Include="Standard\IStandardTokenizerInterface.cs" />
+    <Compile Include="Standard\StandardAnalyzer.cs" />
+    <Compile Include="Standard\StandardFilter.cs" />
+    <Compile Include="Standard\StandardFilterFactory.cs" />
+    <Compile Include="Standard\StandardTokenizer.cs" />
+    <Compile Include="Standard\StandardTokenizerFactory.cs" />
+    <Compile Include="Standard\StandardTokenizerImpl.cs" />
+    <Compile Include="Standard\Std31\StandardTokenizerImpl31.cs" />
+    <Compile Include="Standard\Std34\StandardTokenizerImpl34.cs" />
     <Compile Include="Support\AbstractSet.cs" />
     <Compile Include="Support\StringExtensions.cs" />
     <Compile Include="Util\AbstractAnalysisFactory.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs b/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs
new file mode 100644
index 0000000..193f111
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicAnalyzer.cs
@@ -0,0 +1,70 @@
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Analyzer built from a <see cref="ClassicTokenizer"/> followed by
    /// <see cref="ClassicFilter"/>, lower-casing, and stop-word removal.
    /// </summary>
    public sealed class ClassicAnalyzer : StopwordAnalyzerBase
    {
        /// <summary>Default cap on emitted token length, in characters.</summary>
        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

        /// <summary>Stop words used when the caller supplies none.</summary>
        public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        /// <summary>Builds an analyzer with an explicit stop-word set.</summary>
        public ClassicAnalyzer(Version? matchVersion, CharArraySet stopWords)
            : base(matchVersion, stopWords)
        {
        }

        /// <summary>Builds an analyzer with <see cref="STOP_WORDS_SET"/>.</summary>
        public ClassicAnalyzer(Version? matchVersion)
            : this(matchVersion, STOP_WORDS_SET)
        {
        }

        /// <summary>Builds an analyzer whose stop words are read from <paramref name="stopwords"/>.</summary>
        public ClassicAnalyzer(Version? matchVersion, TextReader stopwords)
            : this(matchVersion, LoadStopwordSet(stopwords, matchVersion))
        {
        }

        /// <summary>
        /// Longest token that will be emitted; longer tokens are discarded by
        /// the tokenizer. Applied to reused streams via SetReader.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
        {
            ClassicTokenizer tokenizer = new ClassicTokenizer(matchVersion, reader);
            tokenizer.MaxTokenLength = maxTokenLength;

            // Chain: classic normalization -> lower-casing -> stop-word removal.
            TokenStream chain = new ClassicFilter(tokenizer);
            chain = new LowerCaseFilter(matchVersion, chain);
            chain = new StopFilter(matchVersion, chain, stopwords);

            return new ClassicTokenStreamComponents(this, tokenizer, chain);
        }

        /// <summary>
        /// Components wrapper that re-applies the analyzer's current
        /// <see cref="MaxTokenLength"/> each time the stream is reused.
        /// </summary>
        private sealed class ClassicTokenStreamComponents : TokenStreamComponents
        {
            private readonly ClassicAnalyzer owner;
            private readonly ClassicTokenizer tokenizer;

            public ClassicTokenStreamComponents(ClassicAnalyzer owner, ClassicTokenizer tokenizer, TokenStream sink)
                : base(tokenizer, sink)
            {
                this.owner = owner;
                this.tokenizer = tokenizer;
            }

            public override void SetReader(TextReader reader)
            {
                // Pick up any change made through MaxTokenLength since creation.
                tokenizer.MaxTokenLength = owner.maxTokenLength;
                base.SetReader(reader);
            }
        }
    }
}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicFilter.cs b/src/contrib/Analyzers/Standard/ClassicFilter.cs
new file mode 100644
index 0000000..eac2d3e
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicFilter.cs
@@ -0,0 +1,59 @@
using Lucene.Net.Analysis.Tokenattributes;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Normalizes tokens produced by <see cref="ClassicTokenizer"/>: strips a
    /// trailing "'s" (or "'S") from apostrophe tokens and removes the dots from
    /// acronym tokens, editing the term buffer in place.
    /// </summary>
    public class ClassicFilter : TokenFilter
    {
        private static readonly String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
        private static readonly String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];

        // This filter dispatches on the token's type attribute.
        private readonly ITypeAttribute typeAtt;
        private readonly ICharTermAttribute termAtt;

        public ClassicFilter(TokenStream input)
            : base(input)
        {
            typeAtt = AddAttribute<ITypeAttribute>();
            termAtt = AddAttribute<ICharTermAttribute>();
        }

        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
                return false;

            char[] term = termAtt.Buffer;
            int length = termAtt.Length;
            String type = typeAtt.Type;

            bool possessive =
                type == APOSTROPHE_TYPE &&
                length >= 2 &&
                term[length - 2] == '\'' &&
                (term[length - 1] == 's' || term[length - 1] == 'S');

            if (possessive)
            {
                // Drop the trailing apostrophe + s.
                termAtt.SetLength(length - 2);
            }
            else if (type == ACRONYM_TYPE)
            {
                // Compact the buffer, skipping every '.'.
                int dst = 0;
                for (int src = 0; src < length; src++)
                {
                    if (term[src] != '.')
                        term[dst++] = term[src];
                }
                termAtt.SetLength(dst);
            }

            return true;
        }
    }
}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs b/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs
new file mode 100644
index 0000000..378004b
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicFilterFactory.cs
@@ -0,0 +1,25 @@
using Lucene.Net.Analysis.Util;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Factory for <see cref="ClassicFilter"/>. Accepts no configuration
    /// parameters beyond those consumed by the base factory.
    /// </summary>
    public class ClassicFilterFactory : TokenFilterFactory
    {
        /// <summary>
        /// Creates the factory.
        /// </summary>
        /// <param name="args">Factory arguments; must be empty after base-class consumption.</param>
        /// <exception cref="ArgumentException">Thrown when unconsumed parameters remain.</exception>
        public ClassicFilterFactory(IDictionary<String, String> args)
            : base(args)
        {
            if (args.Count > 0)
            {
                // BUGFIX: concatenating the dictionary itself only prints its type
                // name in .NET (unlike Java's Map.toString()); list the offending
                // key=value pairs so the message is actually diagnostic.
                throw new ArgumentException("Unknown parameters: " +
                    string.Join(", ", args.Select(kv => kv.Key + "=" + kv.Value)));
            }
        }

        /// <summary>Wraps <paramref name="input"/> in a <see cref="ClassicFilter"/>.</summary>
        public override TokenStream Create(TokenStream input)
        {
            return new ClassicFilter(input);
        }
    }
}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicTokenizer.cs b/src/contrib/Analyzers/Standard/ClassicTokenizer.cs
new file mode 100644
index 0000000..bad1c9e
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicTokenizer.cs
@@ -0,0 +1,131 @@
using Lucene.Net.Analysis.Tokenattributes;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Grammar-based tokenizer: drives the JFlex-generated scanner
    /// (<see cref="ClassicTokenizerImpl"/>) and surfaces its matches as tokens,
    /// tagging each with a type name from <see cref="TOKEN_TYPES"/>.
    /// </summary>
    public sealed class ClassicTokenizer : Tokenizer
    {
        // Underlying JFlex scanner; pointed at the real input via Reset().
        private IStandardTokenizerInterface scanner;

        // Token type ids returned by the scanner; each indexes TOKEN_TYPES.
        public const int ALPHANUM = 0;
        public const int APOSTROPHE = 1;
        public const int ACRONYM = 2;
        public const int COMPANY = 3;
        public const int EMAIL = 4;
        public const int HOST = 5;
        public const int NUM = 6;
        public const int CJ = 7;

        // Deprecated acronym form; remapped to HOST in IncrementToken().
        public const int ACRONYM_DEP = 8;

        /// <summary>String names for the token type ids above.</summary>
        public static readonly string[] TOKEN_TYPES = new string[] {
            "<ALPHANUM>",
            "<APOSTROPHE>",
            "<ACRONYM>",
            "<COMPANY>",
            "<EMAIL>",
            "<HOST>",
            "<NUM>",
            "<CJ>",
            "<ACRONYM_DEP>"
          };

        private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

        /// <summary>
        /// Longest token (in chars) that will be emitted; longer matches are
        /// skipped, but still bump the next token's position increment.
        /// </summary>
        public int MaxTokenLength
        {
            get { return maxTokenLength; }
            set { maxTokenLength = value; }
        }

        /// <summary>Creates a tokenizer reading from <paramref name="input"/>.</summary>
        public ClassicTokenizer(Version? matchVersion, TextReader input)
            : base(input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            Init(matchVersion);
        }

        /// <summary>Creates a tokenizer using the given attribute factory.</summary>
        public ClassicTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
            : base(factory, input)
        {
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            typeAtt = AddAttribute<ITypeAttribute>();

            Init(matchVersion);
        }

        private void Init(Version? matchVersion)
        {
            // matchVersion is currently unused here (kept for parity with the
            // Java original). The scanner is deliberately built with a null
            // reader so that using it before Reset() fails fast.
            this.scanner = new ClassicTokenizerImpl(null); // best effort NPE if you dont call reset
        }

        // this tokenizer generates three attributes:
        // term offset, positionIncrement and type
        private readonly ICharTermAttribute termAtt; // = addAttribute(CharTermAttribute.class);
        private readonly IOffsetAttribute offsetAtt; // = addAttribute(OffsetAttribute.class);
        private readonly IPositionIncrementAttribute posIncrAtt; // = addAttribute(PositionIncrementAttribute.class);
        private readonly ITypeAttribute typeAtt; // = addAttribute(TypeAttribute.class);

        /// <summary>
        /// Advances to the next token. Matches longer than
        /// <see cref="MaxTokenLength"/> are dropped; each drop increases the
        /// position increment carried by the next emitted token.
        /// </summary>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerInterface.YYEOF)
                {
                    return false;
                }

                if (scanner.YYLength <= maxTokenLength)
                {
                    posIncrAtt.PositionIncrement = posIncr;
                    scanner.GetText(termAtt);
                    int start = scanner.YYChar;
                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));

                    if (tokenType == ClassicTokenizer.ACRONYM_DEP)
                    {
                        // Backwards compatibility: report deprecated acronyms as
                        // HOST, dropping the trailing '.' from the term text.
                        typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.HOST];
                        termAtt.SetLength(termAtt.Length - 1); // remove extra '.'
                    }
                    else
                    {
                        typeAtt.Type = ClassicTokenizer.TOKEN_TYPES[tokenType];
                    }
                    return true;
                }
                else
                    // When we skip a too-long term, we still increment the
                    // position increment
                    posIncr++;
            }
        }

        public override void End()
        {
            // set final offset
            int finalOffset = CorrectOffset(scanner.YYChar + scanner.YYLength);
            offsetAtt.SetOffset(finalOffset, finalOffset);
            // NOTE(review): base.End() is not called here — confirm against the
            // Lucene.Net Tokenizer contract for this branch.
        }

        public override void Reset()
        {
            // Point the scanner at the (possibly re-assigned) input reader.
            scanner.YYReset(input);
        }
    }
}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs b/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs
new file mode 100644
index 0000000..2bcd775
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicTokenizerFactory.cs
@@ -0,0 +1,31 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    public class ClassicTokenizerFactory : TokenizerFactory
+    {
+        private readonly int maxTokenLength;
+
+        public ClassicTokenizerFactory(IDictionary<String, String> args)
+            : base(args)
+        {
+            AssureMatchVersion();
+            maxTokenLength = GetInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
+            if (args.Count > 0)
+            {
+                throw new ArgumentException("Unknown parameters: " + args);
+            }
+        }
+
+        public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+        {
+            ClassicTokenizer tokenizer = new ClassicTokenizer(luceneMatchVersion, factory, input);
+            tokenizer.MaxTokenLength = maxTokenLength;
+            return tokenizer;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs b/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs
new file mode 100644
index 0000000..9a096ac
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/ClassicTokenizerImpl.cs
@@ -0,0 +1,657 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    internal class ClassicTokenizerImpl : IStandardTokenizerInterface
+    {
+        /** This character denotes the end of file */
+        public const int YYEOF = -1;
+
+        /** initial size of the lookahead buffer */
+        private const int ZZ_BUFFERSIZE = 4096;
+
+        /** lexical states */
+        public const int YYINITIAL = 0;
+
+        /**
+        * ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
+        * ZZ_LEXSTATE[l+1] is the state in the DFA for the lexical state l
+        *                  at the beginning of a line
+        * l is of the form l = 2*k, k a non negative integer
+        */
+        private readonly int[] ZZ_LEXSTATE = { 
+            0, 0
+        };
+
+        /** 
+        * Translates characters to character classes
+        */
+        private const string ZZ_CMAP_PACKED =
+          "\u0009\0\u0001\0\u0001\u000d\u0001\0\u0001\0\u0001\u000c\u0012\0\u0001\0\u0005\0\u0001\u0005" +
+          "\u0001\u0003\u0004\0\u0001\u0009\u0001\u0007\u0001\u0004\u0001\u0009\u000a\u0002\u0006\0\u0001\u0006\u001a\u000a" +
+          "\u0004\0\u0001\u0008\u0001\0\u001a\u000a\u002f\0\u0001\u000a\u000a\0\u0001\u000a\u0004\0\u0001\u000a" +
+          "\u0005\0\u0017\u000a\u0001\0\u001f\u000a\u0001\0\u0128\u000a\u0002\0\u0012\u000a\u001c\0\u005e\u000a" +
+          "\u0002\0\u0009\u000a\u0002\0\u0007\u000a\u000e\0\u0002\u000a\u000e\0\u0005\u000a\u0009\0\u0001\u000a" +
+          "\u008b\0\u0001\u000a\u000b\0\u0001\u000a\u0001\0\u0003\u000a\u0001\0\u0001\u000a\u0001\0\u0014\u000a" +
+          "\u0001\0\u002c\u000a\u0001\0\u0008\u000a\u0002\0\u001a\u000a\u000c\0\u0082\u000a\u000a\0\u0039\u000a" +
+          "\u0002\0\u0002\u000a\u0002\0\u0002\u000a\u0003\0\u0026\u000a\u0002\0\u0002\u000a\u0037\0\u0026\u000a" +
+          "\u0002\0\u0001\u000a\u0007\0\u0027\u000a\u0048\0\u001b\u000a\u0005\0\u0003\u000a\u002e\0\u001a\u000a" +
+          "\u0005\0\u000b\u000a\u0015\0\u000a\u0002\u0007\0\u0063\u000a\u0001\0\u0001\u000a\u000f\0\u0002\u000a" +
+          "\u0009\0\u000a\u0002\u0003\u000a\u0013\0\u0001\u000a\u0001\0\u001b\u000a\u0053\0\u0026\u000a\u015f\0" +
+          "\u0035\u000a\u0003\0\u0001\u000a\u0012\0\u0001\u000a\u0007\0\u000a\u000a\u0004\0\u000a\u0002\u0015\0" +
+          "\u0008\u000a\u0002\0\u0002\u000a\u0002\0\u0016\u000a\u0001\0\u0007\u000a\u0001\0\u0001\u000a\u0003\0" +
+          "\u0004\u000a\u0022\0\u0002\u000a\u0001\0\u0003\u000a\u0004\0\u000a\u0002\u0002\u000a\u0013\0\u0006\u000a" +
+          "\u0004\0\u0002\u000a\u0002\0\u0016\u000a\u0001\0\u0007\u000a\u0001\0\u0002\u000a\u0001\0\u0002\u000a" +
+          "\u0001\0\u0002\u000a\u001f\0\u0004\u000a\u0001\0\u0001\u000a\u0007\0\u000a\u0002\u0002\0\u0003\u000a" +
+          "\u0010\0\u0007\u000a\u0001\0\u0001\u000a\u0001\0\u0003\u000a\u0001\0\u0016\u000a\u0001\0\u0007\u000a" +
+          "\u0001\0\u0002\u000a\u0001\0\u0005\u000a\u0003\0\u0001\u000a\u0012\0\u0001\u000a\u000f\0\u0001\u000a" +
+          "\u0005\0\u000a\u0002\u0015\0\u0008\u000a\u0002\0\u0002\u000a\u0002\0\u0016\u000a\u0001\0\u0007\u000a" +
+          "\u0001\0\u0002\u000a\u0002\0\u0004\u000a\u0003\0\u0001\u000a\u001e\0\u0002\u000a\u0001\0\u0003\u000a" +
+          "\u0004\0\u000a\u0002\u0015\0\u0006\u000a\u0003\0\u0003\u000a\u0001\0\u0004\u000a\u0003\0\u0002\u000a" +
+          "\u0001\0\u0001\u000a\u0001\0\u0002\u000a\u0003\0\u0002\u000a\u0003\0\u0003\u000a\u0003\0\u0008\u000a" +
+          "\u0001\0\u0003\u000a\u002d\0\u0009\u0002\u0015\0\u0008\u000a\u0001\0\u0003\u000a\u0001\0\u0017\u000a" +
+          "\u0001\0\u000a\u000a\u0001\0\u0005\u000a\u0026\0\u0002\u000a\u0004\0\u000a\u0002\u0015\0\u0008\u000a" +
+          "\u0001\0\u0003\u000a\u0001\0\u0017\u000a\u0001\0\u000a\u000a\u0001\0\u0005\u000a\u0024\0\u0001\u000a" +
+          "\u0001\0\u0002\u000a\u0004\0\u000a\u0002\u0015\0\u0008\u000a\u0001\0\u0003\u000a\u0001\0\u0017\u000a" +
+          "\u0001\0\u0010\u000a\u0026\0\u0002\u000a\u0004\0\u000a\u0002\u0015\0\u0012\u000a\u0003\0\u0018\u000a" +
+          "\u0001\0\u0009\u000a\u0001\0\u0001\u000a\u0002\0\u0007\u000a\u0039\0\u0001\u0001\u0030\u000a\u0001\u0001" +
+          "\u0002\u000a\u000c\u0001\u0007\u000a\u0009\u0001\u000a\u0002\u0027\0\u0002\u000a\u0001\0\u0001\u000a\u0002\0" +
+          "\u0002\u000a\u0001\0\u0001\u000a\u0002\0\u0001\u000a\u0006\0\u0004\u000a\u0001\0\u0007\u000a\u0001\0" +
+          "\u0003\u000a\u0001\0\u0001\u000a\u0001\0\u0001\u000a\u0002\0\u0002\u000a\u0001\0\u0004\u000a\u0001\0" +
+          "\u0002\u000a\u0009\0\u0001\u000a\u0002\0\u0005\u000a\u0001\0\u0001\u000a\u0009\0\u000a\u0002\u0002\0" +
+          "\u0002\u000a\u0022\0\u0001\u000a\u001f\0\u000a\u0002\u0016\0\u0008\u000a\u0001\0\u0022\u000a\u001d\0" +
+          "\u0004\u000a\u0074\0\u0022\u000a\u0001\0\u0005\u000a\u0001\0\u0002\u000a\u0015\0\u000a\u0002\u0006\0" +
+          "\u0006\u000a\u004a\0\u0026\u000a\u000a\0\u0027\u000a\u0009\0\u005a\u000a\u0005\0\u0044\u000a\u0005\0" +
+          "\u0052\u000a\u0006\0\u0007\u000a\u0001\0\u003f\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0" +
+          "\u0007\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0027\u000a\u0001\0\u0001\u000a\u0001\0" +
+          "\u0004\u000a\u0002\0\u001f\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0007\u000a\u0001\0" +
+          "\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0007\u000a\u0001\0\u0007\u000a\u0001\0\u0017\u000a\u0001\0" +
+          "\u001f\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0002\0\u0007\u000a\u0001\0\u0027\u000a\u0001\0" +
+          "\u0013\u000a\u000e\0\u0009\u0002\u002e\0\u0055\u000a\u000c\0\u026c\u000a\u0002\0\u0008\u000a\u000a\0" +
+          "\u001a\u000a\u0005\0\u004b\u000a\u0095\0\u0034\u000a\u002c\0\u000a\u0002\u0026\0\u000a\u0002\u0006\0" +
+          "\u0058\u000a\u0008\0\u0029\u000a\u0557\0\u009c\u000a\u0004\0\u005a\u000a\u0006\0\u0016\u000a\u0002\0" +
+          "\u0006\u000a\u0002\0\u0026\u000a\u0002\0\u0006\u000a\u0002\0\u0008\u000a\u0001\0\u0001\u000a\u0001\0" +
+          "\u0001\u000a\u0001\0\u0001\u000a\u0001\0\u001f\u000a\u0002\0\u0035\u000a\u0001\0\u0007\u000a\u0001\0" +
+          "\u0001\u000a\u0003\0\u0003\u000a\u0001\0\u0007\u000a\u0003\0\u0004\u000a\u0002\0\u0006\u000a\u0004\0" +
+          "\u000d\u000a\u0005\0\u0003\u000a\u0001\0\u0007\u000a\u0082\0\u0001\u000a\u0082\0\u0001\u000a\u0004\0" +
+          "\u0001\u000a\u0002\0\u000a\u000a\u0001\0\u0001\u000a\u0003\0\u0005\u000a\u0006\0\u0001\u000a\u0001\0" +
+          "\u0001\u000a\u0001\0\u0001\u000a\u0001\0\u0004\u000a\u0001\0\u0003\u000a\u0001\0\u0007\u000a\u0ecb\0" +
+          "\u0002\u000a\u002a\0\u0005\u000a\u000a\0\u0001\u000b\u0054\u000b\u0008\u000b\u0002\u000b\u0002\u000b\u005a\u000b" +
+          "\u0001\u000b\u0003\u000b\u0006\u000b\u0028\u000b\u0003\u000b\u0001\0\u005e\u000a\u0011\0\u0018\u000a\u0038\0" +
+          "\u0010\u000b\u0100\0\u0080\u000b\u0080\0\u19b6\u000b\u000a\u000b\u0040\0\u51a6\u000b\u005a\u000b\u048d\u000a" +
+          "\u0773\0\u2ba4\u000a\u215c\0\u012e\u000b\u00d2\u000b\u0007\u000a\u000c\0\u0005\u000a\u0005\0\u0001\u000a" +
+          "\u0001\0\u000a\u000a\u0001\0\u000d\u000a\u0001\0\u0005\u000a\u0001\0\u0001\u000a\u0001\0\u0002\u000a" +
+          "\u0001\0\u0002\u000a\u0001\0\u006c\u000a\u0021\0\u016b\u000a\u0012\0\u0040\u000a\u0002\0\u0036\u000a" +
+          "\u0028\0\u000c\u000a\u0074\0\u0003\u000a\u0001\0\u0001\u000a\u0001\0\u0087\u000a\u0013\0\u000a\u0002" +
+          "\u0007\0\u001a\u000a\u0006\0\u001a\u000a\u000a\0\u0001\u000b\u003a\u000b\u001f\u000a\u0003\0\u0006\u000a" +
+          "\u0002\0\u0006\u000a\u0002\0\u0006\u000a\u0002\0\u0003\u000a\u0023\0";
+
+        /** 
+        * Translates characters to character classes
+        */
+        private static readonly char[] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+        /** 
+         * Translates DFA states to action switch labels.
+         */
+        private static readonly int[] ZZ_ACTION = zzUnpackAction();
+
+        private const String ZZ_ACTION_PACKED_0 =
+        "\u0001\0\u0001\u0001\u0003\u0002\u0001\u0003\u0001\u0001\u000b\0\u0001\u0002\u0003\u0004" +
+        "\u0002\0\u0001\u0005\u0001\0\u0001\u0005\u0003\u0004\u0006\u0005\u0001\u0006\u0001\u0004" +
+        "\u0002\u0007\u0001\u0008\u0001\0\u0001\u0008\u0003\0\u0002\u0008\u0001\u0009\u0001\u000a" +
+        "\u0001\u0004";
+
+        private static int[] zzUnpackAction()
+        {
+            int[] result = new int[51];
+            int offset = 0;
+            offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+            return result;
+        }
+
+        private static int zzUnpackAction(String packed, int offset, int[] result)
+        {
+            int i = 0;       /* index in packed string  */
+            int j = offset;  /* index in unpacked array */
+            int l = packed.Length;
+            while (i < l)
+            {
+                int count = packed[i++];
+                int value = packed[i++];
+                do result[j++] = value; while (--count > 0);
+            }
+            return j;
+        }
+
+        /** 
+        * Translates a state to a row index in the transition table
+        */
+        private static readonly int[] ZZ_ROWMAP = zzUnpackRowMap();
+
+        private const String ZZ_ROWMAP_PACKED_0 =
+        "\0\0\0\u000e\0\u001c\0\u002a\0\u0038\0\u000e\0\u0046\0\u0054" +
+        "\0\u0062\0\u0070\0\u007e\0\u008c\0\u009a\0\u00a8\0\u00b6\0\u00c4" +
+        "\0\u00d2\0\u00e0\0\u00ee\0\u00fc\0\u010a\0\u0118\0\u0126\0\u0134" +
+        "\0\u0142\0\u0150\0\u015e\0\u016c\0\u017a\0\u0188\0\u0196\0\u01a4" +
+        "\0\u01b2\0\u01c0\0\u01ce\0\u01dc\0\u01ea\0\u01f8\0\u00d2\0\u0206" +
+        "\0\u0214\0\u0222\0\u0230\0\u023e\0\u024c\0\u025a\0\u0054\0\u008c" +
+        "\0\u0268\0\u0276\0\u0284";
+
+        private static int[] zzUnpackRowMap()
+        {
+            int[] result = new int[51];
+            int offset = 0;
+            offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+            return result;
+        }
+
+        private static int zzUnpackRowMap(String packed, int offset, int[] result)
+        {
+            int i = 0;  /* index in packed string  */
+            int j = offset;  /* index in unpacked array */
+            int l = packed.Length;
+            while (i < l)
+            {
+                int high = packed[i++] << 16;
+                result[j++] = high | packed[i++];
+            }
+            return j;
+        }
+
+        /** 
+        * The transition table of the DFA
+        */
+        private static readonly int[] ZZ_TRANS = zzUnpackTrans();
+
+        private const String ZZ_TRANS_PACKED_0 =
+        "\u0001\u0002\u0001\u0003\u0001\u0004\u0007\u0002\u0001\u0005\u0001\u0006\u0001\u0007\u0001\u0002" +
+        "\u000f\0\u0002\u0003\u0001\0\u0001\u0008\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b" +
+        "\u0001\u0003\u0004\0\u0001\u0003\u0001\u0004\u0001\0\u0001\u000c\u0001\0\u0001\u0009" +
+        "\u0002\u000d\u0001\u000e\u0001\u0004\u0004\0\u0001\u0003\u0001\u0004\u0001\u000f\u0001\u0010" +
+        "\u0001\u0011\u0001\u0012\u0002\u000a\u0001\u000b\u0001\u0013\u0010\0\u0001\u0002\u0001\0" +
+        "\u0001\u0014\u0001\u0015\u0007\0\u0001\u0016\u0004\0\u0002\u0017\u0007\0\u0001\u0017" +
+        "\u0004\0\u0001\u0018\u0001\u0019\u0007\0\u0001\u001a\u0005\0\u0001\u001b\u0007\0" +
+        "\u0001\u000b\u0004\0\u0001\u001c\u0001\u001d\u0007\0\u0001\u001e\u0004\0\u0001\u001f" +
+        "\u0001\u0020\u0007\0\u0001\u0021\u0004\0\u0001\u0022\u0001\u0023\u0007\0\u0001\u0024" +
+        "\u000d\0\u0001\u0025\u0004\0\u0001\u0014\u0001\u0015\u0007\0\u0001\u0026\u000d\0" +
+        "\u0001\u0027\u0004\0\u0002\u0017\u0007\0\u0001\u0028\u0004\0\u0001\u0003\u0001\u0004" +
+        "\u0001\u000f\u0001\u0008\u0001\u0011\u0001\u0012\u0002\u000a\u0001\u000b\u0001\u0013\u0004\0" +
+        "\u0002\u0014\u0001\0\u0001\u0029\u0001\0\u0001\u0009\u0002\u002a\u0001\0\u0001\u0014" +
+        "\u0004\0\u0001\u0014\u0001\u0015\u0001\0\u0001\u002b\u0001\0\u0001\u0009\u0002\u002c" +
+        "\u0001\u002d\u0001\u0015\u0004\0\u0001\u0014\u0001\u0015\u0001\0\u0001\u0029\u0001\0" +
+        "\u0001\u0009\u0002\u002a\u0001\0\u0001\u0016\u0004\0\u0002\u0017\u0001\0\u0001\u002e" +
+        "\u0002\0\u0001\u002e\u0002\0\u0001\u0017\u0004\0\u0002\u0018\u0001\0\u0001\u002a" +
+        "\u0001\0\u0001\u0009\u0002\u002a\u0001\0\u0001\u0018\u0004\0\u0001\u0018\u0001\u0019" +
+        "\u0001\0\u0001\u002c\u0001\0\u0001\u0009\u0002\u002c\u0001\u002d\u0001\u0019\u0004\0" +
+        "\u0001\u0018\u0001\u0019\u0001\0\u0001\u002a\u0001\0\u0001\u0009\u0002\u002a\u0001\0" +
+        "\u0001\u001a\u0005\0\u0001\u001b\u0001\0\u0001\u002d\u0002\0\u0003\u002d\u0001\u001b" +
+        "\u0004\0\u0002\u001c\u0001\0\u0001\u002f\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b" +
+        "\u0001\u001c\u0004\0\u0001\u001c\u0001\u001d\u0001\0\u0001\u0030\u0001\0\u0001\u0009" +
+        "\u0002\u000d\u0001\u000e\u0001\u001d\u0004\0\u0001\u001c\u0001\u001d\u0001\0\u0001\u002f" +
+        "\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b\u0001\u001e\u0004\0\u0002\u001f\u0001\0" +
+        "\u0001\u000a\u0001\0\u0001\u0009\u0002\u000a\u0001\u000b\u0001\u001f\u0004\0\u0001\u001f" +
+        "\u0001\u0020\u0001\0\u0001\u000d\u0001\0\u0001\u0009\u0002\u000d\u0001\u000e\u0001\u0020" +
+        "\u0004\0\u0001\u001f\u0001\u0020\u0001\0\u0001\u000a\u0001\0\u0001\u0009\u0002\u000a" +
+        "\u0001\u000b\u0001\u0021\u0004\0\u0002\u0022\u0001\0\u0001\u000b\u0002\0\u0003\u000b" +
+        "\u0001\u0022\u0004\0\u0001\u0022\u0001\u0023\u0001\0\u0001\u000e\u0002\0\u0003\u000e" +
+        "\u0001\u0023\u0004\0\u0001\u0022\u0001\u0023\u0001\0\u0001\u000b\u0002\0\u0003\u000b" +
+        "\u0001\u0024\u0006\0\u0001\u000f\u0006\0\u0001\u0025\u0004\0\u0001\u0014\u0001\u0015" +
+        "\u0001\0\u0001\u0031\u0001\0\u0001\u0009\u0002\u002a\u0001\0\u0001\u0016\u0004\0" +
+        "\u0002\u0017\u0001\0\u0001\u002e\u0002\0\u0001\u002e\u0002\0\u0001\u0028\u0004\0" +
+        "\u0002\u0014\u0007\0\u0001\u0014\u0004\0\u0002\u0018\u0007\0\u0001\u0018\u0004\0" +
+        "\u0002\u001c\u0007\0\u0001\u001c\u0004\0\u0002\u001f\u0007\0\u0001\u001f\u0004\0" +
+        "\u0002\u0022\u0007\0\u0001\u0022\u0004\0\u0002\u0032\u0007\0\u0001\u0032\u0004\0" +
+        "\u0002\u0014\u0007\0\u0001\u0033\u0004\0\u0002\u0032\u0001\0\u0001\u002e\u0002\0" +
+        "\u0001\u002e\u0002\0\u0001\u0032\u0004\0\u0002\u0014\u0001\0\u0001\u0031\u0001\0" +
+        "\u0001\u0009\u0002\u002a\u0001\0\u0001\u0014\u0003\0";
+
+        private static int[] zzUnpackTrans()
+        {
+            int[] result = new int[658];
+            int offset = 0;
+            offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+            return result;
+        }
+
+        /* Decodes run-length encoded (count, value) pairs from the packed
+           string into result, starting at offset. Stored values are offset
+           by one relative to the real transition targets. Returns the index
+           one past the last entry written. */
+        private static int zzUnpackTrans(String packed, int offset, int[] result)
+        {
+            int src = 0;
+            int dst = offset;
+            while (src < packed.Length)
+            {
+                int count = packed[src++];
+                int value = packed[src++] - 1;
+                do
+                {
+                    result[dst++] = value;
+                } while (--count > 0);
+            }
+            return dst;
+        }
+
+        /* error codes */
+        private const int ZZ_UNKNOWN_ERROR = 0;
+        private const int ZZ_NO_MATCH = 1;
+        private const int ZZ_PUSHBACK_2BIG = 2;
+
+        /* error messages for the codes above (fixed "Unkown" typo) */
+        private static readonly String[] ZZ_ERROR_MSG = {
+        "Unknown internal scanner error",
+        "Error: could not match input",
+        "Error: pushback value was too large"
+        };
+
+        /**
+        * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>.
+        * Bit 0 marks an accepting state; bit 3 forces an immediate break in
+        * the scanning loop (see GetNextToken).
+        */
+        private static readonly int[] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+        /* JFlex-generated run-length packed form of ZZ_ATTRIBUTE; do not edit by hand. */
+        private const String ZZ_ATTRIBUTE_PACKED_0 =
+        "\u0001\0\u0001\u0009\u0003\u0001\u0001\u0009\u0001\u0001\u000b\0\u0004\u0001\u0002\0" +
+        "\u0001\u0001\u0001\0\u000f\u0001\u0001\0\u0001\u0001\u0003\0\u0005\u0001";
+
+        /* Expands the packed per-state attribute table (51 states). */
+        private static int[] zzUnpackAttribute()
+        {
+            int[] unpacked = new int[51];
+            zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, 0, unpacked);
+            return unpacked;
+        }
+
+        /* Decodes run-length encoded (count, value) pairs from the packed
+           string into result, starting at offset. Returns the index one past
+           the last entry written. */
+        private static int zzUnpackAttribute(String packed, int offset, int[] result)
+        {
+            int src = 0;
+            int dst = offset;
+            while (src < packed.Length)
+            {
+                int count = packed[src++];
+                int value = packed[src++];
+                do
+                {
+                    result[dst++] = value;
+                } while (--count > 0);
+            }
+            return dst;
+        }
+
+        /** the input device */
+        private TextReader zzReader;
+
+        /** the current state of the DFA */
+        private int zzState;
+
+        /** the current lexical state */
+        private int zzLexicalState = YYINITIAL;
+
+        /** this buffer contains the current text to be matched and is
+        the source of the yytext() string */
+        private char[] zzBuffer = new char[ZZ_BUFFERSIZE];
+
+        /** the text position at the last accepting state */
+        private int zzMarkedPos;
+
+        /** the current text position in the buffer */
+        private int zzCurrentPos;
+
+        /** startRead marks the beginning of the yytext() string in the buffer */
+        private int zzStartRead;
+
+        /** endRead marks the last character in the buffer that has been read
+        from input */
+        private int zzEndRead;
+
+        /** number of newlines encountered up to the start of the matched text */
+        private int yyline;
+
+        /** the number of characters up to the start of the matched text */
+        private int yychar;
+
+        /**
+        * the number of characters from the last newline up to the start of the
+        * matched text
+        */
+        private int yycolumn;
+
+        /**
+        * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+        */
+        private bool zzAtBOL = true;
+
+        /** zzAtEOF == true <=> the scanner is at the EOF */
+        private bool zzAtEOF;
+
+        /** denotes if the user-EOF-code has already been executed */
+        private bool zzEOFDone;
+
+
+        /* user code: */
+
+        // Token type ids, mirrored from StandardTokenizer so the generated
+        // scanner and the tokenizer agree on the numbering.
+        public const int ALPHANUM = StandardTokenizer.ALPHANUM;
+        public const int APOSTROPHE = StandardTokenizer.APOSTROPHE;
+        public const int ACRONYM = StandardTokenizer.ACRONYM;
+        public const int COMPANY = StandardTokenizer.COMPANY;
+        public const int EMAIL = StandardTokenizer.EMAIL;
+        public const int HOST = StandardTokenizer.HOST;
+        public const int NUM = StandardTokenizer.NUM;
+        public const int CJ = StandardTokenizer.CJ;
+        public const int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
+
+        /** Human-readable names for the token type ids above. */
+        public static readonly String[] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+
+        /** Character offset (from the start of the input) of the current match. */
+        public int YYChar
+        {
+            get { return yychar; }
+        }
+
+        /** Copies the text of the current match into the given term attribute. */
+        public void GetText(Tokenattributes.ICharTermAttribute t)
+        {
+            t.CopyBuffer(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+        }
+
+        /**
+        * Creates a new scanner.
+        *
+        * @param   input  the TextReader to read input from (may be null; the
+        *                 tokenizer calls YYReset with a real reader before use).
+        */
+        internal ClassicTokenizerImpl(TextReader input)
+        {
+            this.zzReader = input;
+        }
+
+        /**
+        * Unpacks the run-length encoded character translation table.
+        * The packed string holds (count, value) pairs; each pair expands to
+        * <code>count</code> copies of <code>value</code>. The expanded map
+        * covers the full BMP (0x10000 entries).
+        */
+        private static char[] zzUnpackCMap(String packed)
+        {
+            char[] map = new char[0x10000];
+            int i = 0;  /* index in packed string  */
+            int j = 0;  /* index in unpacked array */
+            // Iterate over the whole packed string instead of the generated
+            // magic length (1154) so this stays correct if the table is ever
+            // regenerated with a different size.
+            while (i < packed.Length)
+            {
+                int count = packed[i++];
+                char value = packed[i++];
+                do map[j++] = value; while (--count > 0);
+            }
+            return map;
+        }
+
+        /**
+        * Refills the input buffer, first compacting it (shifting the live
+        * region to index 0) and growing it if the current position has
+        * reached its end.
+        *
+        * @return false iff new input was read; true means end of input.
+        */
+        private bool zzRefill()
+        {
+
+            /* first: make room (if you can) */
+            if (zzStartRead > 0)
+            {
+                Array.Copy(zzBuffer, zzStartRead,
+                                 zzBuffer, 0,
+                                 zzEndRead - zzStartRead);
+
+                /* translate stored positions */
+                zzEndRead -= zzStartRead;
+                zzCurrentPos -= zzStartRead;
+                zzMarkedPos -= zzStartRead;
+                zzStartRead = 0;
+            }
+
+            /* is the buffer big enough? */
+            if (zzCurrentPos >= zzBuffer.Length)
+            {
+                /* if not: blow it up (doubling) */
+                char[] newBuffer = new char[zzCurrentPos * 2];
+                Array.Copy(zzBuffer, 0, newBuffer, 0, zzBuffer.Length);
+                zzBuffer = newBuffer;
+            }
+
+            /* finally: fill the buffer with new input */
+            int numRead = zzReader.Read(zzBuffer, zzEndRead,
+                                                    zzBuffer.Length - zzEndRead);
+
+            if (numRead > 0)
+            {
+                zzEndRead += numRead;
+                return false;
+            }
+            // unlikely but not impossible: read 0 characters, but not at end of stream
+            if (numRead == 0)
+            {
+                // Probe one character to distinguish "no data yet" from EOF.
+                int c = zzReader.Read();
+                if (c <= 0)
+                {
+                    // NOTE(review): c == 0 (a NUL char) is treated as EOF here,
+                    // same as the -1 EOF sentinel — confirm this is intended.
+                    return true;
+                }
+                else
+                {
+                    zzBuffer[zzEndRead++] = (char)c;
+                    return false;
+                }
+            }
+
+            // numRead < 0: ported Java residue — .NET TextReader.Read is not
+            // documented to return negative counts, kept as a safety net.
+            return true;
+        }
+
+        /** Closes the input stream and marks the scanner as at EOF. */
+        public void yyclose()
+        {
+            zzAtEOF = true;            /* indicate end of file */
+            zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+            if (zzReader != null)
+                zzReader.Close();
+        }
+
+        /**
+        * Resets the scanner to read from a new reader. All internal
+        * positions, counters and the lexical state are cleared, and an
+        * oversized buffer is shrunk back to ZZ_BUFFERSIZE.
+        */
+        public void YYReset(TextReader reader)
+        {
+            zzReader = reader;
+            zzAtBOL = true;
+            zzAtEOF = false;
+            zzEOFDone = false;
+            zzEndRead = zzStartRead = 0;
+            zzCurrentPos = zzMarkedPos = 0;
+            yyline = yychar = yycolumn = 0;
+            zzLexicalState = YYINITIAL;
+            if (zzBuffer.Length > ZZ_BUFFERSIZE)
+                zzBuffer = new char[ZZ_BUFFERSIZE];
+        }
+
+        /** Returns the current lexical state. */
+        public int yystate()
+        {
+            return zzLexicalState;
+        }
+
+        /** Enters the given lexical state. */
+        public void yybegin(int newState)
+        {
+            zzLexicalState = newState;
+        }
+
+        /** Returns the text of the current match as a new string. */
+        public String yytext()
+        {
+            return new String(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+        }
+
+        /** Returns the character at position pos within the current match. */
+        public char yycharat(int pos)
+        {
+            return zzBuffer[zzStartRead + pos];
+        }
+
+        /** Length of the current match. */
+        public int YYLength
+        {
+            get { return zzMarkedPos - zzStartRead; }
+        }
+
+        /**
+        * Reports an internal scanner error with the message registered for
+        * <code>errorCode</code>; unknown codes fall back to the generic
+        * message. Never returns normally.
+        */
+        private void zzScanError(int errorCode)
+        {
+            // Bounds-check instead of catching IndexOutOfRangeException:
+            // avoids exception-driven control flow and the unused exception
+            // variable warning (CS0168) in the original catch clause.
+            String message;
+            if (errorCode >= 0 && errorCode < ZZ_ERROR_MSG.Length)
+                message = ZZ_ERROR_MSG[errorCode];
+            else
+                message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+
+            throw new Exception(message);
+        }
+
+        /**
+        * Pushes the given number of characters back into the input stream;
+        * they will be read again in the next call to GetNextToken().
+        * <code>number</code> must not exceed YYLength (otherwise the scan
+        * error ZZ_PUSHBACK_2BIG is raised).
+        */
+        public void yypushback(int number)
+        {
+            if (number > YYLength)
+                zzScanError(ZZ_PUSHBACK_2BIG);
+
+            zzMarkedPos -= number;
+        }
+
+        /**
+        * Resumes scanning until the next token is matched or the end of
+        * input is reached. Runs the generated DFA over the buffered input,
+        * remembering the last accepting state, and maps the accepting
+        * action to one of the token type constants.
+        *
+        * @return the token type id of the next token, or YYEOF at end of input.
+        */
+        public int GetNextToken()
+        {
+            int zzInput;
+            int zzAction;
+
+            // cached fields (locals are cheaper than field loads in the hot loop):
+            int zzCurrentPosL;
+            int zzMarkedPosL;
+            int zzEndReadL = zzEndRead;
+            char[] zzBufferL = zzBuffer;
+            char[] zzCMapL = ZZ_CMAP;
+
+            int[] zzTransL = ZZ_TRANS;
+            int[] zzRowMapL = ZZ_ROWMAP;
+            int[] zzAttrL = ZZ_ATTRIBUTE;
+
+            while (true)
+            {
+                zzMarkedPosL = zzMarkedPos;
+
+                yychar += zzMarkedPosL - zzStartRead;
+
+                zzAction = -1;
+
+                zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+
+                zzState = ZZ_LEXSTATE[zzLexicalState];
+
+                // set up zzAction for empty match case:
+                int zzAttributes = zzAttrL[zzState];
+                if ((zzAttributes & 1) == 1)
+                {
+                    zzAction = zzState;
+                }
+
+
+            //zzForAction: (label from the generated Java; emulated with break)
+                {
+                    while (true)
+                    {
+
+                        if (zzCurrentPosL < zzEndReadL)
+                            zzInput = zzBufferL[zzCurrentPosL++];
+                        else if (zzAtEOF)
+                        {
+                            zzInput = YYEOF;
+                            break;
+                        }
+                        else
+                        {
+                            // store back cached positions
+                            zzCurrentPos = zzCurrentPosL;
+                            zzMarkedPos = zzMarkedPosL;
+                            bool eof = zzRefill();
+                            // get translated positions and possibly new buffer
+                            zzCurrentPosL = zzCurrentPos;
+                            zzMarkedPosL = zzMarkedPos;
+                            zzBufferL = zzBuffer;
+                            zzEndReadL = zzEndRead;
+                            if (eof)
+                            {
+                                zzInput = YYEOF;
+                                break;
+                            }
+                            else
+                            {
+                                zzInput = zzBufferL[zzCurrentPosL++];
+                            }
+                        }
+                        // DFA step: translate the char class, then look up the next state.
+                        int zzNext = zzTransL[zzRowMapL[zzState] + zzCMapL[zzInput]];
+                        if (zzNext == -1) break;
+                        zzState = zzNext;
+
+                        zzAttributes = zzAttrL[zzState];
+                        if ((zzAttributes & 1) == 1)  // accepting state: remember it
+                        {
+                            zzAction = zzState;
+                            zzMarkedPosL = zzCurrentPosL;
+                            if ((zzAttributes & 8) == 8) break;
+                        }
+
+                    }
+                }
+
+                // store back cached position
+                zzMarkedPos = zzMarkedPosL;
+
+                switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction])
+                {
+                    case 1:
+                        { /* Break so we don't hit fall-through warning: */
+                            break;/* ignore */
+                        }
+                    case 11: break;
+                    case 2:
+                        {
+                            return ALPHANUM;
+                        }
+                    case 12: break;
+                    case 3:
+                        {
+                            return CJ;
+                        }
+                    case 13: break;
+                    case 4:
+                        {
+                            return HOST;
+                        }
+                    case 14: break;
+                    case 5:
+                        {
+                            return NUM;
+                        }
+                    case 15: break;
+                    case 6:
+                        {
+                            return APOSTROPHE;
+                        }
+                    case 16: break;
+                    case 7:
+                        {
+                            return COMPANY;
+                        }
+                    case 17: break;
+                    case 8:
+                        {
+                            return ACRONYM_DEP;
+                        }
+                    case 18: break;
+                    case 9:
+                        {
+                            return ACRONYM;
+                        }
+                    case 19: break;
+                    case 10:
+                        {
+                            return EMAIL;
+                        }
+                    case 20: break;
+                    default:
+                        if (zzInput == YYEOF && zzStartRead == zzCurrentPos)
+                        {
+                            zzAtEOF = true;
+                            return YYEOF;
+                        }
+                        else
+                        {
+                            zzScanError(ZZ_NO_MATCH);
+                        }
+                        break;
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs b/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs
new file mode 100644
index 0000000..883e7a0
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/IStandardTokenizerInterface.cs
@@ -0,0 +1,27 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    /**
+    * Internal contract implemented by the generated scanner classes
+    * (classic and per-version standard grammars) so StandardTokenizer can
+    * drive any of them; mirrors Lucene's StandardTokenizerInterface.
+    */
+    public interface IStandardTokenizerInterface
+    {
+        /** Copies the text of the current match into the given term attribute. */
+        void GetText(ICharTermAttribute t);
+
+        /** Character offset (from the start of the input) of the current match. */
+        int YYChar { get; }
+
+        /** Resets the scanner to read from a new reader. */
+        void YYReset(TextReader reader);
+
+        /** Length of the current match. */
+        int YYLength { get; }
+
+        /** Advances to the next token; returns its type id, or
+        StandardTokenizerInterface.YYEOF at end of input. */
+        int GetNextToken();
+    }
+
+    /** Holds constants shared by the scanner implementations. */
+    public static class StandardTokenizerInterface
+    {
+        /** Value returned by GetNextToken() when the end of input is reached. */
+        public const int YYEOF = -1;
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardAnalyzer.cs b/src/contrib/Analyzers/Standard/StandardAnalyzer.cs
new file mode 100644
index 0000000..dead459
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardAnalyzer.cs
@@ -0,0 +1,70 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    /**
+    * Analyzer for the standard grammar: StandardTokenizer, StandardFilter,
+    * LowerCaseFilter and StopFilter (English stop words by default).
+    */
+    public sealed class StandardAnalyzer : StopwordAnalyzerBase
+    {
+        /** Default maximum allowed token length. */
+        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+        /** Default stop word set, shared with StopAnalyzer. */
+        public static readonly CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+
+        /** Builds an analyzer with the given stop words. */
+        public StandardAnalyzer(Version? matchVersion, CharArraySet stopWords)
+            : base(matchVersion, stopWords)
+        {
+        }
+
+        /** Builds an analyzer with the default stop words (STOP_WORDS_SET). */
+        public StandardAnalyzer(Version? matchVersion)
+            : this(matchVersion, STOP_WORDS_SET)
+        {
+        }
+
+        /** Builds an analyzer with stop words read from the given reader. */
+        public StandardAnalyzer(Version? matchVersion, TextReader stopwords)
+            : this(matchVersion, LoadStopwordSet(stopwords, matchVersion))
+        {
+        }
+
+        /** Maximum token length; longer tokens are discarded by the tokenizer. */
+        public int MaxTokenLength
+        {
+            get { return maxTokenLength; }
+            set { maxTokenLength = value; }
+        }
+
+        public override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
+        {
+            StandardTokenizer src = new StandardTokenizer(matchVersion, reader);
+            src.MaxTokenLength = maxTokenLength;
+            TokenStream tok = new StandardFilter(matchVersion, src);
+            tok = new LowerCaseFilter(matchVersion, tok);
+            tok = new StopFilter(matchVersion, tok, stopwords);
+            return new AnonymousTokenStreamComponents(this, src, tok);
+        }
+
+        /** Components subclass that re-applies the analyzer's current
+        maxTokenLength each time the reader is replaced, so later changes to
+        the setting reach reused tokenizers. */
+        private sealed class AnonymousTokenStreamComponents : TokenStreamComponents
+        {
+            private readonly StandardTokenizer src;
+            private readonly StandardAnalyzer parent;
+
+            public AnonymousTokenStreamComponents(StandardAnalyzer parent, StandardTokenizer src, TokenStream tok)
+                : base(src, tok)
+            {
+                this.parent = parent;
+                this.src = src;
+            }
+
+            public override void SetReader(TextReader reader)
+            {
+                src.MaxTokenLength = parent.maxTokenLength;
+                base.SetReader(reader);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardFilter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardFilter.cs b/src/contrib/Analyzers/Standard/StandardFilter.cs
new file mode 100644
index 0000000..9381883
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardFilter.cs
@@ -0,0 +1,73 @@
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    /**
+    * Normalizes tokens extracted with the classic (pre-3.1) grammar:
+    * strips a possessive 's from APOSTROPHE tokens and removes dots from
+    * ACRONYM tokens. For 3.1+ match versions it is a pass-through.
+    */
+    public class StandardFilter : TokenFilter
+    {
+        private readonly Version? matchVersion;
+
+        public StandardFilter(Version? matchVersion, TokenStream input)
+            : base(input)
+        {
+            this.matchVersion = matchVersion;
+
+            typeAtt = AddAttribute<ITypeAttribute>();
+            termAtt = AddAttribute<ICharTermAttribute>();
+        }
+
+        private static readonly String APOSTROPHE_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.APOSTROPHE];
+        private static readonly String ACRONYM_TYPE = ClassicTokenizer.TOKEN_TYPES[ClassicTokenizer.ACRONYM];
+
+        // this filter uses the type attribute to decide how to normalize
+        private readonly ITypeAttribute typeAtt;
+        private readonly ICharTermAttribute termAtt;
+
+        public override bool IncrementToken()
+        {
+            // From 3.1 on the tokenizer grammar already emits normalized tokens.
+            if (matchVersion.GetValueOrDefault().OnOrAfter(Version.LUCENE_31))
+                return input.IncrementToken(); // TODO: add some niceties for the new grammar
+            else
+                return IncrementTokenClassic();
+        }
+
+        /** Classic (pre-3.1) normalization; edits the term buffer in place. */
+        public bool IncrementTokenClassic()
+        {
+            if (!input.IncrementToken())
+            {
+                return false;
+            }
+
+            char[] buffer = termAtt.Buffer;
+            int bufferLength = termAtt.Length;
+            String type = typeAtt.Type;
+
+            if (type == APOSTROPHE_TYPE &&      // remove 's
+                bufferLength >= 2 &&
+                buffer[bufferLength - 2] == '\'' &&
+                (buffer[bufferLength - 1] == 's' || buffer[bufferLength - 1] == 'S'))
+            {
+                // Strip last 2 characters off
+                termAtt.SetLength(bufferLength - 2);
+            }
+            else if (type == ACRONYM_TYPE)
+            {      // remove dots by compacting the buffer in place
+                int upto = 0;
+                for (int i = 0; i < bufferLength; i++)
+                {
+                    char c = buffer[i];
+                    if (c != '.')
+                        buffer[upto++] = c;
+                }
+                termAtt.SetLength(upto);
+            }
+
+            return true;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardFilterFactory.cs b/src/contrib/Analyzers/Standard/StandardFilterFactory.cs
new file mode 100644
index 0000000..447b5e3
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardFilterFactory.cs
@@ -0,0 +1,26 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    /**
+    * Factory for StandardFilter. Accepts no arguments beyond the common
+    * luceneMatchVersion; any leftover argument is an error.
+    */
+    public class StandardFilterFactory : TokenFilterFactory
+    {
+        public StandardFilterFactory(IDictionary<String, String> args)
+            : base(args)
+        {
+            AssureMatchVersion();
+            if (args.Count > 0)
+            {
+                // Format the leftover entries explicitly: unlike Java's
+                // Map.toString(), IDictionary.ToString() only prints the
+                // type name, which would make this message useless.
+                throw new ArgumentException("Unknown parameters: " +
+                    string.Join(", ", args.Select(kvp => kvp.Key + "=" + kvp.Value)));
+            }
+        }
+
+        public override TokenStream Create(TokenStream input)
+        {
+            return new StandardFilter(luceneMatchVersion, input);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardTokenizer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardTokenizer.cs b/src/contrib/Analyzers/Standard/StandardTokenizer.cs
new file mode 100644
index 0000000..4c3d375
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardTokenizer.cs
@@ -0,0 +1,167 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using Lucene.Net.Util;
+using Version = Lucene.Net.Util.Version;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Analysis.Standard.Std31;
+using Lucene.Net.Analysis.Standard.Std34;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    /**
+    * Grammar-based tokenizer. Depending on matchVersion, delegates to one
+    * of four JFlex-generated scanner implementations (see Init).
+    */
+    public sealed class StandardTokenizer : Tokenizer
+    {
+        // The version-selected generated scanner; created with a null reader,
+        // so Reset() must run before the first IncrementToken().
+        private IStandardTokenizerInterface scanner;
+
+        // Token type ids; indexes into TOKEN_TYPES.
+        public const int ALPHANUM = 0;
+        [Obsolete]
+        public const int APOSTROPHE = 1;
+        [Obsolete]
+        public const int ACRONYM = 2;
+        [Obsolete]
+        public const int COMPANY = 3;
+        public const int EMAIL = 4;
+        [Obsolete]
+        public const int HOST = 5;
+        public const int NUM = 6;
+        [Obsolete]
+        public const int CJ = 7;
+        [Obsolete]
+        public const int ACRONYM_DEP = 8;
+        public const int SOUTHEAST_ASIAN = 9;
+        public const int IDEOGRAPHIC = 10;
+        public const int HIRAGANA = 11;
+        public const int KATAKANA = 12;
+        public const int HANGUL = 13;
+
+        /** String token type names indexed by the int constants above. */
+        public static readonly string[] TOKEN_TYPES = new string[] {
+            "<ALPHANUM>",
+            "<APOSTROPHE>",
+            "<ACRONYM>",
+            "<COMPANY>",
+            "<EMAIL>",
+            "<HOST>",
+            "<NUM>",
+            "<CJ>",
+            "<ACRONYM_DEP>",
+            "<SOUTHEAST_ASIAN>",
+            "<IDEOGRAPHIC>",
+            "<HIRAGANA>",
+            "<KATAKANA>",
+            "<HANGUL>"
+          };
+
+        private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+        /** Maximum token length; longer tokens are skipped, though their
+        position increment is still counted (see IncrementToken). */
+        public int MaxTokenLength
+        {
+            get { return maxTokenLength; }
+            set { maxTokenLength = value; }
+        }
+
+        /** Creates a tokenizer for the given match version and input. */
+        public StandardTokenizer(Version? matchVersion, TextReader input)
+            : base(input)
+        {
+            termAtt = AddAttribute<ICharTermAttribute>();
+            offsetAtt = AddAttribute<IOffsetAttribute>();
+            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+            typeAtt = AddAttribute<ITypeAttribute>();
+
+            Init(matchVersion.GetValueOrDefault());
+        }
+
+        /** Creates a tokenizer using the given attribute factory. */
+        public StandardTokenizer(Version? matchVersion, AttributeFactory factory, TextReader input)
+            : base(factory, input)
+        {
+            termAtt = AddAttribute<ICharTermAttribute>();
+            offsetAtt = AddAttribute<IOffsetAttribute>();
+            posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+            typeAtt = AddAttribute<ITypeAttribute>();
+
+            Init(matchVersion.GetValueOrDefault());
+        }
+
+        // Selects the generated scanner matching the requested version.
+        private void Init(Version matchVersion)
+        {
+            // best effort NPE if you dont call reset: scanners start with a
+            // null reader on purpose.
+            if (matchVersion.OnOrAfter(Version.LUCENE_40))
+            {
+                this.scanner = new StandardTokenizerImpl(null);
+            }
+            else if (matchVersion.OnOrAfter(Version.LUCENE_34))
+            {
+                this.scanner = new StandardTokenizerImpl34(null);
+            }
+            else if (matchVersion.OnOrAfter(Version.LUCENE_31))
+            {
+                this.scanner = new StandardTokenizerImpl31(null);
+            }
+            else
+            {
+                this.scanner = new ClassicTokenizerImpl(null);
+            }
+        }
+
+        // this tokenizer generates three attributes:
+        // term offset, positionIncrement and type
+        private readonly ICharTermAttribute termAtt;
+        private readonly IOffsetAttribute offsetAtt;
+        private readonly IPositionIncrementAttribute posIncrAtt;
+        private readonly ITypeAttribute typeAtt;
+
+        public override bool IncrementToken()
+        {
+            ClearAttributes();
+            int posIncr = 1;
+
+            while (true)
+            {
+                int tokenType = scanner.GetNextToken();
+
+                if (tokenType == StandardTokenizerInterface.YYEOF)
+                {
+                    return false;
+                }
+
+                if (scanner.YYLength <= maxTokenLength)
+                {
+                    posIncrAtt.PositionIncrement = posIncr;
+                    scanner.GetText(termAtt);
+                    int start = scanner.YYChar;
+                    offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.Length));
+                    // This 'if' should be removed in the next release. For now, it converts
+                    // invalid acronyms to HOST. When removed, only the 'else' part should
+                    // remain.
+                    if (tokenType == StandardTokenizer.ACRONYM_DEP)
+                    {
+                        typeAtt.Type = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HOST];
+                        termAtt.SetLength(termAtt.Length - 1); // remove extra '.'
+                    }
+                    else
+                    {
+                        typeAtt.Type = StandardTokenizer.TOKEN_TYPES[tokenType];
+                    }
+                    return true;
+                }
+                else
+                    // When we skip a too-long term, we still increment the
+                    // position increment
+                    posIncr++;
+            }
+        }
+
+        public override void End()
+        {
+            // set final offset
+            // NOTE(review): does not call base.End() — confirm against the
+            // Tokenizer/TokenStream End() contract for this Lucene version.
+            int finalOffset = CorrectOffset(scanner.YYChar + scanner.YYLength);
+            offsetAtt.SetOffset(finalOffset, finalOffset);
+        }
+
+        public override void Reset()
+        {
+            // NOTE(review): does not call base.Reset() — confirm this matches
+            // the base Tokenizer's Reset() contract.
+            scanner.YYReset(input);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/7a4b442f/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs b/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs
new file mode 100644
index 0000000..bfc64ca
--- /dev/null
+++ b/src/contrib/Analyzers/Standard/StandardTokenizerFactory.cs
@@ -0,0 +1,31 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Standard
+{
+    /**
+    * Factory for StandardTokenizer. Supports one optional argument,
+    * maxTokenLength (default StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
+    * any other leftover argument is an error.
+    */
+    public class StandardTokenizerFactory : TokenizerFactory
+    {
+        private readonly int maxTokenLength;
+
+        public StandardTokenizerFactory(IDictionary<String, String> args)
+            : base(args)
+        {
+            AssureMatchVersion();
+            maxTokenLength = GetInt(args, "maxTokenLength", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);
+            if (args.Count > 0)
+            {
+                // Format the leftover entries explicitly: unlike Java's
+                // Map.toString(), IDictionary.ToString() only prints the
+                // type name, which would make this message useless.
+                throw new ArgumentException("Unknown parameters: " +
+                    string.Join(", ", args.Select(kvp => kvp.Key + "=" + kvp.Value)));
+            }
+        }
+
+        public override Tokenizer Create(Net.Util.AttributeSource.AttributeFactory factory, System.IO.TextReader input)
+        {
+            StandardTokenizer tokenizer = new StandardTokenizer(luceneMatchVersion, factory, input);
+            tokenizer.MaxTokenLength = maxTokenLength;
+            return tokenizer;
+        }
+    }
+}


Mime
View raw message