lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pnas...@apache.org
Subject [1/5] Blocking Terms Codec - nearly finished
Date Sun, 14 Sep 2014 22:06:09 GMT
Repository: lucenenet
Updated Branches:
  refs/heads/master f69360e20 -> 17db2acd2


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b6b784fb/src/Lucene.Net.Codecs/BlockTerms/TermsIndexReaderBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/BlockTerms/TermsIndexReaderBase.cs b/src/Lucene.Net.Codecs/BlockTerms/TermsIndexReaderBase.cs
index aa15e4e..3fc2c18 100644
--- a/src/Lucene.Net.Codecs/BlockTerms/TermsIndexReaderBase.cs
+++ b/src/Lucene.Net.Codecs/BlockTerms/TermsIndexReaderBase.cs
@@ -15,13 +15,12 @@
  * limitations under the License.
  */
 
-using System;
-using Lucene.Net.Index;
-using Lucene.Net.Util;
-
 namespace Lucene.Net.Codecs.BlockTerms
 {
 
+    using System;
+    using Index;
+    using Util;
 
     /// <summary>
     /// TODO
@@ -40,14 +39,16 @@ namespace Lucene.Net.Codecs.BlockTerms
     /// </summary>
     public abstract class TermsIndexReaderBase : IDisposable
     {
+        public abstract bool SupportsOrd { get; }
 
-        public abstract FieldIndexEnum GetFieldEnum(FieldInfo fieldInfo);
+        public abstract int Divisor { get; }
 
-        public abstract void Dispose();
+        /// <summary>Returns approximate RAM bytes used</summary>
+        public abstract long RamBytesUsed { get; }
 
-        public abstract bool SupportsOrd();
+        public abstract FieldIndexEnum GetFieldEnum(FieldInfo fieldInfo);
 
-        public abstract int GetDivisor();
+        public abstract void Dispose();
 
         /// <summary>
         /// Similar to TermsEnum, except, the only "metadata" it
@@ -56,33 +57,30 @@ namespace Lucene.Net.Codecs.BlockTerms
         /// </summary>
         public abstract class FieldIndexEnum
         {
-
-            /// <summary> 
-            /// Seeks to "largest" indexed term that's <=
-            ///  term; returns file pointer index (into the main
-            /// terms index file) for that term 
-            /// </summary>
-            public abstract long Seek(BytesRef term);
-
             /** Returns -1 at end */
-            public abstract long Next();
+            public abstract long Next { get; }
 
-            public abstract BytesRef Term();
+            public abstract BytesRef Term { get; }
 
             /// <summary></summary>
             /// <remarks>Only implemented if {@link TermsIndexReaderBase.supportsOrd()}

             /// returns true</remarks>
             /// <returns></returns>
-            public abstract long Seek(long ord);
+            public abstract long Ord { get; set; }
 
             /// <summary></summary>
             /// <remarks>Only implemented if {@link TermsIndexReaderBase.supportsOrd()}

             /// returns true</remarks>
             /// <returns></returns>
-            public abstract long Ord();
+            public abstract long Seek(long ord);
+        
+            /// <summary> 
+            /// Seeks to "largest" indexed term that's less than or equal
+            /// to term; returns file pointer index (into the main
+            /// terms index file) for that term 
+            /// </summary>
+            public abstract long Seek(BytesRef term);
         }
 
-        /// <summary>Returns approximate RAM bytes used</summary>
-        public abstract long RamBytesUsed();
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b6b784fb/src/Lucene.Net.Codecs/BlockTerms/TermsIndexWriterBase.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/BlockTerms/TermsIndexWriterBase.cs b/src/Lucene.Net.Codecs/BlockTerms/TermsIndexWriterBase.cs
index bd20e4e..76c5973 100644
--- a/src/Lucene.Net.Codecs/BlockTerms/TermsIndexWriterBase.cs
+++ b/src/Lucene.Net.Codecs/BlockTerms/TermsIndexWriterBase.cs
@@ -15,12 +15,11 @@
  * limitations under the License.
  */
 
-using System;
-using Lucene.Net.Index;
-using Lucene.Net.Util;
-
 namespace Lucene.Net.Codecs.BlockTerms
 {
+    using System;
+    using Index;
+    using Util;
 
     /// <summary>
     ///  Base class for terms index implementations to plug
@@ -34,11 +33,7 @@ namespace Lucene.Net.Codecs.BlockTerms
 
         public abstract FieldWriter AddField(FieldInfo fieldInfo, long termsFilePointer);
 
-        public void Dispose()
-        {
-            //
-        }
-
+        public abstract void Dispose();
 
         /// <summary>Terms index API for a single field</summary>
         public abstract class FieldWriter

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b6b784fb/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs
index 0c97779..7fcc1fa 100644
--- a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs
+++ b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexReader.cs
@@ -15,84 +15,79 @@
  * limitations under the License.
  */
 
+using System.Linq;
+
 namespace Lucene.Net.Codecs.BlockTerms
 {
 
     using System;
     using System.Collections.Generic;
     using System.Diagnostics;
-    using Lucene.Net.Index;
-    using Lucene.Net.Store;
-    using Lucene.Net.Util;
-    using Lucene.Net.Util.Fst;
-
-/** See {@link VariableGapTermsIndexWriter}
- * 
- * @lucene.experimental */
-
+    using Index;
+    using Store;
+    using Util;
+    using Util.Fst;
+    
+    /// <summary>
+    /// See VariableGapTermsIndexWriter
+    /// 
+    /// lucene.experimental
+    /// </summary>
     public class VariableGapTermsIndexReader : TermsIndexReaderBase
     {
+        private readonly int _indexDivisor;
+        private readonly IndexInput _input;       // Closed if indexLoaded is true:
+        private readonly int _version;
 
-        private readonly PositiveIntOutputs fstOutputs = PositiveIntOutputs.Singleton;
-        private readonly int indexDivisor;
-        private readonly IndexInput input;       // Closed if indexLoaded is true:
-        private volatile bool indexLoaded;
-
-        private readonly Dictionary<FieldInfo, FieldIndexData> fields = new Dictionary<FieldInfo,
FieldIndexData>();
-
-        private long dirOffset;                 // start of the field info data
-        private readonly int version;
-        private readonly String segment;
+        private volatile bool _indexLoaded;
+        private long _dirOffset;                 // start of the field info data
 
+        private readonly PositiveIntOutputs _fstOutputs = PositiveIntOutputs.Singleton;
+        private readonly Dictionary<FieldInfo, FieldIndexData> _fields = new Dictionary<FieldInfo,
FieldIndexData>();
+        
         public VariableGapTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment,
int indexDivisor,
             String segmentSuffix, IOContext context)
         {
-            input =
+            _input =
                 dir.OpenInput(
                     IndexFileNames.SegmentFileName(segment, segmentSuffix,
                         VariableGapTermsIndexWriter.TERMS_INDEX_EXTENSION), new IOContext(context,
true));
-            this.segment = segment;
-            bool success = false;
+            var success = false;
 
-            Debug.Debug.Assert((indexDivisor == -1 || indexDivisor > 0);
+            Debug.Assert(indexDivisor == -1 || indexDivisor > 0);
 
             try
             {
 
-                version = readHeader(input);
-                this.indexDivisor = indexDivisor;
+                _version = ReadHeader(_input);
+                _indexDivisor = indexDivisor;
 
-                if (version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM)
-                {
-                    CodecUtil.ChecksumEntireFile(input);
-                }
-
-                SeekDir(in,
-                dirOffset)
-                ;
+                if (_version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM)
+                    CodecUtil.ChecksumEntireFile(_input);
+                
+                SeekDir(_input, _dirOffset);
 
                 // Read directory
-                int numFields = input.ReadVInt();
+                var numFields = _input.ReadVInt();
                 if (numFields < 0)
                 {
-                    throw new CorruptIndexException("invalid numFields: " + numFields + "
(resource=" + input + ")");
+                    throw new CorruptIndexException("invalid numFields: " + numFields + "
(resource=" + _input + ")");
                 }
 
-                for (int i = 0; i < numFields; i++)
+                for (var i = 0; i < numFields; i++)
                 {
-                    final
-                    int field = in.
-                    readVInt();
-                    final
-                    long indexStart = in.
-                    readVLong();
-                    final
-                    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
-                    FieldIndexData previous = fields.put(fieldInfo, new FieldIndexData(fieldInfo,
indexStart));
-                    if (previous != null)
+                    var field = _input.ReadVInt();
+                    var indexStart = _input.ReadVLong();
+                    var fieldInfo = fieldInfos.FieldInfo(field);
+                    
+                    try
+                    {
+                        _fields.Add(fieldInfo, new FieldIndexData(indexStart, this));
+                    }
+                    catch (ArgumentException)
                     {
-                        throw new CorruptIndexException("duplicate field: " + fieldInfo.name
+ " (resource=" +in + ")" )
-                        ;
+                        throw new CorruptIndexException(String.Format("Duplicate Field: {0},
Resource: {1}",
+                            fieldInfo.Name, _input));
                     }
                 }
                 success = true;
@@ -101,75 +96,58 @@ namespace Lucene.Net.Codecs.BlockTerms
             {
                 if (indexDivisor > 0)
                 {
-                in.
-                    close();
-                    in =
-                    null;
+                    _input.Dispose();
+                    _input = null;
                     if (success)
                     {
-                        indexLoaded = true;
+                        _indexLoaded = true;
                     }
                 }
             }
         }
 
-
         private int ReadHeader(IndexInput input)
         {
             int version = CodecUtil.CheckHeader(input, VariableGapTermsIndexWriter.CODEC_NAME,
                 VariableGapTermsIndexWriter.VERSION_START, VariableGapTermsIndexWriter.VERSION_CURRENT);
             if (version < VariableGapTermsIndexWriter.VERSION_APPEND_ONLY)
             {
-                dirOffset = input.ReadLong();
+                _dirOffset = input.ReadLong();
             }
             return version;
         }
 
         public override void Dispose()
         {
-            throw new NotImplementedException();
+            if (_input != null && !_indexLoaded) { 
+                _input.Dispose(); 
+            } 
         }
 
-        public override bool SupportsOrd()
+        public override bool SupportsOrd
         {
-            return false;
+            get { return false; }
         }
-
-        public override int GetDivisor()
+        
+        public override int Divisor
         {
-            return indexDivisor;
+            get { return _indexDivisor; }
         }
 
         public override FieldIndexEnum GetFieldEnum(FieldInfo fieldInfo)
         {
-            FieldIndexData fieldData = fields[fieldInfo];
-            if (fieldData.Fst == null)
-            {
-                return null;
-            }
-            else
-            {
-                return new IndexEnum(fieldData.Fst);
-            }
-        }
-
-        public override void Close()
-        {
-            if (input !=
-                null && !indexLoaded)
-            {
-                input.Close();
-            }
+            FieldIndexData fieldData = _fields[fieldInfo];
+            return fieldData.Fst == null ? null : new IndexEnum(fieldData.Fst);
         }
 
         private void SeekDir(IndexInput input, long dirOffset)
         {
-            if (version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM)
+            if (_version >= VariableGapTermsIndexWriter.VERSION_CHECKSUM)
             {
                 input.Seek(input.Length() - CodecUtil.FooterLength() - 8);
                 dirOffset = input.ReadLong();
             }
-            else if (version >= VariableGapTermsIndexWriter.VERSION_APPEND_ONLY)
+            else if (_version >= VariableGapTermsIndexWriter.VERSION_APPEND_ONLY)
             {
                 input.Seek(input.Length() - 8);
                 dirOffset = input.ReadLong();
@@ -177,132 +155,111 @@ namespace Lucene.Net.Codecs.BlockTerms
             input.Seek(dirOffset);
         }
 
-        public override long RamBytesUsed()
+        public override long RamBytesUsed
         {
-            long sizeInBytes = 0;
-
-            foreach (var entry in fields.Values)
-            {
-                sizeInBytes += entry.RamBytesUsed();
-            }
-
-            return sizeInBytes;
+            get { return _fields.Values.Sum(entry => entry.RamBytesUsed()); }
         }
 
-
         internal class FieldIndexData
         {
 
-            private readonly long indexStart;
+            private readonly long _indexStart;
             // Set only if terms index is loaded:
             public volatile FST<long> Fst;
+            private readonly VariableGapTermsIndexReader _vgtir;
 
-            public FieldIndexData(FieldInfo fieldInfo, long indexStart)
+            public FieldIndexData(long indexStart, VariableGapTermsIndexReader vgtir)
             {
-                this.indexStart = indexStart;
+                _vgtir = vgtir;
+                _indexStart = indexStart;
 
-                if (indexDivisor > 0)
-                {
-                    loadTermsIndex();
-                }
+                if (_vgtir._indexDivisor > 0)
+                    LoadTermsIndex();
             }
 
-            private void loadTermsIndex()
+            private void LoadTermsIndex()
             {
-                if (Fst == null)
+                if (Fst != null) return;
+
+                var clone = (IndexInput) _vgtir._input.Clone();
+                clone.Seek(_indexStart);
+                Fst = new FST<long>(clone, _vgtir._fstOutputs);
+                clone.Dispose();
+
+                /*
+                final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
+                Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
+                Util.toDot(fst, w, false, false);
+                System.out.println("FST INDEX: SAVED to " + dotFileName);
+                w.close();
+                */
+
+                if (_vgtir._indexDivisor > 1)
                 {
-                    IndexInput clone = input.Clone();
-                    clone.Seek(indexStart);
-                    Fst = new FST<>(clone, fstOutputs);
-                    clone.Close();
-
-                    /*
-        final String dotFileName = segment + "_" + fieldInfo.name + ".dot";
-        Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName));
-        Util.toDot(fst, w, false, false);
-        System.out.println("FST INDEX: SAVED to " + dotFileName);
-        w.close();
-        */
-
-                    if (indexDivisor > 1)
+                    // subsample
+                    var scratchIntsRef = new IntsRef();
+                    var outputs = PositiveIntOutputs.Singleton;
+                    var builder = new Builder<long>(FST.INPUT_TYPE.BYTE1, outputs);
+                    var fstEnum = new BytesRefFSTEnum<long>(Fst);
+                    var count = _vgtir._indexDivisor;
+                        
+                    BytesRefFSTEnum<long>.InputOutput<long> result;
+                    while ((result = fstEnum.Next()) != null)
                     {
-                        // subsample
-                        IntsRef scratchIntsRef = new IntsRef();
-                        PositiveIntOutputs outputs = PositiveIntOutputs.GetSingleton();
-                        Builder<long> builder = new Builder<long>(FST.INPUT_TYPE.BYTE1,
outputs);
-                        BytesRefFSTEnum<long> fstEnum = new BytesRefFSTEnum<long>(fst);
-                        BytesRefFSTEnum.InputOutput<long> result;
-                        int count = indexDivisor;
-                        while ((result = fstEnum.Next()) != null)
+                        if (count == _vgtir._indexDivisor)
                         {
-                            if (count == indexDivisor)
-                            {
-                                builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef),
result.Output);
-                                count = 0;
-                            }
-                            count++;
+                            builder.Add(Util.ToIntsRef(result.Input, scratchIntsRef), result.Output);
+                            count = 0;
                         }
-                        Fst = builder.Finish();
+                        count++;
                     }
+                    Fst = builder.Finish();
                 }
             }
 
-            /** Returns approximate RAM bytes used */
-
+            /// <summary>Returns approximate RAM bytes used</summary>
             public long RamBytesUsed()
             {
                 return Fst == null ? 0 : Fst.SizeInBytes();
             }
         }
 
-        internal class IndexEnum : FieldIndexEnum
+        protected class IndexEnum : FieldIndexEnum
         {
-            private readonly BytesRefFSTEnum<long> fstEnum;
-            private BytesRefFSTEnum<long>.InputOutput<long> current;
+            private readonly BytesRefFSTEnum<long> _fstEnum;
+            private BytesRefFSTEnum<long>.InputOutput<long> _current;
 
             public IndexEnum(FST<long> fst)
             {
-                fstEnum = new BytesRefFSTEnum<long>(fst);
+                _fstEnum = new BytesRefFSTEnum<long>(fst);
             }
 
-            public override BytesRef Term()
+            public override BytesRef Term
             {
-                if (current == null)
-                {
-                    return null;
-                }
-                else
-                {
-                    return current.Input;
-                }
+                get { return _current == null ? null : _current.Input; }
             }
 
             public override long Seek(BytesRef target)
             {
-                //System.out.println("VGR: seek field=" + fieldInfo.name + " target=" + target);
-                current = fstEnum.SeekFloor(target);
-                //System.out.println("  got input=" + current.input + " output=" + current.output);
-                return current.Output;
+                _current = _fstEnum.SeekFloor(target);
+                return _current.Output;
             }
 
-            public override long Next()
+            public override long Next
             {
-                //System.out.println("VGR: next field=" + fieldInfo.name);
-                current = fstEnum.Next();
-                if (current == null)
+                get
                 {
-                    //System.out.println("  eof");
-                    return -1;
-                }
-                else
-                {
-                    return current.Output;
+                    _current = _fstEnum.Next();
+                    if (_current == null)
+                        return -1;
+
+                    return _current.Output;
                 }
             }
 
-            public override long Ord()
+            public override long Ord
             {
-                throw new NotImplementedException();
+                get { throw new NotImplementedException(); }
             }
 
             public override long Seek(long ord)

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b6b784fb/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs
index 1a8f9d4..fdfdce5 100644
--- a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs
+++ b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs
@@ -15,352 +15,334 @@
  * limitations under the License.
  */
 
-using System;
-using System.Collections.Generic;
-using System.Diagnostics;
-using Lucene.Net.Codecs;
-using Lucene.Net.Codecs.BlockTerms;
-using Lucene.Net.Index;
-using Lucene.Net.Store;
-using Lucene.Net.Util;
-using Lucene.Net.Util.Fst;
-
 namespace Lucene.Net.Codecs.BlockTerms
 {
+
+    using System;
+    using System.Collections.Generic;
+    using System.Diagnostics;
+    using Codecs;
+    using Index;
+    using Store;
+    using Util;
+    using Util.Fst;
     
-}
-/**
- * Selects index terms according to provided pluggable
- * {@link IndexTermSelector}, and stores them in a prefix trie that's
- * loaded entirely in RAM stored as an FST.  This terms
- * index only supports unsigned byte term sort order
- * (unicode codepoint order when the bytes are UTF8).
- *
- * @lucene.experimental */
-public class VariableGapTermsIndexWriter : TermsIndexWriterBase {
-  protected IndexOutput output;
-
-  /** Extension of terms index file */
-  public const String TERMS_INDEX_EXTENSION = "tiv";
-
- public const String CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
- public const int VERSION_START = 0;
-public const int VERSION_APPEND_ONLY = 1;
-  public const int VERSION_CHECKSUM = 2;
-  public const int VERSION_CURRENT = VERSION_CHECKSUM;
-
-  private readonly List<FSTFieldWriter> fields = new ArrayList<>();
-  
-  @SuppressWarnings("unused") private final FieldInfos fieldInfos; // unread
-  private final IndexTermSelector policy;
-
-  /** 
-   * Hook for selecting which terms should be placed in the terms index.
-   * <p>
-   * {@link #newField} is called at the start of each new field, and
-   * {@link #isIndexTerm} for each term in that field.
-   * 
-   * @lucene.experimental 
-   */
-
-    public abstract class IndexTermSelector
+    /// <summary>
+    /// Selects index terms according to provided pluggable
+    /// <see cref="IndexTermSelector"/>, and stores them in a prefix trie that's
+    /// loaded entirely in RAM stored as an FST.  This terms
+    /// index only supports unsigned byte term sort order
+    /// (unicode codepoint order when the bytes are UTF8).
+    /// 
+    /// @lucene.experimental
+    /// </summary>
+    public class VariableGapTermsIndexWriter : TermsIndexWriterBase
     {
-        /// <summary>
-        /// Called sequentially on every term being written
-        /// returning true if this term should be indexed
-        /// </summary>
-        public abstract bool IsIndexTerm(BytesRef term, TermStats stats);
-        
-        /// <summary>Called when a new field is started</summary>
-        public abstract void NewField(FieldInfo fieldInfo);
-    }
+        protected IndexOutput Output;
 
-    /// <remarks>
-    /// Same policy as {@link FixedGapTermsIndexWriter}
-    /// </remarks>
-    public sealed class EveryNTermSelector : IndexTermSelector
-    {
-        private int count;
-        private readonly int interval;
+        /** Extension of terms index file */
+        public const String TERMS_INDEX_EXTENSION = "tiv";
+        public const String CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
+        public const int VERSION_START = 0;
+        public const int VERSION_APPEND_ONLY = 1;
+        public const int VERSION_CHECKSUM = 2;
+        public const int VERSION_CURRENT = VERSION_CHECKSUM;
 
-        public EveryNTermSelector(int interval)
+        private readonly List<FstFieldWriter> _fields = new List<FstFieldWriter>();
+        private readonly IndexTermSelector _policy;
+
+        /// <summary>
+        /// Hook for selecting which terms should be placed in the terms index
+        /// 
+        /// NewField is called at the start of each new field, and
+        /// IsIndexTerm is called for each term in that field.
+        /// 
+        /// @lucene.experimental
+        /// </summary>
+        public abstract class IndexTermSelector
         {
-            this.interval = interval;
-            // First term is first indexed term:
-            count = interval;
+            /// <summary>
+            /// Called sequentially on every term being written
+            /// returning true if this term should be indexed
+            /// </summary>
+            public abstract bool IsIndexTerm(BytesRef term, TermStats stats);
+
+            /// <summary>Called when a new field is started</summary>
+            public abstract void NewField(FieldInfo fieldInfo);
         }
 
-        public override bool IsIndexTerm(BytesRef term, TermStats stats)
+        /// <remarks>
+        /// Same policy as {@link FixedGapTermsIndexWriter}
+        /// </remarks>
+        public class EveryNTermSelector : IndexTermSelector
         {
-            if (count >= interval)
+            private int _count;
+            private readonly int _interval;
+
+            public EveryNTermSelector(int interval)
             {
-                count = 1;
-                return true;
+                _interval = interval;
+                _count = interval; // First term is first indexed term
             }
-            else
+
+            public override bool IsIndexTerm(BytesRef term, TermStats stats)
             {
-                count++;
+                if (_count >= _interval) 
+                {
+                    _count = 1;
+                    return true;
+                }
+                
+                _count++;
                 return false;
             }
+
+            public override void NewField(FieldInfo fieldInfo)
+            {
+                _count = _interval;
+            }
         }
 
-        public override void NewField(FieldInfo fieldInfo)
+        /// <summary>
+        /// Sets an index term when docFreq >= docFreqThresh, or
+        /// every interval terms.  This should reduce seek time
+        /// to high docFreq terms. 
+        /// </summary>
+        public class EveryNOrDocFreqTermSelector : IndexTermSelector
         {
-            count = interval;
-        }
-    }
+            private int _count;
+            private readonly int _docFreqThresh;
+            private readonly int _interval;
 
-    /// <summary>
-    /// Sets an index term when docFreq >= docFreqThresh, or
-    /// every interval terms.  This should reduce seek time
-    /// to high docFreq terms. 
-    /// </summary>
-    public class EveryNOrDocFreqTermSelector : IndexTermSelector
-    {
-        private int count;
-        private readonly int docFreqThresh;
-        private readonly int interval;
+            public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval)
+            {
+                _interval = interval;
+                _docFreqThresh = docFreqThresh;
+                _count = interval; // First term is first indexed term
+            }
 
-        public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval)
-        {
-            this.interval = interval;
-            this.docFreqThresh = docFreqThresh;
+            public override bool IsIndexTerm(BytesRef term, TermStats stats)
+            {
+                if (stats.DocFreq >= _docFreqThresh || _count >= _interval)
+                {
+                    _count = 1;
+                    return true;
+                }
+                
+                _count++;
+                return false;
+            }
 
-            // First term is first indexed term:
-            count = interval;
+            public override void NewField(FieldInfo fieldInfo)
+            {
+                _count = _interval;
+            }
         }
 
-        public override bool IsIndexTerm(BytesRef term, TermStats stats)
+        // TODO: it'd be nice to let the FST builder prune based
+        // on term count of each node (the prune1/prune2 that it
+        // accepts), and build the index based on that.  This
+        // should result in a more compact terms index, more like
+        // a prefix trie than the other selectors, because it
+        // only stores enough leading bytes to get down to N
+        // terms that may complete that prefix.  It becomes
+        // "deeper" when terms are dense, and "shallow" when they
+        // are less dense.
+        //
+        // However, it's not easy to make that work with this
+        // API, because that pruning doesn't immediately know on
+        // seeing each term whether that term will be a seek point
+        // or not.  It requires some non-causality in the API, ie
+        // only on seeing some number of future terms will the
+        // builder decide which past terms are seek points.
+        // Somehow the API'd need to be able to return a "I don't
+        // know" value, eg like a Future, which only later on is
+        // flipped (frozen) to true or false.
+        //
+        // We could solve this with a 2-pass approach, where the
+        // first pass would build an FSA (no outputs) solely to
+        // determine which prefixes are the 'leaves' in the
+        // pruning. The 2nd pass would then look at this prefix
+        // trie to mark the seek points and build the FST mapping
+        // to the true output.
+        //
+        // But, one downside to this approach is that it'd result
+        // in uneven index term selection.  EG with prune1=10, the
+        // resulting index terms could be as frequent as every 10
+        // terms or as rare as every <maxArcCount> * 10 (eg 2560),
+        // in the extremes.
+
+        public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
         {
-            if (stats.DocFreq >= docFreqThresh || count >= interval)
+            string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name,
state.SegmentSuffix,
+                TERMS_INDEX_EXTENSION);
+            Output = state.Directory.CreateOutput(indexFileName, state.Context);
+            bool success = false;
+
+            try
             {
-                count = 1;
-                return true;
+                _policy = policy;
+                WriteHeader(Output);
+                success = true;
             }
-            else
+            finally
             {
-                count++;
-                return false;
+                if (!success)
+                    IOUtils.CloseWhileHandlingException(Output);
             }
         }
 
-        public override void NewField(FieldInfo fieldInfo)
+        private static void WriteHeader(IndexOutput output)
         {
-            count = interval;
+            CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
         }
-    }
 
-    // TODO: it'd be nice to let the FST builder prune based
-  // on term count of each node (the prune1/prune2 that it
-  // accepts), and build the index based on that.  This
-  // should result in a more compact terms index, more like
-  // a prefix trie than the other selectors, because it
-  // only stores enough leading bytes to get down to N
-  // terms that may complete that prefix.  It becomes
-  // "deeper" when terms are dense, and "shallow" when they
-  // are less dense.
-  //
-  // However, it's not easy to make that work this this
-  // API, because that pruning doesn't immediately know on
-  // seeing each term whether that term will be a seek point
-  // or not.  It requires some non-causality in the API, ie
-  // only on seeing some number of future terms will the
-  // builder decide which past terms are seek points.
-  // Somehow the API'd need to be able to return a "I don't
-  // know" value, eg like a Future, which only later on is
-  // flipped (frozen) to true or false.
-  //
-  // We could solve this with a 2-pass approach, where the
-  // first pass would build an FSA (no outputs) solely to
-  // determine which prefixes are the 'leaves' in the
-  // pruning. The 2nd pass would then look at this prefix
-  // trie to mark the seek points and build the FST mapping
-  // to the true output.
-  //
-  // But, one downside to this approach is that it'd result
-  // in uneven index term selection.  EG with prune1=10, the
-  // resulting index terms could be as frequent as every 10
-  // terms or as rare as every <maxArcCount> * 10 (eg 2560),
-  // in the extremes.
-
-    public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
-    {
-        string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix,
-            TERMS_INDEX_EXTENSION);
-        output = state.Directory.CreateOutput(indexFileName, state.Context);
-        bool success = false;
-        try
+        public override FieldWriter AddField(FieldInfo field, long termsFilePointer)
         {
-            FieldInfos = state.FieldInfos;
-            this.Policy = policy;
-            writeHeader(output);
-            success = true;
+            _policy.NewField(field);
+            var writer = new FstFieldWriter(field, termsFilePointer, this);
+            _fields.Add(writer);
+            return writer;
         }
-        finally
-        {
-            if (!success)
-            {
-                IOUtils.CloseWhileHandlingException(output);
-            }
-        }
-    }
-
-    private void WriteHeader(IndexOutput output)
-    {
-        CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
-    }
-
-    public override FieldWriter AddField(FieldInfo field, long termsFilePointer)
-    {
-        ////System.out.println("VGW: field=" + field.name);
-        Policy.newField(field);
-        FSTFieldWriter writer = new FSTFieldWriter(field, termsFilePointer);
-        fields.Add(writer);
-        return writer;
-    }
-
-    /** NOTE: if your codec does not sort in unicode code
-   *  point order, you must override this method, to simply
-   *  return indexedTerm.length. */
 
-    protected int IndexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm)
-    {
-        // As long as codec sorts terms in unicode codepoint
-        // order, we can safely strip off the non-distinguishing
-        // suffix to save RAM in the loaded terms index.
-        int idxTermOffset = indexedTerm.Offset;
-        int priorTermOffset = priorTerm.Offset;
-        int limit = Math.Min(priorTerm.Length, indexedTerm.Length);
-        for (int byteIdx = 0; byteIdx < limit; byteIdx++)
+        /// <remarks>
+        /// Note: If your codec does not sort in unicode code point order,
+        /// you must override this method to simply return indexedTerm.Length.
+        /// </remarks>
+        protected int IndexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm)
         {
-            if (priorTerm.Bytes[priorTermOffset + byteIdx] != indexedTerm.Bytes[idxTermOffset + byteIdx])
+            // As long as codec sorts terms in unicode codepoint
+            // order, we can safely strip off the non-distinguishing
+            // suffix to save RAM in the loaded terms index.
+
+            int idxTermOffset = indexedTerm.Offset;
+            int priorTermOffset = priorTerm.Offset;
+            int limit = Math.Min(priorTerm.Length, indexedTerm.Length);
+            for (int byteIdx = 0; byteIdx < limit; byteIdx++)
             {
-                return byteIdx + 1;
+                if (priorTerm.Bytes[priorTermOffset + byteIdx] != indexedTerm.Bytes[idxTermOffset + byteIdx])
+                {
+                    return byteIdx + 1;
+                }
             }
-        }
-
-        return Math.Min(1 + priorTerm.Length, indexedTerm.Length);
-    }
 
-    private class FSTFieldWriter : FieldWriter
-    {
-        private readonly Builder<long> fstBuilder;
-        private readonly PositiveIntOutputs fstOutputs;
-        private readonly long startTermsFilePointer;
+            return Math.Min(1 + priorTerm.Length, indexedTerm.Length);
+        }
 
-        public FieldInfo fieldInfo;
-        private FST<long> fst;
-        private long indexStart;
+        private class FstFieldWriter : FieldWriter
+        {
+            private readonly Builder<long> _fstBuilder;
+            private readonly long _startTermsFilePointer;
+            private readonly BytesRef _lastTerm = new BytesRef();
+            private readonly IntsRef _scratchIntsRef = new IntsRef();
+            private readonly VariableGapTermsIndexWriter _vgtiw;
 
-        private readonly BytesRef lastTerm = new BytesRef();
-        private bool first = true;
+            private bool _first = true;
 
-        public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer)
-        {
-            this.fieldInfo = fieldInfo;
-            fstOutputs = PositiveIntOutputs.Singleton;
-            fstBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs);
-            indexStart = output.FilePointer;
-            ////System.out.println("VGW: field=" + fieldInfo.name);
-
-            // Always put empty string in
-            fstBuilder.Add(new IntsRef(), termsFilePointer);
-            startTermsFilePointer = termsFilePointer;
-        }
+            public long IndexStart { get; private set; }
+            public FieldInfo FieldInfo { get; private set; }
+            public FST<long> Fst { get; private set; }
 
-        public override bool CheckIndexTerm(BytesRef text, TermStats stats)
-        {
-            //System.out.println("VGW: index term=" + text.utf8ToString());
-            // NOTE: we must force the first term per field to be
-            // indexed, in case policy doesn't:
-            if (policy.isIndexTerm(text, stats) || first)
+            public FstFieldWriter(FieldInfo fieldInfo, long termsFilePointer, VariableGapTermsIndexWriter vgtiw)
             {
-                first = false;
-                //System.out.println("  YES");
-                return true;
+                _vgtiw = vgtiw;
+                FieldInfo = fieldInfo;
+                PositiveIntOutputs fstOutputs = PositiveIntOutputs.Singleton;
+                _fstBuilder = new Builder<long>(FST.INPUT_TYPE.BYTE1, fstOutputs);
+                IndexStart = _vgtiw.Output.FilePointer;
+
+                // Always put empty string in
+                _fstBuilder.Add(new IntsRef(), termsFilePointer);
+                _startTermsFilePointer = termsFilePointer;
             }
-            else
+
+            public override bool CheckIndexTerm(BytesRef text, TermStats stats)
             {
-                lastTerm.CopyBytes(text);
+                // NOTE: we must force the first term per field to be
+                // indexed, in case policy doesn't:
+                if (_vgtiw._policy.IsIndexTerm(text, stats) || _first)
+                {
+                    _first = false;
+                    return true;
+                }
+            
+                _lastTerm.CopyBytes(text);
                 return false;
             }
-        }
-
-        private readonly IntsRef scratchIntsRef = new IntsRef();
 
-        public override void Add(BytesRef text, TermStats stats, long termsFilePointer)
-        {
-            if (text.Length == 0)
+            public override void Add(BytesRef text, TermStats stats, long termsFilePointer)
             {
-                // We already added empty string in ctor
-                Debug.Assert(termsFilePointer == startTermsFilePointer);
-                return;
-            }
-            int lengthSave = text.Length;
-            text.Length = IndexedTermPrefixLength(lastTerm, text);
-            try
-            {
-                fstBuilder.Add(Util.ToIntsRef(text, scratchIntsRef), termsFilePointer);
-            }
-            finally
-            {
-                text.Length = lengthSave;
+                if (text.Length == 0)
+                {
+                    // We already added empty string in ctor
+                    Debug.Assert(termsFilePointer == _startTermsFilePointer);
+                    return;
+                }
+                int lengthSave = text.Length;
+                text.Length = _vgtiw.IndexedTermPrefixLength(_lastTerm, text);
+                try
+                {
+                    _fstBuilder.Add(Util.ToIntsRef(text, _scratchIntsRef), termsFilePointer);
+                }
+                finally
+                {
+                    text.Length = lengthSave;
+                }
+                _lastTerm.CopyBytes(text);
             }
-            lastTerm.CopyBytes(text);
-        }
 
-        public override void Finish(long termsFilePointer)
-        {
-            fst = fstBuilder.Finish();
-            if (fst != null)
+            public override void Finish(long termsFilePointer)
             {
-                fst.Save(output);
+                Fst = _fstBuilder.Finish();
+                if (Fst != null)
+                    Fst.Save(_vgtiw.Output);
             }
         }
-    }
 
-    public void Dispose()
-    {
-        if (output != null)
+        public override void Dispose()
         {
+            if (Output == null) return;
+
             try
             {
-                long dirStart = output.FilePointer;
-                int fieldCount = fields.Size;
+                long dirStart = Output.FilePointer;
+                int fieldCount = _fields.Count;
 
                 int nonNullFieldCount = 0;
                 for (int i = 0; i < fieldCount; i++)
                 {
-                    FSTFieldWriter field = fields[i];
-                    if (field.fst != null)
+                    FstFieldWriter field = _fields[i];
+                    if (field.Fst != null)
                     {
                         nonNullFieldCount++;
                     }
                 }
 
-                output.WriteVInt(nonNullFieldCount);
+                Output.WriteVInt(nonNullFieldCount);
                 for (int i = 0; i < fieldCount; i++)
                 {
-                    FSTFieldWriter field = fields[i];
+                    FstFieldWriter field = _fields[i];
                     if (field.Fst != null)
                     {
-                        output.WriteVInt(field.fieldInfo.Number);
-                        output.WriteVLong(field.indexStart);
+                        Output.WriteVInt(field.FieldInfo.Number);
+                        Output.WriteVLong(field.IndexStart);
                     }
                 }
-                writeTrailer(dirStart);
-                CodecUtil.WriteFooter(output);
+                WriteTrailer(dirStart);
+                CodecUtil.WriteFooter(Output);
             }
             finally
             {
-                output.Dispose();
-                output = null;
+                Output.Dispose();
+                Output = null;
             }
         }
-    }
 
-    private void WriteTrailer(long dirStart)
-    {
-        output.WriteLong(dirStart);
+        private void WriteTrailer(long dirStart)
+        {
+            Output.WriteLong(dirStart);
+        }
+
     }
-}
+
+}
\ No newline at end of file


Mime
View raw message