lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mhern...@apache.org
Subject [48/50] [abbrv] git commit: fixed fst errors
Date Tue, 24 Sep 2013 18:33:24 GMT
fixed fst errors


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b7ca14a7
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b7ca14a7
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b7ca14a7

Branch: refs/heads/branch_4x
Commit: b7ca14a77922a0e4d216781f74b9c1379f350698
Parents: 96a95e3
Author: James Blair <jmblair7@gmail.com>
Authored: Tue Aug 13 15:49:10 2013 -0400
Committer: Paul Irwin <paulirwin@gmail.com>
Committed: Tue Aug 13 16:18:11 2013 -0400

----------------------------------------------------------------------
 src/core/Util/Fst/FST.cs | 478 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 423 insertions(+), 55 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b7ca14a7/src/core/Util/Fst/FST.cs
----------------------------------------------------------------------
diff --git a/src/core/Util/Fst/FST.cs b/src/core/Util/Fst/FST.cs
index c6cbcd6..70f4e8e 100644
--- a/src/core/Util/Fst/FST.cs
+++ b/src/core/Util/Fst/FST.cs
@@ -9,13 +9,15 @@ using Lucene.Net.Codecs;
 
 namespace Lucene.Net.Util.Fst
 {
-    public class FST<T> : FST
+    public sealed class FST<T> : FST
     {
         private readonly INPUT_TYPE inputType;
         public INPUT_TYPE InputType { get { return inputType; } }
 
         private int[] bytesPerArc = new int[0];
 
+        // if non-null, this FST accepts the empty string and
+        // produces this output
         private T emptyOutput;
         public T EmptyOutput
         {
@@ -37,18 +39,24 @@ namespace Lucene.Net.Util.Fst
         private readonly Outputs<T> outputs;
         public Outputs<T> Outputs { get { return outputs; } }
 
+        // Used for the BIT_TARGET_NEXT optimization (whereby
+        // instead of storing the address of the target node for
+        // a given arc, we mark a single bit noting that the next
+        // node in the byte[] is the target node):
         private long lastFrozenNode;
 
         private readonly T NO_OUTPUT;
 
         internal long NodeCount { get; set; }
-
         public long ArcCount { get; set; }
         public long ArcWithOutputCount { get; set; }
 
         private readonly bool packed;
         private PackedInts.IReader nodeRefToAddress;
 
+        /// <summary>
+        /// If arc has this label then that arc is final/accepted
+        /// </summary>
         public const int END_LABEL = -1;
 
         private readonly bool allowArrayArcs;
@@ -63,10 +71,15 @@ namespace Lucene.Net.Util.Fst
 
         private GrowableWriter NodeAddress;
 
+        // TODO: we could be smarter here, and prune periodically
+        // as we go; high in-count nodes will "usually" become
+        // clear early on:
         private GrowableWriter InCounts;
 
         private readonly int Version;
 
+        // make a new empty FST, for building; Builder invokes
+        // this ctor
         internal FST(INPUT_TYPE inputType, Outputs<T> outputs, bool willPackFST, float
acceptableOverheadRatio,
             bool allowArrayArcs, int bytesPageBits)
         {
@@ -75,6 +88,8 @@ namespace Lucene.Net.Util.Fst
             this.allowArrayArcs = allowArrayArcs;
             Version = VERSION_CURRENT;
             bytes = new BytesStore(bytesPageBits);
+            // pad: ensure no node gets address 0 which is reserved to mean
+            // the stop state w/ no arcs
             bytes.WriteByte((Byte)0);
             NO_OUTPUT = outputs.GetNoOutput();
             if (willPackFST)
@@ -95,31 +110,53 @@ namespace Lucene.Net.Util.Fst
 
         public static readonly int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 :
28;
 
+        /// <summary>
+        /// Load a previously saved FST.
+        /// </summary>
+        /// <param name="input"></param>
+        /// <param name="outputs"></param>
         public FST(DataInput input, Outputs<T> outputs)
             : this(input, outputs, DEFAULT_MAX_BLOCK_BITS)
         {
         }
 
+        /// <summary>
+        /// Load a previously saved FST; maxBlockBits allows you to
+        /// control the size of the byte[] pages used to hold the FST bytes.
+        /// </summary>
+        /// <param name="dataInput"></param>
+        /// <param name="outputs"></param>
+        /// <param name="maxBlockBits"></param>
         public FST(DataInput dataInput, Outputs<T> outputs, int maxBlockBits)
         {
             this.outputs = outputs;
 
-            if (maxBlockBits < 1 || maxBlockBits > 30) throw new ArgumentException("maxBlockBits
should be 1 .. 30; got " + maxBlockBits, "maxBlockBits");
-            Version = CodecUtil.CheckHeader(dataInput, FILE_FORMAT_NAME, VERSIONpacked, VERSION_VINT_TARGET);
+            if (maxBlockBits < 1 || maxBlockBits > 30)
+                throw new ArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits,
"maxBlockBits");
+
+            // NOTE: only reads most recent format; we don't have
+            // back-compat promise for FSTs (they are experimental):
+            Version = CodecUtil.CheckHeader(dataInput, FILE_FORMAT_NAME, VERSION_PACKED,
VERSION_VINT_TARGET);
             packed = dataInput.ReadByte() == 1;
 
             if (dataInput.ReadByte() == 1)
             {
+                // accepts empty string
+                // 1 KB blocks:
                 var emptyBytes = new BytesStore(10);
                 int numBytes = dataInput.ReadVInt();
                 emptyBytes.CopyBytes(dataInput, numBytes);
 
+                // De-serialize empty-string output:
                 FST.BytesReader reader;
                 if (packed)
                     reader = emptyBytes.GetForwardReader();
                 else
                 {
                     reader = emptyBytes.GetReverseReader();
+                    // NoOutputs uses 0 bytes when writing its output,
+                    // so we have to check here else BytesStore gets
+                    // angry:
                     if (numBytes > 0)
                         reader.Position = numBytes - 1;
                 }
@@ -159,9 +196,16 @@ namespace Lucene.Net.Util.Fst
 
             CacheRootArcs();
 
+            // NOTE: bogus because this is only used during
+            // building; we need to break out mutable FST from
+            // immutable
             allowArrayArcs = false;
         }
 
+        /// <summary>
+        /// Returns bytes used to represent the FST.
+        /// </summary>
+        /// <returns></returns>
         public long SizeInBytes()
         {
             var size = bytes.GetPosition();
@@ -197,6 +241,7 @@ namespace Lucene.Net.Util.Fst
             return node;
         }
 
+        // Caches first 128 labels
         private void CacheRootArcs()
         {
             cachedRootArcs = new Arc<T>[0x80];
@@ -238,6 +283,8 @@ namespace Lucene.Net.Util.Fst
             else
                 output.WriteByte((sbyte)0);
 
+            // TODO: really we should encode this as an arc, arriving
+            // to the root node, instead of special casing here:
             if (emptyOutput != null)
             {
                 // Accepts empty string
@@ -298,10 +345,14 @@ namespace Lucene.Net.Util.Fst
             bytes.WriteTo(output);
         }
 
-        public void Save(FileStream fileStream)
+        /// <summary>
+        /// Writes an automaton to a file
+        /// </summary>
+        /// <param name="file"></param>
+        public void Save(FileInfo fileInfo)
         {
             var success = false;
-            var bs = new BufferedStream(fileStream);
+            var bs = new BufferedStream(new FileStream(fileInfo.FullName));
             try
             {
                 Save(new OutputStreamDataOutput(bs));
@@ -316,9 +367,16 @@ namespace Lucene.Net.Util.Fst
             }
         }
 
-        public static FST<TMethod> Read<TMethod>(FileStream fileStream, Outputs<TMethod>
outputs) where TMethod : class
+        /// <summary>
+        /// Reads an automaton from a file
+        /// </summary>
+        /// <typeparam name="TMethod"></typeparam>
+        /// <param name="fileInfo"></param>
+        /// <param name="outputs"></param>
+        /// <returns></returns>
+        public static FST<TMethod> Read<TMethod>(FileStream fileInfo, Outputs<TMethod>
outputs) where TMethod : class
         {
-            var bs = new BufferedStream(fileStream);
+            var bs = new BufferedStream(new FileStream(fileInfo));
             var success = false;
             try
             {
@@ -373,11 +431,20 @@ namespace Lucene.Net.Util.Fst
             return v;
         }
 
+        /// <summary>
+        /// Returns true if the node at this address
+        /// has any outgoing arcs
+        /// </summary>
+        /// <typeparam name="TMethod"></typeparam>
+        /// <param name="arc"></param>
+        /// <returns></returns>
         public static bool TargetHasArcs<TMethod>(Arc<TMethod> arc)
         {
             return arc.Target > 0;
         }
 
+        // Serializes new node by appending its bytes to the end
+        // of the current byte[]
         internal long AddNode(Builder<T>.UnCompiledNode<T> nodeIn)
         {
             if (nodeIn.NumArcs == 0)
@@ -408,6 +475,9 @@ namespace Lucene.Net.Util.Fst
                     flags += BIT_LAST_ARC;
 
                 if (lastFrozenNode == target.Node && !doFixedArray)
+                    // TODO: for better perf (but more RAM used) we
+                    // could avoid this except when arc is "near" the
+                    // last arc:
                     flags += BIT_TARGET_NEXT;
 
                 if (arc.IsFinal)
@@ -418,7 +488,6 @@ namespace Lucene.Net.Util.Fst
                 }
                 else
                 {
-                    // TODO: Assert is correct here?
                     //Debug.Assert(arc.NextFinalOutput == NO_OUTPUT);
                 }
 
@@ -453,6 +522,9 @@ namespace Lucene.Net.Util.Fst
                     bytes.WriteVLong(target.Node);
                 }
 
+                // just write the arcs "like normal" on first pass,
+                // but record how many bytes each one took, and max
+                // byte size:
                 if (doFixedArray)
                 {
                     bytesPerArc[arcIdx] = (int)(bytes.GetPosition() - lastArcStart);
@@ -461,14 +533,36 @@ namespace Lucene.Net.Util.Fst
                 }
             }
 
+            // TODO: try to avoid wasteful cases: disable doFixedArray in that case
+            /* 
+             * 
+             * LUCENE-4682: what is a fair heuristic here?
+             * It could involve some of these:
+             * 1. how "busy" the node is: nodeIn.inputCount relative to frontier[0].inputCount?
+             * 2. how much binSearch saves over scan: nodeIn.numArcs
+             * 3. waste: numBytes vs numBytesExpanded
+             * 
+             * the one below just looks at #3
+            if (doFixedArray) {
+              // rough heuristic: make this 1.25 "waste factor" a parameter to the phd ctor????
+              int numBytes = lastArcStart - startAddress;
+              int numBytesExpanded = maxBytesPerArc * nodeIn.numArcs;
+              if (numBytesExpanded > numBytes*1.25) {
+                doFixedArray = false;
+              }
+            }
+            */
+
             if (doFixedArray)
             {
                 var MAX_HEADER_SIZE = 11;
-                // TODO: assert correct here?
-                Debug.Assert(maxBytesPerArc > 0);
+                // assert maxBytesPerArc > 0;
 
+                // create the header
+                // TODO: clean this up: or just rewind+reuse and deal with it
                 var header = new byte[MAX_HEADER_SIZE];
                 var bad = new ByteArrayDataOutput(header);
+                // write a "false" first arc:
                 bad.WriteByte(ARCS_AS_FIXED_ARRAY);
                 bad.WriteVInt(nodeIn.NumArcs);
                 bad.WriteVInt(maxBytesPerArc);
@@ -476,10 +570,10 @@ namespace Lucene.Net.Util.Fst
 
                 long fixedArrayStart = startAddress + headerLen;
 
+                // expand the arcs in place, backwards
                 var srcPos = bytes.GetPosition();
                 var destPos = fixedArrayStart + nodeIn.NumArcs * maxBytesPerArc;
-                // TODO: assert correct here?
-                Debug.Assert(destPos >= srcPos);
+                // assert destPos >= srcPos
                 if (destPos > srcPos)
                 {
                     bytes.SkipBytes((int)(destPos - srcPos));
@@ -517,6 +611,7 @@ namespace Lucene.Net.Util.Fst
             long node;
             if (NodeAddress != null)
             {
+                // Nodes are addressed by 1+ord:
                 if ((int)NodeCount == NodeAddress.Size())
                 {
                     NodeAddress =
@@ -536,6 +631,12 @@ namespace Lucene.Net.Util.Fst
             return node;
         }
 
+        /// <summary>
+        /// Fills virtual 'start' arc, ie, an empty incoming arc
+        /// to the FST's start node
+        /// </summary>
+        /// <param name="arc"></param>
+        /// <returns></returns>
         public Arc<T> GetFirstArc(Arc<T> arc)
         {
             if (EmptyOutput != null)
@@ -550,15 +651,26 @@ namespace Lucene.Net.Util.Fst
             }
             arc.Output = NO_OUTPUT;
 
+            // If there are no nodes, ie, the FST only accepts the
+            // empty string, then startNode is 0
+            arc.Target = startNode;
             return arc;
         }
 
+        /// <summary>
+        /// Follows the <code>follow</code> arc and reads the last
+        /// arc of its target; this changes the provided
+        /// <code>arc</code> (2nd arg) in-place and returns it
+        /// </summary>
+        /// <param name="follow"></param>
+        /// <param name="arc"></param>
+        /// <param name="input"></param>
+        /// <returns>Returns the second argument</returns>
         public Arc<T> ReadLastTargetArc(Arc<T> follow, Arc<T> arc, FST.BytesReader
input)
         {
             if (!TargetHasArcs(follow))
             {
-                // TODO: assert correct here?
-                Debug.Assert(follow.IsFinal());
+                // assert follow.isFinal();
                 arc.Label = END_LABEL;
                 arc.Target = FINAL_END_NODE;
                 arc.Output = follow.NextFinalOutput;
@@ -572,6 +684,7 @@ namespace Lucene.Net.Util.Fst
                 var b = input.ReadByte();
                 if (b == ARCS_AS_FIXED_ARRAY)
                 {
+                    // array: jump straight to end
                     arc.NumArcs = input.ReadVInt();
                     if (packed || Version >= VERSION_VINT_TARGET)
                         arc.BytesPerArc = input.ReadVInt();
@@ -584,15 +697,21 @@ namespace Lucene.Net.Util.Fst
                 else
                 {
                     arc.Flags = (sbyte)b;
+                    // non-array: linear scan
                     arc.BytesPerArc = 0;
 
                     while (!arc.IsLast())
                     {
+                        // skip this arc:
                         ReadLabel(input);
                         if (arc.Flag(BIT_ARC_HAS_OUTPUT))
+                        {
                             Outputs.Read(input);
+                        }
                         if (arc.Flag(BIT_ARC_HAS_FINAL_OUTPUT))
+                        {
                             Outputs.ReadFinalOutput(input);
+                        }
                         if (arc.Flag(BIT_STOP_NODE))
                         {
                         }
@@ -606,14 +725,13 @@ namespace Lucene.Net.Util.Fst
 
                         arc.Flags = (sbyte)input.ReadByte();
                     }
-
+                    // Undo the byte flags we read:
                     input.SkipBytes(-1);
                     arc.NextArc = input.Position;
                 }
 
                 ReadNextRealArc(arc, input);
-                // TODO: assert is correct here?
-                //Debug.Assert(arc.IsLast());
+                // assert arc.isLast();
                 return arc;
             }
         }
@@ -629,10 +747,20 @@ namespace Lucene.Net.Util.Fst
             return target;
         }
 
+        /// <summary>
+        /// Follow the <code>follow</code> arc and read the first arc of its
target;
+        /// this changes the provided <code>arc</code> (2nd arg) in-place and
returns it.
+        /// </summary>
+        /// <param name="follow"></param>
+        /// <param name="arc"></param>
+        /// <param name="input"></param>
+        /// <returns>Returns the second argument (<code>arc</code>)</returns>
         public Arc<T> ReadFirstTargetArc(Arc<T> follow, Arc<T> arc, FST.BytesReader
input)
         {
+            // int pos = address;
             if (follow.IsFinal())
             {
+                // Insert "fake" final first arc:
                 arc.Label = END_LABEL;
                 arc.Output = follow.NextFinalOutput;
                 arc.Flags = (sbyte)BIT_FINAL_ARC;
@@ -641,6 +769,7 @@ namespace Lucene.Net.Util.Fst
                 else
                 {
                     arc.Node = follow.Target;
+                    // NOTE: nextArc is a node (not an address!) in this case:
                     arc.NextArc = follow.Target;
                 }
                 arc.Target = FINAL_END_NODE;
@@ -679,6 +808,13 @@ namespace Lucene.Net.Util.Fst
             return ReadNextRealArc(arc, input);
         }
 
+        /// <summary>
+        /// Checks if <code>arc</code>'s target state is in expanded (or vector)
format.
+        /// </summary>
+        /// <param name="follow"></param>
+        /// <param name="input"></param>
+        /// <returns>Returns <code>true</code> if <code>arc</code>
points to a state 
+        /// in an expanded array format.</returns>
         internal bool IsExpandedTarget(Arc<T> follow, FST.BytesReader input)
         {
             if (!TargetHasArcs(follow))
@@ -690,10 +826,17 @@ namespace Lucene.Net.Util.Fst
             }
         }
 
+        /// <summary>
+        /// In-place read; returns the arc.
+        /// </summary>
+        /// <param name="arc"></param>
+        /// <param name="input"></param>
+        /// <returns></returns>
         public Arc<T> ReadNextArc(Arc<T> arc, FST.BytesReader input)
         {
             if (arc.Label == END_LABEL)
             {
+                // This was a fake isnerted "final" arc
                 if (arc.NextArc <= 0)
                     throw new ArgumentException("cannot readNextArc when arc.isLast()=true");
                 return ReadFirstRealTargetArc(arc.NextArc, arc, input);
@@ -704,10 +847,17 @@ namespace Lucene.Net.Util.Fst
             }
         }
 
+        /// <summary>
+        /// Peeks at next arc's label; does not alter arc. Do
+        /// not call this if arc.IsLast()!
+        /// </summary>
+        /// <param name="arc"></param>
+        /// <param name="input"></param>
+        /// <returns></returns>
         public int ReadNextArcLabel(Arc<T> arc, FST.BytesReader input)
         {
-            // TODO: assert correct here?
-            Debug.Assert(!arc.IsLast());
+            if (arc.IsLast)
+                throw new ArgumentException("cannot readNextArc when arc.isLast()=true");
 
             if (arc.Label == END_LABEL)
             {
@@ -719,6 +869,7 @@ namespace Lucene.Net.Util.Fst
                 {
                     input.ReadVInt();
 
+                    // Skip bytesPerArc:
                     if (packed || Version >= VERSION_VINT_TARGET)
                         input.ReadVInt();
                     else
@@ -726,60 +877,101 @@ namespace Lucene.Net.Util.Fst
                 }
                 else
                 {
+                    input.Position = pos;
+                }
+            }
+            else
+            {
+                if (arc.BytesPerArc != 0)
+                {
+                    // arcs are at fixed entries
+                    input.Position = arc.PosArcsStart;
+                    input.SkipBytes((1 + arc.ArcIdx) * arc.BytesPerArc);
+                }
+                else
+                {
+                    // arcs are packed
                     input.Position = arc.NextArc;
                 }
             }
 
+            // skip flags
             input.ReadByte();
             return ReadLabel(input);
         }
 
+        /// <summary>
+        /// Never returns null, but you should never call this if
+        /// arc.IsLast() is true.
+        /// </summary>
+        /// <param name="arc"></param>
+        /// <param name="input"></param>
+        /// <returns></returns>
         public Arc<T> ReadNextRealArc(Arc<T> arc, FST.BytesReader input)
         {
+            // TODO: can't assert this because we call from readFirstArc
+            // assert !flag(arc.flags, BIT_LAST_ARC);
+
+            // this is a continuing arc in a fixed array
             if (arc.BytesPerArc != 0)
             {
+                // arcs are at fixed entries
                 arc.ArcIdx++;
-                // TODO: assert correct here?
-                Debug.Assert(arc.ArcIdx < arc.NumArcs);
+                // assert arc.arcIdx < arc.numArcs;
                 input.Position = arc.PosArcsStart;
                 input.SkipBytes(arc.ArcIdx * arc.BytesPerArc);
             }
             else
             {
+                // arcs are packed
                 input.Position = arc.NextArc;
             }
             arc.Flags = (sbyte)input.ReadByte();
             arc.Label = ReadLabel(input);
 
             if (arc.Flag(BIT_ARC_HAS_OUTPUT))
+            {
                 arc.Output = Outputs.Read(input);
+            }
             else
+            {
                 arc.Output = Outputs.GetNoOutput();
+            }
 
             if (arc.Flag(BIT_ARC_HAS_FINAL_OUTPUT))
+            {
                 arc.NextFinalOutput = Outputs.ReadFinalOutput(input);
+            }
             else
+            {
                 arc.NextFinalOutput = Outputs.GetNoOutput();
+            }
 
             if (arc.Flag(BIT_STOP_NODE))
             {
                 if (arc.Flag(BIT_FINAL_ARC))
+                {
                     arc.Target = FINAL_END_NODE;
+                }
                 else
+                {
                     arc.Target = NON_FINAL_END_NODE;
-
+                }
                 arc.NextArc = input.Position;
             }
             else if (arc.Flag(BIT_TARGET_NEXT))
             {
                 arc.NextArc = input.Position;
 
+                // TODO: would be nice to make this lazy -- maybe
+                // caller doesn't need the target and is scanning arcs...
                 if (NodeAddress == null)
                 {
                     if (!arc.Flag(BIT_LAST_ARC))
                     {
                         if (arc.BytesPerArc == 0)
                         {
+                            // must scan
                             SeekToNextNode(input);
                         }
                         else
@@ -793,8 +985,7 @@ namespace Lucene.Net.Util.Fst
                 else
                 {
                     arc.Target = arc.Node - 1;
-                    // TODO: assert correct here?
-                    Debug.Assert(arc.Target > 0);
+                    // assert arc.target > 0
                 }
             }
             else
@@ -805,14 +996,17 @@ namespace Lucene.Net.Util.Fst
                     var code = input.ReadVLong();
                     if (arc.Flag(BIT_TARGET_DELTA))
                     {
+                        // Address is delta-coded from current address:
                         arc.Target = pos + code;
                     }
                     else if (code < nodeRefToAddress.Size())
                     {
+                        // Deref
                         arc.Target = nodeRefToAddress.Get((int)code);
                     }
                     else
                     {
+                        // Absolute
                         arc.Target = code;
                     }
                 }
@@ -825,10 +1019,22 @@ namespace Lucene.Net.Util.Fst
             return arc;
         }
 
+        // TODO: could we somehow [partially] tableize arc lookups
+        // look automaton?
+
+        /// <summary>
+        /// Finds an arc leaving the incoming arc, replacing the arc in place.
+        /// This returns null if the arc was not found, else the incoming arc.
+        /// </summary>
+        /// <param name="labelToMatch"></param>
+        /// <param name="follow"></param>
+        /// <param name="arc"></param>
+        /// <param name="input"></param>
+        /// <returns></returns>
         public Arc<T> FindTargetArc(int labelToMatch, Arc<T> follow, Arc<T>
arc, FST.BytesReader input)
         {
-            // TODO: appropriate error message
-            if (cachedRootArcs == null) throw new InvalidOperationException("cachedRootArcs
cannot be null");
+            if (cachedRootArcs == null)
+                throw new InvalidOperationException("cachedRootArcs cannot be null");
 
             if (labelToMatch == END_LABEL)
             {
@@ -841,6 +1047,7 @@ namespace Lucene.Net.Util.Fst
                     else
                     {
                         arc.Flags = 0;
+                        // NOTE: nextArc is a node (not an address!) in this case:
                         arc.NextArc = follow.Target;
                         arc.Node = follow.Target;
                     }
@@ -854,6 +1061,7 @@ namespace Lucene.Net.Util.Fst
                 }
             }
 
+            // Short-circuit if this arc is in the root arc cache:
             if (follow.Target == startNode && labelToMatch < cachedRootArcs.Length)
             {
                 var result = cachedRootArcs[labelToMatch];
@@ -867,7 +1075,9 @@ namespace Lucene.Net.Util.Fst
             }
 
             if (!TargetHasArcs(follow))
+            {
                 return null;
+            }
 
             input.Position = GetNodeAddress(follow.Target);
 
@@ -875,6 +1085,7 @@ namespace Lucene.Net.Util.Fst
 
             if (input.ReadByte() == ARCS_AS_FIXED_ARRAY)
             {
+                // Arcs are full array; do binary search:
                 arc.NumArcs = input.ReadVInt();
                 if (packed || Version >= VERSION_VINT_TARGET)
                     arc.BytesPerArc = input.ReadVInt();
@@ -903,6 +1114,7 @@ namespace Lucene.Net.Util.Fst
                 return null;
             }
 
+            // Linear scan
             ReadFirstRealTargetArc(follow.Target, arc, input);
 
             while (true)
@@ -938,8 +1150,6 @@ namespace Lucene.Net.Util.Fst
 
                 if (Flag(flags, BIT_LAST_ARC))
                     return;
-
-
             }
         }
 
@@ -949,6 +1159,20 @@ namespace Lucene.Net.Util.Fst
             return 1 + NodeCount;
         }
 
+
+        /// <summary>
+        /// Nodes will be expanded if their depth (distance from the root node) is
+        /// &lt;= this value and their number of arcs is &gt=
+        /// (FIXED_ARRAY_NUM_ARCS_SHALLOW).
+        /// 
+        /// Fixed array consumes more RAM but enables binary search on the arcs
+        /// (instead of a linear scan) on lookup by arc label.
+        /// </summary>
+        /// <param name="node"></param>
+        /// <returns><code>true</code> if <code>node</code>
should be stored
+        /// in an expanded (array) form.</returns>
+        /// <see cref="FIXED_ARRAY_NUM_ARCS_DEEP"/>
+        /// <see cref="Builder.UnCompiledNode"/>
         private bool ShouldExpand(Builder<T>.UnCompiledNode<T> node)
         {
             return allowArrayArcs &&
@@ -956,11 +1180,22 @@ namespace Lucene.Net.Util.Fst
                     node.NumArcs >= FIXED_ARRAY_NUM_ARCS_DEEP);
         }
 
+        /// <summary>
+        /// Returns a BytesReader for this FST, positioned at position 0.
+        /// </summary>
+        /// <returns></returns>
         public FST.BytesReader GetBytesReader()
         {
-            return packed ?
-                bytes.GetForwardReader() :
-                bytes.GetReverseReader();
+            BytesReader input;
+            if (packed)
+            {
+                input = bytes.GetForwardReader();
+            }
+            else
+            {
+                input = bytes.GetReverseReader();
+            }
+            return input;
         }
 
 
@@ -977,6 +1212,7 @@ namespace Lucene.Net.Util.Fst
         //    public abstract void SkipBytes(int count);
         //}
 
+        // Creates a packed FST
         private FST(INPUT_TYPE inputType, Outputs<T> outputs, int bytesPageBits)
         {
             Version = VERSION_CURRENT;
@@ -989,9 +1225,42 @@ namespace Lucene.Net.Util.Fst
             allowArrayArcs = false;
         }
 
+        /// <summary>
+        /// Expert: creates an FST by packing this one. This
+        /// process requires substantial additional RAM (currently
+        /// up to ~8 bytes per node depending on 
+        /// <code>acceptableOverheadRatio</code>, but then should
+        /// produce a smaller FST.
+        /// 
+        /// The implementation of this method uses ideas from
+        /// http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf
+        /// Smaller representation 
+        /// which describes this techniques to reduce the size of a FST.
+        /// However, this is not a strict implementation of the
+        /// algorithms described in this paper.
+        /// </summary>
+        /// <param name="minInCountDeref"></param>
+        /// <param name="maxDerefNodes"></param>
+        /// <param name="acceptableOverheadRatio"></param>
+        /// <returns></returns>
         internal FST<T> Pack(int minInCountDeref, int maxDerefNodes, float acceptableOverheadRatio)
         {
-            if (NodeAddress == null) throw new ArgumentException("this FST was not built
with willPackFST=true");
+            // NOTE: maxDerefNodes is intentionally int: we cannot
+            // support > 2.1B deref nodes
+
+            // TODO: other things to try
+            //   - renumber the nodes to get more next / better locality?
+            //   - allow multiple input labels on an arc, so
+            //     singular chain of inputs can take one arc (on
+            //     wikipedia terms this could save another ~6%)
+            //   - in the ord case, the output '1' is presumably
+            //     very common (after NO_OUTPUT)... maybe use a bit
+            //     for it..?
+            //   - use spare bits in flags.... for top few labels /
+            //     outputs / targets
+
+            if (NodeAddress == null)
+                throw new ArgumentException("this FST was not built with willPackFST=true");
 
             var arc = new Arc<T>();
 
@@ -999,8 +1268,10 @@ namespace Lucene.Net.Util.Fst
 
             var topN = Math.Min(maxDerefNodes, InCounts.Size());
 
+            // Find top nodes with highest number of incoming arcs:
             var q = new NodeQueue(topN);
 
+            // TODO: we could use more RAM efficent solution algo here...
             NodeAndInCount bottom = null;
             for (var node = 0; node < InCounts.Size(); node++)
             {
@@ -1021,6 +1292,7 @@ namespace Lucene.Net.Util.Fst
 
             // Free up RAM
             InCounts = null;
+
             var topNodeMap = new HashMap<long, long>();
             for (var downTo = q.Size - 1; downTo >= 0; downTo--)
             {
@@ -1028,9 +1300,11 @@ namespace Lucene.Net.Util.Fst
                 topNodeMap.Add(n.Node, downTo);
             }
 
+            // +1 because node ords start at 1 (0 is reserved as stop node):
             var newNodeAddress = new GrowableWriter(PackedInts.BitsRequired(bytes.GetPosition()),
                                                     (int)(1 + NodeCount), acceptableOverheadRatio);
 
+            // Fill initial coarse guess:
             for (var node = 1; node <= NodeCount; node++)
                 newNodeAddress.Set(node, 1 + bytes.GetPosition() - NodeAddress.Get(node));
 
@@ -1041,16 +1315,19 @@ namespace Lucene.Net.Util.Fst
 
             FST<T> fst;
 
+            // Iterate until we converge:
             while (true)
             {
                 var changed = false;
 
+                // for assert:
                 var negDelta = false;
 
                 fst = new FST<T>(InputType, Outputs, bytes.GetBlockBits());
 
                 var writer = fst.bytes;
 
+                // Skip 0 byte since 0 is reserved target:
                 writer.WriteByte((sbyte)0);
 
                 fst.ArcWithOutputCount = 0;
@@ -1063,6 +1340,9 @@ namespace Lucene.Net.Util.Fst
 
                 long addressError = 0;
 
+                // Since we re-reverse the bytes, we now write the
+                // nodes backwards, so taht BIT_TARGET_NEXT is
+                // unchanged:
                 for (var node = (int)NodeCount; node >= 1; node--)
                 {
                     fst.NodeCount++;
@@ -1081,11 +1361,15 @@ namespace Lucene.Net.Util.Fst
 
                     var retry = false;
 
+                    // for assert:
                     var anyNegDelta = false;
 
                     // in Java Lucene there is a label 'writeNode:'
-                    // writeNode:
-                    while (true)
+                    //writeNode:
+
+                    // Retry loop: possibly iterate more than once, if
+                    // this is an array'd node and bytesPerArc changes
+                    while (true) // retry writing this node
                     {
                         ReadFirstRealTargetArc(node, arc, r);
 
@@ -1127,8 +1411,7 @@ namespace Lucene.Net.Util.Fst
                             }
                             else
                             {
-                                // TODO: assert is correct here?
-                                //Debug.Assert(arc.NextFinalOutput == NO_OUTPUT);
+                                // assert arc.nextFinalOutput == NO_OUTPUT;
                             }
 
                             if (!TargetHasArcs(arc))
@@ -1148,8 +1431,7 @@ namespace Lucene.Net.Util.Fst
                                 else
                                     absPtr = topNodeMap.Count + newNodeAddress.Get((int)arc.Target)
+ addressError;
 
-                                var delta = newNodeAddress.Get((int)arc.Target) + addressError
- writer.GetPosition() -
-                                            2;
+                                var delta = newNodeAddress.Get((int)arc.Target) + addressError
- writer.GetPosition() - 2;
                                 if (delta < 0)
                                 {
                                     anyNegDelta = true;
@@ -1157,15 +1439,16 @@ namespace Lucene.Net.Util.Fst
                                 }
 
                                 if (delta < absPtr)
+                                {
                                     flags += (sbyte)BIT_TARGET_DELTA;
+                                }
                             }
                             else
                             {
                                 absPtr = 0;
                             }
 
-                            // TODO: assert correct here?
-                            Debug.Assert(flags != ARCS_AS_FIXED_ARRAY);
+                            // assert flags != ARCS_AS_FIXED_ARRAY
                             writer.WriteByte(flags);
 
                             fst.WriteLabel(writer, arc.Label);
@@ -1212,6 +1495,13 @@ namespace Lucene.Net.Util.Fst
                             {
                                 var arcBytes = (int)(writer.GetPosition() - arcStartPos);
                                 maxBytesPerArc = Math.Max(maxBytesPerArc, arcBytes);
+                                // NOTE: this may in fact go "backwards", if
+                                // somehow (rarely, possibly never) we use
+                                // more bytesPerArc in this rewrite than the
+                                // incoming FST did... but in this case we
+                                // will retry (below) so it's OK to ovewrite
+                                // bytes:
+                                //wasted += bytesPerArc - arcBytes;
                                 writer.SkipBytes((int)(arcStartPos + bytesPerArc - writer.GetPosition()));
                             }
 
@@ -1224,7 +1514,10 @@ namespace Lucene.Net.Util.Fst
                         if (useArcArray)
                         {
                             if (maxBytesPerArc == bytesPerArc || (retry && maxBytesPerArc
<= bytesPerArc))
+                            {
+                                // converged
                                 break;
+                            }
                         }
                         else
                         {
@@ -1246,8 +1539,13 @@ namespace Lucene.Net.Util.Fst
 
                 if (!changed)
                 {
-                    // TODO: assert correct here?
-                    Debug.Assert(!negDelta);
+                    // We don't renumber the nodes (just reverse their
+                    // order) so nodes should only point forward to
+                    // other nodes because we only produce acyclic FSTs
+                    // w/ nodes only pointing "forwards":
+                    // assert != negDelta
+
+                    // Converged!
                     break;
                 }
             }
@@ -1266,12 +1564,13 @@ namespace Lucene.Net.Util.Fst
             fst.startNode = newNodeAddress.Get((int)startNode);
 
             if (emptyOutput != null)
+            {
                 fst.emptyOutput = emptyOutput;
+            }
 
-            // TODO: assert correct here?
-            Debug.Assert(fst.NodeCount == NodeCount, "fst.NodeCount=" + fst.NodeCount + "
NodeCount=" + NodeCount);
-            Debug.Assert(fst.ArcCount == ArcCount);
-            Debug.Assert(fst.ArcWithOutputCount == ArcWithOutputCount, "fst.ArcWithOutputCount="
+ fst.ArcWithOutputCount + " ArcWithOutputCount=" + ArcWithOutputCount);
+            //assert fst.nodeCount == nodeCount: "fst.nodeCount=" + fst.nodeCount + " nodeCount="
+ nodeCount;
+            //assert fst.arcCount == arcCount;
+            //assert fst.arcWithOutputCount == arcWithOutputCount: "fst.arcWithOutputCount="
+ fst.arcWithOutputCount + " arcWithOutputCount=" + arcWithOutputCount;
 
             fst.bytes.Finish();
             fst.CacheRootArcs();
@@ -1290,69 +1589,136 @@ namespace Lucene.Net.Util.Fst
         internal const int BIT_LAST_ARC = 1 << 1;
         internal const int BIT_TARGET_NEXT = 1 << 2;
 
+        // TODO: we can free up a bit if we can nuke this:
         internal const int BIT_STOP_NODE = 1 << 3;
         internal const int BIT_ARC_HAS_OUTPUT = 1 << 4;
         internal const int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
 
+        // Arcs are stored as fixed-size (per entry) array, so
+        // that we can find an arc using binary search.  We do
+        // this when number of arcs is > NUM_ARCS_ARRAY:
+
+        // If set, thie target node is delta coded vs current position:
         internal const int BIT_TARGET_DELTA = 1 << 6;
 
+        // We use this as a marker (because this one flag is
+        // illegal by itself ...):
         internal const sbyte ARCS_AS_FIXED_ARRAY = BIT_ARC_HAS_FINAL_OUTPUT;
 
+        /// <summary>
+        /// <see cref="UnCompiledNode"/>
+        /// </summary>
         internal const int FIXED_ARRAY_SHALLOW_DISTANCE = 3;
-        
+
+        /// <summary>
+        /// <see cref="UnCompiledNode"/>
+        /// </summary>
         internal const int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
 
+        /// <summary>
+        /// <see cref="UnCompiledNode"/>
+        /// </summary>
         internal const int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
 
+        // Increment version to change it
         internal const string FILE_FORMAT_NAME = "FST";
         internal const int VERSION_START = 0;
 
+        /// <summary>
+        /// Changed numBytesPerArc for array'd case from byte to int.
+        /// </summary>
         internal const int VERSION_INT_NUM_BYTES_PER_ARC = 1;
 
+        /// <summary>
+        /// Write BYTE2 labels as 2-byte short, not vInt.
+        /// </summary>
         internal const int VERSION_SHORT_BYTE2_LABELS = 2;
 
-        internal const int VERSIONpacked = 3;
+        /// <summary>
+        /// Added optional packed format.
+        /// </summary>
+        internal const int VERSION_PACKED = 3;
 
+        /// <summary>
+        /// Changed from int to vInt for encoding arc targets.
+        /// Also changed maxBytesPerArc from int to vInt in the array case.
+        /// </summary>
         internal const int VERSION_VINT_TARGET = 4;
 
         internal const int VERSION_CURRENT = VERSION_VINT_TARGET;
 
+        // Never serialized; just used to represent the virtual
+        // final node w/ no arcs:
         internal const long FINAL_END_NODE = -1;
 
+        // Never serialized; just used to represent the virtual
+        // non-final node w/ no arcs:
         internal const long NON_FINAL_END_NODE = 0;
 
-
+        /// <summary>
+        /// Reads bytes stored in an FST.
+        /// </summary>
         public abstract class BytesReader : DataInput
         {
+            /// <summary>
+            /// Current read position
+            /// </summary>
             public abstract long Position { get; set; }
 
+            /// <summary>
+            /// Returns true if this reader uses reversed bytes 
+            /// under-the-hood.
+            /// </summary>
+            /// <returns></returns>
             public abstract bool Reversed();
 
+            /// <summary>
+            /// Skips bytes.
+            /// </summary>
+            /// <param name="count"></param>
             public abstract void SkipBytes(int count);
         }
 
+        /// <summary>
+        /// Specifies allowed range of each int input label for this FST.
+        /// </summary>
         public enum INPUT_TYPE { BYTE1, BYTE2, BYTE4 }
 
+        /// <summary>
+        /// Represents a single arc.
+        /// </summary>
+        /// <typeparam name="T"></typeparam>
         public sealed class Arc<T>
         {
             public int Label { get; set; }
             public T Output { get; set; }
 
+            // From node (ord or address); currently only used when
+            // building an FST w/ willPackFST=true:
             internal long Node { get; set; }
 
+            /// <summary>
+            /// To node (ord or address)
+            /// </summary>
             public long Target { get; set; }
 
             internal sbyte Flags { get; set; }
-
             public T NextFinalOutput { get; set; }
 
+            // address (into the byte[]), or ord/address if label == END_LABEL
             internal long NextArc { get; set; }
 
+            // This is non-zero if current arcs are fixed array:
             internal long PosArcsStart { get; set; }
             internal int BytesPerArc { get; set; }
             internal int ArcIdx { get; set; }
             internal int NumArcs { get; set; }
 
+            /// <summary>
+            /// Return this
+            /// </summary>
+            /// <param name="other"></param>
+            /// <returns></returns>
             public Arc<T> CopyFrom(Arc<T> other)
             {
                 Node = other.Node;
@@ -1436,8 +1802,11 @@ namespace Lucene.Net.Util.Fst
 
             public int CompareTo(NodeAndInCount other)
             {
-                if (Count > other.Count) return 1;
-                if (Count < other.Count) return -1;
+                if (Count > other.Count) 
+                    return 1;
+                if (Count < other.Count) 
+                    return -1;
+                // Tie-break: smaller node compares as greater than
                 return other.Node - Node;
             }
         }
@@ -1452,8 +1821,7 @@ namespace Lucene.Net.Util.Fst
             public override bool LessThan(NodeAndInCount a, NodeAndInCount b)
             {
                 var cmp = a.CompareTo(b);
-                // TODO: assert correct here?
-                Debug.Assert(cmp != 0);
+                // assert cmp != 0;
                 return cmp < 0;
             }
         }


Mime
View raw message