lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From synhers...@apache.org
Subject [1/8] Porting Lucene.Net.Suggest (still not compiling)
Date Mon, 15 Sep 2014 22:24:48 GMT
Repository: lucenenet
Updated Branches:
  refs/heads/master 6e900565d -> 0ebac7269


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Tst/TSTAutocomplete.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Tst/TSTAutocomplete.cs b/src/Lucene.Net.Suggest/Suggest/Tst/TSTAutocomplete.cs
new file mode 100644
index 0000000..4a8330b
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Tst/TSTAutocomplete.cs
@@ -0,0 +1,207 @@
+using System.Collections.Generic;
+
+namespace Lucene.Net.Search.Suggest.Tst
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Ternary Search Trie implementation.
+    /// </summary>
+    /// <seealso cref= TernaryTreeNode </seealso>
+    public class TSTAutocomplete
+    {
+
+        internal TSTAutocomplete()
+        {
+        }
+
+        /// <summary>
+        /// Inserting keys in TST in the order middle,small,big (lexicographic measure)
+        /// recursively creates a balanced tree which reduces insertion and search
+        /// times significantly.
+        /// </summary>
+        /// <param name="tokens">
+        ///          Sorted list of keys to be inserted in TST. </param>
+        /// <param name="lo">
+        ///          stores the lower index of current list. </param>
+        /// <param name="hi">
+        ///          stores the higher index of current list. </param>
+        /// <param name="root">
+        ///          a reference object to root of TST. </param>
+        public virtual void BalancedTree(object[] tokens, object[] vals, int lo, int hi,
TernaryTreeNode root)
+        {
+            if (lo > hi)
+            {
+                return;
+            }
+            int mid = (lo + hi) / 2;
+            root = Insert(root, (string)tokens[mid], vals[mid], 0);
+            BalancedTree(tokens, vals, lo, mid - 1, root);
+            BalancedTree(tokens, vals, mid + 1, hi, root);
+        }
+
+        /// <summary>
+        /// Inserts a key in TST creating a series of Binary Search Trees at each node.
+        /// The key is actually stored across the eqKid of each node in a successive
+        /// manner.
+        /// </summary>
+        /// <param name="currentNode">
+        ///          a reference node where the insertion will take currently. </param>
+        /// <param name="s">
+        ///          key to be inserted in TST. </param>
+        /// <param name="x">
+        ///          index of character in key to be inserted currently. </param>
+        /// <returns> currentNode The new reference to root node of TST </returns>
+        public virtual TernaryTreeNode Insert(TernaryTreeNode currentNode, string s, object
val, int x)
+        {
+            if (s == null || s.Length <= x)
+            {
+                return currentNode;
+            }
+            if (currentNode == null)
+            {
+                TernaryTreeNode newNode = new TernaryTreeNode();
+                newNode.splitchar = s.charAt(x);
+                currentNode = newNode;
+                if (x < s.Length - 1)
+                {
+                    currentNode.eqKid = Insert(currentNode.eqKid, s, val, x + 1);
+                }
+                else
+                {
+                    currentNode.token = s.ToString();
+                    currentNode.val = val;
+                    return currentNode;
+                }
+            }
+            else if (currentNode.splitchar > s.charAt(x))
+            {
+                currentNode.loKid = Insert(currentNode.loKid, s, val, x);
+            }
+            else if (currentNode.splitchar == s.charAt(x))
+            {
+                if (x < s.Length - 1)
+                {
+                    currentNode.eqKid = Insert(currentNode.eqKid, s, val, x + 1);
+                }
+                else
+                {
+                    currentNode.token = s;
+                    currentNode.val = val;
+                    return currentNode;
+                }
+            }
+            else
+            {
+                currentNode.hiKid = Insert(currentNode.hiKid, s, val, x);
+            }
+            return currentNode;
+        }
+
+        /// <summary>
+        /// Auto-completes a given prefix query using Depth-First Search with the end
+        /// of prefix as source node each time finding a new leaf to get a complete key
+        /// to be added in the suggest list.
+        /// </summary>
+        /// <param name="root">
+        ///          a reference to root node of TST. </param>
+        /// <param name="s">
+        ///          prefix query to be auto-completed. </param>
+        /// <param name="x">
+        ///          index of current character to be searched while traversing through
+        ///          the prefix in TST. </param>
+        /// <returns> suggest list of auto-completed keys for the given prefix query.
</returns>
+        public virtual List<TernaryTreeNode> PrefixCompletion(TernaryTreeNode root,
string s, int x)
+        {
+
+            TernaryTreeNode p = root;
+            List<TernaryTreeNode> suggest = new List<TernaryTreeNode>();
+
+            while (p != null)
+            {
+                if (s.charAt(x) < p.splitchar)
+                {
+                    p = p.loKid;
+                }
+                else if (s.charAt(x) == p.splitchar)
+                {
+                    if (x == s.Length - 1)
+                    {
+                        break;
+                    }
+                    else
+                    {
+                        x++;
+                    }
+                    p = p.eqKid;
+                }
+                else
+                {
+                    p = p.hiKid;
+                }
+            }
+
+            if (p == null)
+            {
+                return suggest;
+            }
+            if (p.eqKid == null && p.token == null)
+            {
+                return suggest;
+            }
+            if (p.eqKid == null && p.token != null)
+            {
+                suggest.Add(p);
+                return suggest;
+            }
+
+            if (p.token != null)
+            {
+                suggest.Add(p);
+            }
+            p = p.eqKid;
+
+            var st = new Stack<TernaryTreeNode>();
+            st.Push(p);
+            while (st.Count > 0)
+            {
+                TernaryTreeNode top = st.Peek();
+                st.Pop();
+                if (top.token != null)
+                {
+                    suggest.Add(top);
+                }
+                if (top.eqKid != null)
+                {
+                    st.Push(top.eqKid);
+                }
+                if (top.loKid != null)
+                {
+                    st.Push(top.loKid);
+                }
+                if (top.hiKid != null)
+                {
+                    st.Push(top.hiKid);
+                }
+            }
+            return suggest;
+        }
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Tst/TSTLookup.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Tst/TSTLookup.cs b/src/Lucene.Net.Suggest/Suggest/Tst/TSTLookup.cs
new file mode 100644
index 0000000..2f5ee22
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Tst/TSTLookup.cs
@@ -0,0 +1,295 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest.Tst
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+    /// <summary>
+    /// Suggest implementation based on a 
+    /// <a href="http://en.wikipedia.org/wiki/Ternary_search_tree">Ternary Search Tree</a>
+    /// </summary>
+    /// <seealso cref= TSTAutocomplete </seealso>
+    public class TSTLookup : Lookup
+    {
+        internal TernaryTreeNode root = new TernaryTreeNode();
+        internal TSTAutocomplete autocomplete = new TSTAutocomplete();
+
+        /// <summary>
+        /// Number of entries the lookup was built with </summary>
+        private long count = 0;
+
+        /// <summary>
+        /// Creates a new TSTLookup with an empty Ternary Search Tree. </summary>
+        /// <seealso cref= #build(InputIterator) </seealso>
+        public TSTLookup()
+        {
+        }
+
+        public override void Build(InputIterator tfit)
+        {
+            if (tfit.HasPayloads)
+            {
+                throw new System.ArgumentException("this suggester doesn't support payloads");
+            }
+            if (tfit.HasContexts)
+            {
+                throw new System.ArgumentException("this suggester doesn't support contexts");
+            }
+            root = new TernaryTreeNode();
+            // buffer first
+            if (tfit.Comparator != BytesRef.UTF8SortedAsUTF16Comparator)
+            {
+                // make sure it's sorted and the comparator uses UTF16 sort order
+                tfit = new SortedInputIterator(tfit, BytesRef.UTF8SortedAsUTF16Comparator);
+            }
+
+            List<string> tokens = new List<string>();
+            List<Number> vals = new List<Number>();
+            BytesRef spare;
+            CharsRef charsSpare = new CharsRef();
+            while ((spare = tfit.Next()) != null)
+            {
+                charsSpare.Grow(spare.Length);
+                UnicodeUtil.UTF8toUTF16(spare.Bytes, spare.Offset, spare.Length, charsSpare);
+                tokens.Add(charsSpare.ToString());
+                vals.Add(Convert.ToInt64(tfit.Weight));
+            }
+            autocomplete.BalancedTree(tokens.ToArray(), vals.ToArray(), 0, tokens.Count -
1, root);
+        }
+
+        /// <summary>
+        /// Adds a new node if <code>key</code> already exists,
+        /// otherwise replaces its value.
+        /// <para>
+        /// This method always returns true.
+        /// </para>
+        /// </summary>
+        public virtual bool Add(string key, object value)
+        {
+            autocomplete.Insert(root, key, value, 0);
+            // XXX we don't know if a new node was created
+            return true;
+        }
+
+        /// <summary>
+        /// Returns the value for the specified key, or null
+        /// if the key does not exist.
+        /// </summary>
+        public virtual object Get(string key)
+        {
+            IList<TernaryTreeNode> list = autocomplete.PrefixCompletion(root, key,
0);
+            if (list == null || list.Count == 0)
+            {
+                return null;
+            }
+            foreach (TernaryTreeNode n in list)
+            {
+                if (CharSeqEquals(n.token, key))
+                {
+                    return n.val;
+                }
+            }
+            return null;
+        }
+
+        private static bool CharSeqEquals(string left, string right)
+        {
+            int len = left.Length;
+            if (len != right.Length)
+            {
+                return false;
+            }
+            for (int i = 0; i < len; i++)
+            {
+                if (left.charAt(i) != right.charAt(i))
+                {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        public override IList<LookupResult> Lookup(string key, HashSet<BytesRef>
contexts, bool onlyMorePopular, int num)
+        {
+            if (contexts != null)
+            {
+                throw new System.ArgumentException("this suggester doesn't support contexts");
+            }
+            IList<TernaryTreeNode> list = autocomplete.PrefixCompletion(root, key,
0);
+            IList<LookupResult> res = new List<LookupResult>();
+            if (list == null || list.Count == 0)
+            {
+                return res;
+            }
+            int maxCnt = Math.Min(num, list.Count);
+            if (onlyMorePopular)
+            {
+                LookupPriorityQueue queue = new LookupPriorityQueue(num);
+
+                foreach (TernaryTreeNode ttn in list)
+                {
+                    queue.InsertWithOverflow(new LookupResult(ttn.token, (long)((Number)ttn.val)));
+                }
+                foreach (LookupResult lr in queue.Results)
+                {
+                    res.Add(lr);
+                }
+            }
+            else
+            {
+                for (int i = 0; i < maxCnt; i++)
+                {
+                    TernaryTreeNode ttn = list[i];
+                    res.Add(new LookupResult(ttn.token, (long)((Number)ttn.val)));
+                }
+            }
+            return res;
+        }
+
+        private const sbyte LO_KID = 0x01;
+        private const sbyte EQ_KID = 0x02;
+        private const sbyte HI_KID = 0x04;
+        private const sbyte HAS_TOKEN = 0x08;
+        private const sbyte HAS_VALUE = 0x10;
+
+        // pre-order traversal
+        private void ReadRecursively(DataInput @in, TernaryTreeNode node)
+        {
+            node.splitchar = @in.readString().charAt(0);
+            sbyte mask = @in.readByte();
+            if ((mask & HAS_TOKEN) != 0)
+            {
+                node.token = @in.readString();
+            }
+            if ((mask & HAS_VALUE) != 0)
+            {
+                node.val = Convert.ToInt64(@in.readLong());
+            }
+            if ((mask & LO_KID) != 0)
+            {
+                node.loKid = new TernaryTreeNode();
+                ReadRecursively(@in, node.loKid);
+            }
+            if ((mask & EQ_KID) != 0)
+            {
+                node.eqKid = new TernaryTreeNode();
+                ReadRecursively(@in, node.eqKid);
+            }
+            if ((mask & HI_KID) != 0)
+            {
+                node.hiKid = new TernaryTreeNode();
+                ReadRecursively(@in, node.hiKid);
+            }
+        }
+
+        // pre-order traversal
+        private void WriteRecursively(DataOutput @out, TernaryTreeNode node)
+        {
+            // write out the current node
+            @out.writeString(new string(new char[] { node.splitchar }, 0, 1));
+            // prepare a mask of kids
+            sbyte mask = 0;
+            if (node.eqKid != null)
+            {
+                mask |= EQ_KID;
+            }
+            if (node.loKid != null)
+            {
+                mask |= LO_KID;
+            }
+            if (node.hiKid != null)
+            {
+                mask |= HI_KID;
+            }
+            if (node.token != null)
+            {
+                mask |= HAS_TOKEN;
+            }
+            if (node.val != null)
+            {
+                mask |= HAS_VALUE;
+            }
+            @out.writeByte(mask);
+            if (node.token != null)
+            {
+                @out.writeString(node.token);
+            }
+            if (node.val != null)
+            {
+                @out.writeLong((long)((Number)node.val));
+            }
+            // recurse and write kids
+            if (node.loKid != null)
+            {
+                WriteRecursively(@out, node.loKid);
+            }
+            if (node.eqKid != null)
+            {
+                WriteRecursively(@out, node.eqKid);
+            }
+            if (node.hiKid != null)
+            {
+                WriteRecursively(@out, node.hiKid);
+            }
+        }
+
+        public override bool Store(DataOutput output)
+        {
+            lock (this)
+            {
+                output.writeVLong(count);
+                WriteRecursively(output, root);
+                return true;
+            }
+        }
+
+        public override bool Load(DataInput input)
+        {
+            lock (this)
+            {
+                count = input.readVLong();
+                root = new TernaryTreeNode();
+                ReadRecursively(input, root);
+                return true;
+            }
+        }
+
+        /// <summary>
+        /// Returns byte size of the underlying TST
+        /// </summary>
+        public override long SizeInBytes()
+        {
+            long mem = RamUsageEstimator.ShallowSizeOf(this);
+            if (root != null)
+            {
+                mem += root.sizeInBytes();
+            }
+            return mem;
+        }
+
+        public override long Count
+        {
+            get
+            {
+                return count;
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Tst/TernaryTreeNode.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Tst/TernaryTreeNode.cs b/src/Lucene.Net.Suggest/Suggest/Tst/TernaryTreeNode.cs
new file mode 100644
index 0000000..52ee9b3
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Tst/TernaryTreeNode.cs
@@ -0,0 +1,78 @@
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest.Tst
+{
+    /*
+	 * Licensed to the Apache Software Foundation (ASF) under one or more
+	 * contributor license agreements.  See the NOTICE file distributed with
+	 * this work for additional information regarding copyright ownership.
+	 * The ASF licenses this file to You under the Apache License, Version 2.0
+	 * (the "License"); you may not use this file except in compliance with
+	 * the License.  You may obtain a copy of the License at
+	 *
+	 *     http://www.apache.org/licenses/LICENSE-2.0
+	 *
+	 * Unless required by applicable law or agreed to in writing, software
+	 * distributed under the License is distributed on an "AS IS" BASIS,
+	 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	 * See the License for the specific language governing permissions and
+	 * limitations under the License.
+	 */
+
+    /// <summary>
+    /// The class creates a TST node.
+    /// </summary>
+
+    public class TernaryTreeNode
+    {
+
+        /// <summary>
+        /// Creates a new empty node </summary>
+        public TernaryTreeNode()
+        {
+        }
+        /// <summary>
+        /// the character stored by a node. </summary>
+        internal char splitchar;
+        /// <summary>
+        /// a reference object to the node containing character smaller than this node's
character. </summary>
+        internal TernaryTreeNode loKid;
+        /// <summary>
+        ///  a reference object to the node containing character next to this node's character
as 
+        ///  occurring in the inserted token.
+        /// </summary>
+        internal TernaryTreeNode eqKid;
+        /// <summary>
+        /// a reference object to the node containing character higher than this node's character.
</summary>
+        internal TernaryTreeNode hiKid;
+        /// <summary>
+        /// used by leaf nodes to store the complete tokens to be added to suggest list while

+        /// auto-completing the prefix.
+        /// </summary>
+        internal string token;
+        internal object val;
+
+        internal virtual long sizeInBytes()
+        {
+            long mem = RamUsageEstimator.ShallowSizeOf(this);
+            if (loKid != null)
+            {
+                mem += loKid.sizeInBytes();
+            }
+            if (eqKid != null)
+            {
+                mem += eqKid.sizeInBytes();
+            }
+            if (hiKid != null)
+            {
+                mem += hiKid.sizeInBytes();
+            }
+            if (token != null)
+            {
+                mem += RamUsageEstimator.ShallowSizeOf(token) + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER
+ RamUsageEstimator.NUM_BYTES_CHAR * token.Length;
+            }
+            mem += RamUsageEstimator.ShallowSizeOf(val);
+            return mem;
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/UnsortedInputIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/UnsortedInputIterator.cs b/src/Lucene.Net.Suggest/Suggest/UnsortedInputIterator.cs
new file mode 100644
index 0000000..0953233
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/UnsortedInputIterator.cs
@@ -0,0 +1,108 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// This wrapper buffers the incoming elements and makes sure they are in
+    /// random order.
+    /// @lucene.experimental
+    /// </summary>
+    public class UnsortedInputIterator : BufferedInputIterator
+    {
+        // TODO keep this for now
+        private readonly int[] ords;
+        private int currentOrd = -1;
+        private readonly BytesRef spare = new BytesRef();
+        private readonly BytesRef payloadSpare = new BytesRef();
+
+        /// <summary>
+        /// Creates a new iterator, wrapping the specified iterator and
+        /// returning elements in a random order.
+        /// </summary>
+        public UnsortedInputIterator(InputIterator source)
+            : base(source)
+        {
+            ords = new int[entries.Size()];
+            Random random = new Random();
+            for (int i = 0; i < ords.Length; i++)
+            {
+                ords[i] = i;
+            }
+            for (int i = 0; i < ords.Length; i++)
+            {
+                int randomPosition = random.Next(ords.Length);
+                int temp = ords[i];
+                ords[i] = ords[randomPosition];
+                ords[randomPosition] = temp;
+            }
+        }
+
+        public override long Weight
+        {
+            get
+            {
+                Debug.Assert(currentOrd == ords[curPos]);
+                return freqs[currentOrd];
+            }
+        }
+
+        public BytesRef Next()
+        {
+            if (++curPos < entries.Size())
+            {
+                currentOrd = ords[curPos];
+                return entries.Get(spare, currentOrd);
+            }
+            return null;
+        }
+
+        public override BytesRef Payload
+        {
+            get
+            {
+                {
+                    if (HasPayloads && curPos < payloads.Size())
+                    {
+                        Debug.Assert(currentOrd == ords[curPos]);
+                        return payloads.Get(payloadSpare, currentOrd);
+                    }
+                    return null;
+                }
+            }
+        }
+
+        public override HashSet<BytesRef> Contexts
+        {
+            get
+            {
+                if (HasContexts && curPos < contextSets.Count)
+                {
+                    Debug.Assert(currentOrd == ords[curPos]);
+                    return contextSets[currentOrd];
+                }
+                return null;
+            }
+        }
+    }
+}
\ No newline at end of file


Mime
View raw message