lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nightowl...@apache.org
Subject [3/7] lucenenet git commit: Corrected physical directory locations of Lucene.Net.Sandbox and Lucene.Net.Tests.Sandbox
Date Sat, 10 Dec 2016 19:35:33 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs b/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs
new file mode 100644
index 0000000..34da622
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/FuzzyLikeThisQuery.cs
@@ -0,0 +1,397 @@
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Search.Similarities;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
+    /// In effect this mixes the behaviour of <see cref="FuzzyQuery"/> and MoreLikeThis but with special consideration
+    /// of fuzzy scoring factors.
+    /// This generally produces good results for queries where users may provide details in a number of 
+    /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
+    /// a fast query.
+    /// <para/>
+    /// For each source term the fuzzy variants are held in a <see cref="BooleanQuery"/> with no coord factor (because
+    /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
+    /// <see cref="TermQuery"/> is used for variants and does not use that variant term's IDF because this would favour rarer 
+    /// terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query 
+    /// term) and this is factored into the variant's boost. If the source query term does not exist in the
+    /// index the average IDF of the variants is used.
+    /// </summary>
+    public class FuzzyLikeThisQuery : Query
+    {
+        // TODO: generalize this query (at least it should not reuse this static sim)!
+        // a better way might be to convert this into multitermquery rewrite methods.
+        // the rewrite method can 'average' the TermContext's term statistics (docfreq,totalTermFreq) 
+        // provided to TermQuery, so that the general idea is agnostic to any scoring system...
+        internal static TFIDFSimilarity sim = new DefaultSimilarity();
+        Query rewrittenQuery = null;
+        List<FieldVals> fieldVals = new List<FieldVals>();
+        Analyzer analyzer;
+
+        ScoreTermQueue q;
+        int MAX_VARIANTS_PER_TERM = 50;
+        bool ignoreTF = false;
+        private int maxNumTerms;
+
+        public override int GetHashCode()
+        {
+            // 31-multiplier hash over the base hash plus the fields that Equals compares.
+            int hash = base.GetHashCode();
+            int analyzerHash = (analyzer != null) ? analyzer.GetHashCode() : 0;
+            hash = 31 * hash + analyzerHash;
+            int fieldValsHash = (fieldVals != null) ? fieldVals.GetValueHashCode() : 0;
+            hash = 31 * hash + fieldValsHash;
+            hash = 31 * hash + (ignoreTF ? 1231 : 1237);
+            return 31 * hash + maxNumTerms;
+        }
+
+        public override bool Equals(object obj)
+        {
+            // Same reference: trivially equal.
+            if (this == obj)
+            {
+                return true;
+            }
+            // Null never matches, and neither does an instance of a different
+            // runtime type.
+            if (obj == null || GetType() != obj.GetType())
+            {
+                return false;
+            }
+            // Defer to the base class for the state it owns.
+            if (!base.Equals(obj))
+            {
+                return false;
+            }
+            FuzzyLikeThisQuery that = (FuzzyLikeThisQuery)obj;
+            // Analyzers: both null, or equal according to Equals.
+            if (analyzer == null ? that.analyzer != null : !analyzer.Equals(that.analyzer))
+            {
+                return false;
+            }
+            // Field value lists: both null, or equal according to ValueEquals.
+            if (fieldVals == null ? that.fieldVals != null : !fieldVals.ValueEquals(that.fieldVals))
+            {
+                return false;
+            }
+            // Finally compare the two scalar settings.
+            return ignoreTF == that.ignoreTF
+                && maxNumTerms == that.maxNumTerms;
+        }
+
+        /// <summary>
+        /// Creates a query and sizes its internal term queue with <paramref name="maxNumTerms"/>.
+        /// </summary>
+        /// <param name="maxNumTerms">The total number of term clauses that will appear once rewritten as a <see cref="BooleanQuery"/></param>
+        /// <param name="analyzer">The analyzer used to tokenize the query strings registered through AddTerms</param>
+        public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
+        {
+            this.maxNumTerms = maxNumTerms;
+            this.analyzer = analyzer;
+            this.q = new ScoreTermQueue(maxNumTerms);
+        }
+
+        internal class FieldVals
+        {
+            internal string queryString;
+            internal string fieldName;
+            internal float minSimilarity;
+            internal int prefixLength;
+
+            // Captures one AddTerms request: the target field, the fuzzy-matching
+            // settings and the raw query text.
+            public FieldVals(string name, float similarity, int length, string queryString)
+            {
+                this.queryString = queryString;
+                this.fieldName = name;
+                this.minSimilarity = similarity;
+                this.prefixLength = length;
+            }
+
+            // 31-multiplier hash over the four fields, in the order Equals checks them.
+            public override int GetHashCode()
+            {
+                int nameHash = (fieldName != null) ? fieldName.GetHashCode() : 0;
+                int textHash = (queryString != null) ? queryString.GetHashCode() : 0;
+                int hash = 31 + nameHash;
+                hash = 31 * hash + Number.FloatToIntBits(minSimilarity);
+                hash = 31 * hash + prefixLength;
+                hash = 31 * hash + textHash;
+                return hash;
+            }
+
+            // Two instances are equal when they have the same runtime type and all
+            // four fields match (strings ordinally, the float via Number.FloatToIntBits).
+            public override bool Equals(object obj)
+            {
+                if (this == obj)
+                {
+                    return true;
+                }
+                if (obj == null || GetType() != obj.GetType())
+                {
+                    return false;
+                }
+                FieldVals that = (FieldVals)obj;
+                if (!string.Equals(fieldName, that.fieldName, StringComparison.Ordinal))
+                {
+                    return false;
+                }
+                if (Number.FloatToIntBits(minSimilarity) != Number.FloatToIntBits(that.minSimilarity))
+                {
+                    return false;
+                }
+                if (prefixLength != that.prefixLength)
+                {
+                    return false;
+                }
+                // The query text is the last remaining difference to rule out.
+                return string.Equals(queryString, that.queryString, StringComparison.Ordinal);
+            }
+        }
+
+        /// <summary>
+        /// Adds user input for "fuzzification"
+        /// </summary>
+        /// <param name="queryString">The string which will be parsed by the analyzer and for which fuzzy variants will be generated</param>
+        /// <param name="fieldName">The name of the field whose terms are searched for fuzzy variants</param>
+        /// <param name="minSimilarity">The minimum similarity of the term variants (see <see cref="FuzzyTermsEnum"/>)</param>
+        /// <param name="prefixLength">Length of required common prefix on variant terms (see <see cref="FuzzyTermsEnum"/>)</param>
+        public virtual void AddTerms(string queryString, string fieldName, float minSimilarity, int prefixLength)
+        {
+            fieldVals.Add(new FieldVals(fieldName, minSimilarity, prefixLength, queryString));
+        }
+
+
+        private void AddTerms(IndexReader reader, FieldVals f) // analyzes f.queryString and queues scored fuzzy variants of each token on q
+        {
+            if (f.queryString == null) return; // nothing to analyze
+            Terms terms = MultiFields.GetTerms(reader, f.fieldName);
+            if (terms == null)
+            {
+                return; // GetTerms returned no Terms for this field
+            }
+            TokenStream ts = analyzer.TokenStream(f.fieldName, f.queryString);
+            try
+            {
+                ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
+                // corpus document count, passed to sim.Idf when the variants are rescored below
+                int corpusNumDocs = reader.NumDocs;
+                HashSet<string> processedTerms = new HashSet<string>();
+                ts.Reset();
+                while (ts.IncrementToken())
+                {
+                    string term = termAtt.ToString();
+                    if (!processedTerms.Contains(term)) // expand each distinct token text only once
+                    {
+                        processedTerms.Add(term);
+                        ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
+                        float minScore = 0;
+                        Term startTerm = new Term(f.fieldName, term);
+                        AttributeSource atts = new AttributeSource();
+                        IMaxNonCompetitiveBoostAttribute maxBoostAtt =
+                            atts.AddAttribute<IMaxNonCompetitiveBoostAttribute>();
+#pragma warning disable 612, 618
+                        SlowFuzzyTermsEnum fe = new SlowFuzzyTermsEnum(terms, atts, startTerm, f.minSimilarity, f.prefixLength);
+#pragma warning restore 612, 618
+                        //store the df so all variants use same idf
+                        int df = reader.DocFreq(startTerm);
+                        int numVariants = 0;
+                        int totalVariantDocFreqs = 0;
+                        BytesRef possibleMatch;
+                        IBoostAttribute boostAtt =
+                          fe.Attributes().AddAttribute<IBoostAttribute>();
+                        while ((possibleMatch = fe.Next()) != null)
+                        {
+                            numVariants++;
+                            totalVariantDocFreqs += fe.DocFreq();
+                            float score = boostAtt.Boost;
+                            if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
+                            {
+                                ScoreTerm st = new ScoreTerm(new Term(startTerm.Field, BytesRef.DeepCopyOf(possibleMatch)), score, startTerm);
+                                variantsQ.InsertWithOverflow(st);
+                                minScore = variantsQ.Top().score; // maintain minScore
+                            }
+                            maxBoostAtt.MaxNonCompetitiveBoost = variantsQ.Size() >= MAX_VARIANTS_PER_TERM ? minScore : float.NegativeInfinity;
+                        }
+                        // rescoring only happens when the enum produced at least one variant
+                        if (numVariants > 0)
+                        {
+                            int avgDf = totalVariantDocFreqs / numVariants; // integer average
+                            if (df == 0)//no direct match we can use as df for all variants
+                            {
+                                df = avgDf; //use avg df of all variants
+                            }
+
+                            // take the top variants (scored by edit distance) and reset the score
+                            // to include an IDF factor then add to the global queue for ranking
+                            // overall top query terms
+                            int size = variantsQ.Size();
+                            for (int i = 0; i < size; i++)
+                            {
+                                ScoreTerm st = variantsQ.Pop();
+                                st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs); // squared variant score times the shared IDF
+                                q.InsertWithOverflow(st);
+                            }
+                        }
+                    }
+                }
+                ts.End();
+            }
+            finally
+            {
+                IOUtils.CloseWhileHandlingException(ts);
+            }
+        }
+
+        /// <summary>
+        /// Expands every registered field/text pair against <paramref name="reader"/> and combines
+        /// the surviving variants into a <see cref="BooleanQuery"/>. The result is cached on this instance.
+        /// </summary>
+        public override Query Rewrite(IndexReader reader)
+        {
+            // Reuse the query built by an earlier call.
+            if (rewrittenQuery != null)
+            {
+                return rewrittenQuery;
+            }
+
+            // Load the shared queue with candidate terms, then clear the registrations.
+            foreach (FieldVals pending in fieldVals)
+            {
+                AddTerms(reader, pending);
+            }
+            fieldVals.Clear();
+
+            // Step 1: drain the queue, bucketing the variants by the source term
+            // they were derived from.
+            IDictionary<Term, List<ScoreTerm>> bySourceTerm = new Dictionary<Term, List<ScoreTerm>>();
+            for (int remaining = q.Size(); remaining > 0; remaining--)
+            {
+                ScoreTerm popped = q.Pop();
+                List<ScoreTerm> bucket;
+                if (!bySourceTerm.TryGetValue(popped.fuzziedSourceTerm, out bucket) || bucket == null)
+                {
+                    bucket = new List<ScoreTerm>();
+                    bySourceTerm[popped.fuzziedSourceTerm] = bucket;
+                }
+                bucket.Add(popped);
+            }
+
+            // Step 2: add each bucket to the top-level query as SHOULD clauses.
+            BooleanQuery bq = new BooleanQuery();
+            foreach (List<ScoreTerm> variants in bySourceTerm.Values)
+            {
+                // A lone variant goes straight into the top-level query; several variants
+                // share a nested BooleanQuery built with 'true' (coord disabled).
+                bool lone = variants.Count == 1;
+                BooleanQuery target = lone ? bq : new BooleanQuery(true);
+                foreach (ScoreTerm variant in variants)
+                {
+                    Query clause;
+                    if (ignoreTF)
+                    {
+                        clause = new ConstantScoreQuery(new TermQuery(variant.term));
+                    }
+                    else
+                    {
+                        clause = new TermQuery(variant.term, 1);
+                    }
+                    clause.Boost = variant.score; // the boost carries the ScoreTerm's score
+                    target.Add(clause, BooleanClause.Occur.SHOULD);
+                }
+                // The nested query becomes a single SHOULD clause of the top-level query.
+                if (!lone)
+                {
+                    bq.Add(target, BooleanClause.Occur.SHOULD);
+                }
+            }
+
+            //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+            // booleans with a minimum-should-match of NumFields-1?
+            bq.Boost = Boost;
+            this.rewrittenQuery = bq;
+            return bq;
+        }
+
+        //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
+        // term variants) then is reset with IDF for use in ranking against all other
+        // terms/fields
+        internal class ScoreTerm
+        {
+            public Term term;                  // the variant term itself
+            public float score;                // mutable: reassigned in AddTerms before global ranking
+            internal Term fuzziedSourceTerm;   // the source term this variant was derived from
+
+            public ScoreTerm(Term term, float score, Term fuzziedSourceTerm)
+            {
+                this.fuzziedSourceTerm = fuzziedSourceTerm;
+                this.score = score;
+                this.term = term;
+            }
+        }
+
+        internal class ScoreTermQueue : Util.PriorityQueue<ScoreTerm>
+        {
+            public ScoreTermQueue(int size)
+                : base(size)
+            {
+            }
+
+            /// <summary>
+            /// Orders by score; entries with equal scores are ordered by reverse term comparison.
+            /// Overrides <see cref="Util.PriorityQueue{T}.LessThan(T, T)"/>.
+            /// </summary>
+            public override bool LessThan(ScoreTerm termA, ScoreTerm termB)
+            {
+                if (termA.score != termB.score)
+                {
+                    return termA.score < termB.score;
+                }
+                return termA.term.CompareTo(termB.term) > 0;
+            }
+        }
+
+        /// <summary>
+        /// Always returns <c>null</c>; this query does not build a textual representation.
+        /// Overrides <see cref="Query.ToString(string)"/>.
+        /// </summary>
+        /// <param name="field">Not used.</param>
+        /// <returns><c>null</c></returns>
+        public override string ToString(string field)
+        {
+            return null;
+        }
+
+        public virtual bool IgnoreTF // when true, Rewrite wraps each variant TermQuery in a ConstantScoreQuery
+        {
+            get { return ignoreTF; }
+            set { ignoreTF = value; }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs
new file mode 100644
index 0000000..545b567
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyQuery.cs
@@ -0,0 +1,215 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Implements the classic fuzzy search query. The similarity measurement
+    /// is based on the Levenshtein (edit distance) algorithm.
+    /// <para/>
+    /// Note that, unlike <see cref="FuzzyQuery"/>, this query will silently allow
+    /// for a (possibly huge) number of edit distances in comparisons, and may
+    /// be extremely slow (comparing every term in the index).
+    /// </summary>
+    [Obsolete("Use FuzzyQuery instead.")]
+    public class SlowFuzzyQuery : MultiTermQuery
+    {
+        public readonly static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+        public readonly static int defaultPrefixLength = 0;
+        public readonly static int defaultMaxExpansions = 50;
+
+        private float minimumSimilarity;
+        private int prefixLength;
+        private bool termLongEnough = false;
+
+        protected Term term;
+
+        /// <summary>
+        /// Create a new <see cref="SlowFuzzyQuery"/> that will match terms with a similarity 
+        /// of at least <paramref name="minimumSimilarity"/> to <paramref name="term"/>.
+        /// If a <paramref name="prefixLength"/> &gt; 0 is specified, a common prefix
+        /// of that length is also required.
+        /// </summary>
+        /// <param name="term">the term to search for</param>
+        /// <param name="minimumSimilarity">
+        /// a value between 0 and 1 to set the required similarity
+        /// between the query term and the matching terms. For example, for a
+        /// <paramref name="minimumSimilarity"/> of <c>0.5</c> a term of the same length
+        /// as the query term is considered similar to the query term if the edit distance
+        /// between both terms is less than <c>length(term)*0.5</c>
+        /// <para/>
+        /// Alternatively, if <paramref name="minimumSimilarity"/> is &gt;= 1f, it is interpreted
+        /// as a pure Levenshtein edit distance. For example, a value of <c>2f</c>
+        /// will match all terms within an edit distance of <c>2</c> from the
+        /// query term. Edit distances specified in this way may not be fractional.
+        /// </param>
+        /// <param name="prefixLength">length of common (non-fuzzy) prefix</param>
+        /// <param name="maxExpansions">
+        /// the maximum number of terms to match. If this number is
+        /// greater than <see cref="BooleanQuery.MaxClauseCount"/> when the query is rewritten,
+        /// then the maxClauseCount will be used instead.
+        /// </param>
+        /// <exception cref="ArgumentException">
+        /// if <paramref name="minimumSimilarity"/> is &lt; 0 or is a fractional value &gt;= 1,
+        /// or if <paramref name="prefixLength"/> or <paramref name="maxExpansions"/> is &lt; 0
+        /// </exception>
+        public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength,
+            int maxExpansions)
+            : base(term.Field)
+        {
+            this.term = term;
+
+            if (minimumSimilarity >= 1.0f && minimumSimilarity != (int)minimumSimilarity)
+                throw new ArgumentException("fractional edit distances are not allowed");
+            if (minimumSimilarity < 0.0f)
+                throw new ArgumentException("minimumSimilarity < 0");
+            if (prefixLength < 0)
+                throw new ArgumentException("prefixLength < 0");
+            if (maxExpansions < 0)
+                throw new ArgumentException("maxExpansions < 0");
+
+            SetRewriteMethod(new MultiTermQuery.TopTermsScoringBooleanQueryRewrite(maxExpansions));
+
+            string text = term.Text();
+            int len = text.CodePointCount(0, text.Length);
+            if (len > 0 && (minimumSimilarity >= 1f || len > 1.0f / (1.0f - minimumSimilarity)))
+            {
+                this.termLongEnough = true; // otherwise GetTermsEnum only enumerates the exact term
+            }
+
+            this.minimumSimilarity = minimumSimilarity;
+            this.prefixLength = prefixLength;
+        }
+
+        /// <summary>
+        /// Calls <see cref="SlowFuzzyQuery(Term, float, int, int)">SlowFuzzyQuery(term, minimumSimilarity, prefixLength, defaultMaxExpansions)</see>.
+        /// </summary>
+        public SlowFuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
+            : this(term, minimumSimilarity, prefixLength, defaultMaxExpansions)
+        {
+        }
+
+        /// <summary>
+        /// Calls <see cref="SlowFuzzyQuery(Term, float, int, int)">SlowFuzzyQuery(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions)</see>.
+        /// </summary>
+        public SlowFuzzyQuery(Term term, float minimumSimilarity)
+            : this(term, minimumSimilarity, defaultPrefixLength, defaultMaxExpansions)
+        {
+        }
+
+        /// <summary>
+        /// Calls <see cref="SlowFuzzyQuery(Term, float, int, int)">SlowFuzzyQuery(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions)</see>.
+        /// </summary>
+        public SlowFuzzyQuery(Term term)
+            : this(term, defaultMinSimilarity, defaultPrefixLength, defaultMaxExpansions)
+        {
+        }
+
+        /// <summary>
+        /// Gets the minimum similarity that is required for this query to match.
+        /// This is the constructor argument: a similarity of at least 0 and below 1, or a whole-number edit distance of 1 or more.
+        /// </summary>
+        public virtual float MinSimilarity
+        {
+            get { return minimumSimilarity; }
+        }
+
+        /// <summary>
+        /// Gets the non-fuzzy prefix length. This is the number of characters at the start
+        /// of a term that must be identical (not fuzzy) to the query term if the query
+        /// is to match that term.
+        /// </summary>
+        public virtual int PrefixLength
+        {
+            get { return prefixLength; } // value supplied to the constructor
+        }
+
+        public override TermsEnum GetTermsEnum(Terms terms, AttributeSource atts)
+        {
+            if (termLongEnough)
+            {
+                return new SlowFuzzyTermsEnum(terms, atts, Term, minimumSimilarity, prefixLength);
+            }
+            return new SingleTermsEnum(terms.Iterator(null), term.Bytes); // can only match if it's exact
+        }
+
+        /// <summary>
+        /// Gets the pattern term (the term passed to the constructor).
+        /// </summary>
+        public virtual Term Term
+        {
+            get { return term; }
+        }
+
+        public override string ToString(string field)
+        {
+            // Format: [field:]text~minimumSimilarity followed by ToStringUtils.Boost(Boost).
+            StringBuilder sb = new StringBuilder();
+            if (!term.Field.Equals(field))
+            {
+                sb.Append(term.Field).Append(":");
+            }
+            sb.Append(term.Text())
+                .Append('~')
+                .Append(Number.ToString(minimumSimilarity))
+                .Append(ToStringUtils.Boost(Boost));
+            return sb.ToString();
+        }
+
+        public override int GetHashCode()
+        {
+            // 31-multiplier hash over the base hash plus the fields that Equals compares.
+            int hash = base.GetHashCode();
+            hash = 31 * hash + Number.FloatToIntBits(minimumSimilarity);
+            hash = 31 * hash + prefixLength;
+            hash = 31 * hash + ((term != null) ? term.GetHashCode() : 0);
+            return hash;
+        }
+
+        public override bool Equals(object obj)
+        {
+            if (this == obj)
+            {
+                return true;
+            }
+            // The base comparison runs first; the runtime-type check follows it.
+            if (!base.Equals(obj) || GetType() != obj.GetType())
+            {
+                return false;
+            }
+            SlowFuzzyQuery that = (SlowFuzzyQuery)obj;
+            if (Number.FloatToIntBits(minimumSimilarity) != Number.FloatToIntBits(that.minimumSimilarity))
+            {
+                return false;
+            }
+            if (prefixLength != that.prefixLength)
+            {
+                return false;
+            }
+            // Terms: both null, or equal according to Term.Equals.
+            return (term == null) ? that.term == null : term.Equals(that.term);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs
new file mode 100644
index 0000000..8be182c
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/SlowFuzzyTermsEnum.cs
@@ -0,0 +1,293 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Potentially slow fuzzy <see cref="TermsEnum"/> for enumerating all terms that are similar
+    /// to the specified filter term.
+    /// <para/>
+    /// If the minSimilarity or maxEdits is greater than the Automaton's
+    /// allowable range, this backs off to the classic (brute force)
+    /// fuzzy terms enum method by calling <see cref="FuzzyTermsEnum.GetAutomatonEnum(int, BytesRef)"/>.
+    /// <para/>
+    /// Term enumerations are always ordered by
+    /// <see cref="FuzzyTermsEnum.Comparator"/>. Each term in the enumeration is
+    /// greater than all that precede it.
+    /// </summary>
+    [Obsolete("Use FuzzyTermsEnum instead.")]
+    public class SlowFuzzyTermsEnum : FuzzyTermsEnum
+    {
+        /// <summary>
+        /// Creates the enumerator by forwarding every argument to the <see cref="FuzzyTermsEnum"/>
+        /// base constructor, always passing <c>false</c> as that constructor's final argument.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): the final base argument is presumably the "transpositions" flag - confirm
+        /// against <see cref="FuzzyTermsEnum"/>.
+        /// </remarks>
+        public SlowFuzzyTermsEnum(
+            Terms terms,
+            AttributeSource atts,
+            Term term,
+            float minSimilarity,
+            int prefixLength)
+            : base(terms, atts, term, minSimilarity, prefixLength, false)
+        {
+        }
+
+        /// <summary>
+        /// Chooses the enumerator to use for the given maximum edit distance: the one returned by
+        /// <see cref="FuzzyTermsEnum.GetAutomatonEnum(int, BytesRef)"/> when it is not <c>null</c>,
+        /// otherwise - and only when <paramref name="init"/> is <c>true</c> - the brute-force
+        /// linear enumerator.
+        /// </summary>
+        protected override void MaxEditDistanceChanged(BytesRef lastTerm, int maxEdits, bool init)
+        {
+            TermsEnum automatonEnum = GetAutomatonEnum(maxEdits, lastTerm);
+            if (automatonEnum == null)
+            {
+                // No automaton enumerator is available. Install the linear scan on the initial
+                // call only; on later calls the current enumerator is left untouched.
+                if (init)
+                {
+                    Enum = new LinearFuzzyTermsEnum(this);
+                }
+                return;
+            }
+
+            Enum = automatonEnum;
+        }
+
+        /// <summary>
+        /// Implement fuzzy enumeration with linear brute force.
+        /// </summary>
+        private class LinearFuzzyTermsEnum : FilteredTermsEnum
+        {
+            private readonly SlowFuzzyTermsEnum outerInstance;
+
+            /// <summary>
+            /// Allows us to save the time required to create a new array
+            /// every time similarity is called.
+            /// </summary>
+            private int[] d;
+            private int[] p;
+
+            /// <summary>this is the text, minus the prefix</summary>
+            private readonly int[] text;
+
+            private readonly IBoostAttribute boostAtt;
+
+            /// <summary>
+            /// Creates a brute-force enumerator over the terms of <paramref name="outerInstance"/>.
+            /// <para/>
+            /// The query term is split at the outer instance's real prefix length: the leading part
+            /// becomes the prefix every candidate must start with (and the initial seek term), the
+            /// remainder is kept as the text that candidates are compared against.
+            /// </summary>
+            /// <exception cref="IOException">If there is a low-level I/O error.</exception>
+            public LinearFuzzyTermsEnum(SlowFuzzyTermsEnum outerInstance)
+                : base(outerInstance.Terms.Iterator(null))
+            {
+                this.outerInstance = outerInstance;
+                boostAtt = Attributes().AddAttribute<IBoostAttribute>();
+
+                int prefixLen = outerInstance.RealPrefixLength;
+
+                // Code points of the query term that follow the prefix.
+                text = new int[outerInstance.TermLength - prefixLen];
+                System.Array.Copy(outerInstance.TermText, prefixLen, text, 0, text.Length);
+
+                // The prefix itself, as bytes, for the starts-with test in Accept.
+                prefixBytesRef = new BytesRef(UnicodeUtil.NewString(outerInstance.TermText, 0, prefixLen));
+
+                // Two reusable rows for the edit-distance computation (one slot per text position plus one).
+                d = new int[text.Length + 1];
+                p = new int[text.Length + 1];
+
+                // Hand the prefix to the base class as the initial seek term.
+                InitialSeekTerm = prefixBytesRef;
+            }
+
+            private readonly BytesRef prefixBytesRef;
+            /// <summary>used for unicode conversion from BytesRef byte[] to int[]</summary>
+            private readonly IntsRef utf32 = new IntsRef(20);
+
+            /// <summary>
+            /// <para>
+            /// Decides whether <paramref name="term"/> belongs in the enumeration, based on the
+            /// Levenshtein distance between it and the query term (both taken without the shared prefix).
+            /// </para>
+            /// <para>
+            /// A term that does not start with the prefix ends the enumeration. In raw mode a term is
+            /// accepted when its distance does not exceed the maximum number of edits; otherwise it is
+            /// accepted when its similarity is strictly greater than the minimum similarity, where
+            /// <code>
+            ///     similarity = 1 - ((float)distance / (float)(prefixLength + Math.Min(textlen, targetlen)));
+            /// </code>
+            /// and distance is the Levenshtein distance for the two words.
+            /// </para>
+            /// <para>
+            /// Every accepted term gets a boost of <c>(similarity - minSimilarity) * scaleFactor</c>.
+            /// </para>
+            /// </summary>
+            protected override sealed AcceptStatus Accept(BytesRef term)
+            {
+                if (!StringHelper.StartsWith(term, prefixBytesRef))
+                {
+                    return AcceptStatus.END;
+                }
+
+                UnicodeUtil.UTF8toUTF32(term, utf32);
+                int distance = CalcDistance(utf32.Ints, outerInstance.RealPrefixLength, utf32.Length - outerInstance.RealPrefixLength);
+
+                // int.MinValue is the sentinel meaning the distance computation stopped early.
+                if (distance == int.MinValue)
+                {
+                    return AcceptStatus.NO;
+                }
+
+                // In raw mode the edit count alone can reject the term.
+                if (outerInstance.Raw && distance > outerInstance.MaxEdits)
+                {
+                    return AcceptStatus.NO;
+                }
+
+                float similarity = CalcSimilarity(distance, (utf32.Length - outerInstance.RealPrefixLength), text.Length);
+
+                // In raw mode the distance is already known to be within maxEdits at this point;
+                // otherwise the similarity has to beat the minimum.
+                if (!outerInstance.Raw && !(similarity > outerInstance.MinSimilarity))
+                {
+                    return AcceptStatus.NO;
+                }
+
+                boostAtt.Boost = (similarity - outerInstance.MinSimilarity) * outerInstance.Scale_factor;
+                return AcceptStatus.YES;
+            }
+
+            /******************************
+             * Compute Levenshtein distance
+             ******************************/
+
+            /// <summary>
+            /// <para>
+            /// <see cref="CalcDistance(int[], int, int)"/> returns the Levenshtein distance between the query term
+            /// and the target term.
+            /// </para>
+            /// <para>
+            /// Embedded within this algorithm is a fail-fast Levenshtein distance
+            /// algorithm.  The fail-fast algorithm differs from the standard Levenshtein
+            /// distance algorithm in that it is aborted if it is discovered that the
+            /// minimum distance between the words is greater than some threshold.
+            /// </para>
+            /// <para>
+            /// Levenshtein distance (also known as edit distance) is a measure of similarity
+            /// between two strings where the distance is measured as the number of character
+            /// deletions, insertions or substitutions required to transform one string to
+            /// the other string.
+            /// </para>
+            /// </summary>
+            /// <param name="target">the target word or phrase</param>
+            /// <param name="offset">the offset at which to start the comparison</param>
+            /// <param name="length">the length of what's left of the string to compare</param>
+            /// <returns>
+            /// the number of edits, or <see cref="int.MinValue"/> if the computation was abandoned
+            /// because the edit distance is greater than the maximum distance of interest. When either
+            /// the query text or the target is empty, the length of the other one is returned directly,
+            /// without applying the maximum-distance check.
+            /// </returns>
+            private int CalcDistance(int[] target, int offset, int length)
+            {
+                int m = length;
+                int n = text.Length;
+                if (n == 0)
+                {
+                    //we don't have anything to compare.  That means if we just add
+                    //the letters for m we get the new word
+                    return m;
+                }
+                if (m == 0)
+                {
+                    //the target has nothing left after the prefix, so every character
+                    //of the query text has to be added
+                    return n;
+                }
+
+                int maxDistance = CalculateMaxDistance(m);
+
+                if (maxDistance < Math.Abs(m - n))
+                {
+                    //just adding the characters of m to n or vice-versa results in
+                    //too many edits
+                    //for example "pre" length is 3 and "prefixes" length is 8.  We can see that
+                    //given this optimal circumstance, the edit distance cannot be less than 5.
+                    //which is 8-3 or more precisely Math.Abs(3-8).
+                    //if our maximum edit distance is 4, then we can discard this word
+                    //without looking at it.
+                    return int.MinValue;
+                }
+
+                // init the 'previous row' p: turning an empty target into the first i characters of text costs i edits
+                for (int i = 0; i <= n; ++i)
+                {
+                    p[i] = i;
+                }
+
+                // start computing edit distance
+                for (int j = 1; j <= m; ++j)
+                { // iterates through target
+                    int bestPossibleEditDistance = m;
+                    int t_j = target[offset + j - 1]; // jth character of t
+                    d[0] = j;
+
+                    for (int i = 1; i <= n; ++i)
+                    { // iterates through text
+                      // minimum of cell to the left+1, to the top+1, diagonally left and up +(0|1)
+                        if (t_j != text[i - 1])
+                        {
+                            d[i] = Math.Min(Math.Min(d[i - 1], p[i]), p[i - 1]) + 1;
+                        }
+                        else
+                        {
+                            d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1]);
+                        }
+                        bestPossibleEditDistance = Math.Min(bestPossibleEditDistance, d[i]);
+                    }
+
+                    //After calculating row j, the best possible edit distance
+                    //is the smallest value found in that row (positions 1..n).
+                    //If the bestPossibleEditDistance is greater than the max distance, abort.
+
+                    if (j > maxDistance && bestPossibleEditDistance > maxDistance)
+                    {  //equal is okay, but not greater
+                       //the closest the target can be to the text is just too far away.
+                       //this target is leaving the party early.
+                        return int.MinValue;
+                    }
+
+                    // copy current distance counts to 'previous row' distance counts: swap p and d
+                    int[] _d = p;
+                    p = d;
+                    d = _d;
+                }
+
+                // our last action in the above loop was to switch d and p, so p now
+                // actually has the most recent cost counts
+
+                return p[n];
+            }
+
+            /// <summary>
+            /// Converts an edit count into a similarity score by normalizing it against the real
+            /// prefix length plus the shorter of the two compared lengths.
+            /// </summary>
+            /// <param name="edits">the edit distance between the two words</param>
+            /// <param name="m">one of the two compared lengths</param>
+            /// <param name="n">the other compared length</param>
+            /// <returns><c>1 - edits / (realPrefixLength + min(n, m))</c></returns>
+            private float CalcSimilarity(int edits, int m, int n)
+            {
+                // The result drops below 0.0 when the edit distance exceeds the normalizer.
+                // This is the formula that was previously used in FuzzyTermEnum, so it is kept
+                // as-is (even though minimumSimilarity must be greater than 0.0).
+                int shorterLength = Math.Min(n, m);
+                float normalizer = (float)(outerInstance.RealPrefixLength + shorterLength);
+
+                return 1.0f - ((float)edits / normalizer);
+            }
+
+            /// <summary>
+            /// Computes the largest Levenshtein distance worth considering when the text is
+            /// compared with a value of length <paramref name="m"/>: the maximum number of edits
+            /// in raw mode, otherwise the smaller of the maximum number of edits and the bound
+            /// derived from the minimum similarity.
+            /// </summary>
+            /// <param name="m">the length of the "other value"</param>
+            /// <returns>the maximum levenshtein distance that we care about</returns>
+            private int CalculateMaxDistance(int m)
+            {
+                if (outerInstance.Raw)
+                {
+                    return outerInstance.MaxEdits;
+                }
+
+                int similarityBound =
+                    (int)((1 - outerInstance.MinSimilarity) * (Math.Min(text.Length, m) + outerInstance.RealPrefixLength));
+                return Math.Min(outerInstance.MaxEdits, similarityBound);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs b/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs
new file mode 100644
index 0000000..cb53d81
--- /dev/null
+++ b/src/Lucene.Net.Sandbox/Queries/SortedSetSortField.cs
@@ -0,0 +1,384 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Util;
+using System;
+using System.Diagnostics;
+using System.Text;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// SortField for <see cref="SortedSetDocValues"/>.
+    /// <para/>
+    /// A <see cref="SortedSetDocValues"/> contains multiple values for a field, so sorting with
+    /// this technique "selects" a value as the representative sort value for the document.
+    /// <para/>
+    /// By default, the minimum value in the set is selected as the sort value, but
+    /// this can be customized. Selectors other than the default do have some limitations
+    /// (see below) to ensure that all selections happen in constant-time for performance.
+    /// <para/>
+    /// Like sorting by string, this also supports sorting missing values as first or last,
+    /// via <see cref="MissingValue"/>.
+    /// <para/>
+    /// Limitations:
+    /// <list type="bullet">
+    ///     <item>
+    ///     Fields containing <see cref="int.MaxValue"/> or more unique values
+    ///     are unsupported.
+    ///     </item>
+    ///     <item>
+    ///     Selectors other than the default <see cref="Selector.MIN"/> require 
+    ///     optional codec support. However several codecs provided by Lucene,
+    ///     including the current default codec, support this.
+    ///     </item>
+    /// </list>
+    /// </summary>
+    public class SortedSetSortField : SortField
+    {
+        // LUCENENET NOTE: Selector enum moved outside of this class to prevent
+        // naming conflicts.
+
+        private readonly Selector selector;
+
+        /// <summary>
+        /// Creates a sort, optionally reversed, that uses the minimum value in each
+        /// document's set. Equivalent to calling the three-argument constructor with
+        /// <see cref="Selector.MIN"/>.
+        /// </summary>
+        /// <param name="field">Name of field to sort by.  Must not be null.</param>
+        /// <param name="reverse">True if natural order should be reversed.</param>
+        public SortedSetSortField(string field, bool reverse)
+            : this(field, reverse, Selector.MIN)
+        {
+        }
+
+        /// <summary>
+        /// Creates a sort, optionally reversed, in which <paramref name="selector"/> determines
+        /// which value of the document's set is used as the sort value.
+        /// </summary>
+        /// <param name="field">Name of field to sort by.  Must not be null.</param>
+        /// <param name="reverse">True if natural order should be reversed.</param>
+        /// <param name="selector">
+        /// custom selector for choosing the sort value from the set.
+        /// <para/>
+        /// NOTE: selectors other than <see cref="Selector.MIN"/> require optional codec support.
+        /// </param>
+        public SortedSetSortField(string field, bool reverse, Selector selector)
+            : base(field, SortField.Type_e.CUSTOM, reverse)
+        {
+            // LUCENENET NOTE: Selector is an enum and therefore can never be null in .NET,
+            // so no null check on the parameter is needed here.
+            this.selector = selector;
+        }
+
+        /// <summary>Gets the selector that was supplied when this sort field was constructed.</summary>
+        public Selector Selector
+        {
+            get
+            {
+                return selector;
+            }
+        }
+
+        /// <summary>Combines the base hash code with the hash code of the selector.</summary>
+        public override int GetHashCode()
+        {
+            int baseHash = base.GetHashCode();
+            return 31 * baseHash + selector.GetHashCode();
+        }
+
+        /// <summary>
+        /// Two instances are equal when the base class considers them equal, they have the
+        /// same runtime type, and they use the same selector.
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            if (this == obj)
+            {
+                return true;
+            }
+            if (!base.Equals(obj) || GetType() != obj.GetType())
+            {
+                return false;
+            }
+
+            SortedSetSortField that = (SortedSetSortField)obj;
+            return selector == that.selector;
+        }
+
+        /// <summary>
+        /// Renders the field as <c>&lt;sortedset: "field"&gt;</c>, followed by <c>!</c> when reversed,
+        /// the missing value when one is set, and finally the selector.
+        /// </summary>
+        public override string ToString()
+        {
+            StringBuilder sb = new StringBuilder();
+
+            sb.Append("<sortedset: \"");
+            sb.Append(Field);
+            sb.Append("\">");
+
+            if (Reverse)
+            {
+                sb.Append('!');
+            }
+
+            if (missingValue != null)
+            {
+                sb.Append(" missingValue=").Append(missingValue);
+            }
+
+            sb.Append(" selector=").Append(selector);
+
+            return sb.ToString();
+        }
+
+        /// <summary>
+        /// Sets how missing values (the empty set) are sorted.
+        /// <para/>
+        /// The value must be the <see cref="SortField.STRING_FIRST"/> or
+        /// <see cref="SortField.STRING_LAST"/> instance itself; the check is a reference comparison.
+        /// </summary>
+        /// <exception cref="ArgumentException">If any other value is supplied.</exception>
+        public override object MissingValue
+        {
+            set
+            {
+                bool isAllowedSentinel = value == STRING_FIRST || value == STRING_LAST;
+                if (!isAllowedSentinel)
+                {
+                    throw new ArgumentException("For SORTED_SET type, missing value must be either STRING_FIRST or STRING_LAST");
+                }
+
+                this.missingValue = value;
+            }
+        }
+
+        /// <summary>
+        /// Comparator that sorts on a single ordinal per document, chosen from the document's
+        /// <see cref="SortedSetDocValues"/> according to the selector of the owning
+        /// <see cref="SortedSetSortField"/>.
+        /// </summary>
+        internal class TermOrdValComparatorAnonymousHelper : FieldComparator.TermOrdValComparator
+        {
+            private readonly SortedSetSortField outerInstance;
+
+            /// <summary>
+            /// Creates the comparator for the given sort field.
+            /// </summary>
+            /// <param name="outerInstance">The sort field supplying the field name, missing value and selector.</param>
+            /// <param name="numHits">Forwarded unchanged to the base comparator.</param>
+            public TermOrdValComparatorAnonymousHelper(SortedSetSortField outerInstance, int numHits)
+                : base(numHits, outerInstance.Field, outerInstance.missingValue == STRING_LAST)
+            {
+                this.outerInstance = outerInstance;
+            }
+
+            /// <summary>
+            /// Obtains the multi-valued ordinals for <paramref name="field"/> from the field cache
+            /// and wraps them so that each document exposes exactly one ordinal.
+            /// </summary>
+            /// <exception cref="NotSupportedException">
+            /// If the field has <see cref="int.MaxValue"/> or more unique values, or if a selector other
+            /// than <see cref="Selector.MIN"/> is used and the values are not <see cref="RandomAccessOrds"/>.
+            /// </exception>
+            /// <exception cref="InvalidOperationException">If the selector is not one of the known values.</exception>
+            protected override SortedDocValues GetSortedDocValues(AtomicReaderContext context, string field)
+            {
+                SortedSetDocValues sortedSet = FieldCache.DEFAULT.GetDocTermOrds(context.AtomicReader, field);
+
+                if (sortedSet.ValueCount >= int.MaxValue)
+                {
+                    throw new NotSupportedException("fields containing more than " + (int.MaxValue - 1) + " unique terms are unsupported");
+                }
+
+                SortedDocValues singleton = DocValues.UnwrapSingleton(sortedSet);
+                if (singleton != null)
+                {
+                    // it's actually single-valued in practice, but indexed as multi-valued,
+                    // so just sort on the underlying single-valued dv directly.
+                    // regardless of selector type, this optimization is safe!
+                    return singleton;
+                }
+
+                if (outerInstance.selector == Selector.MIN)
+                {
+                    return new MinValue(sortedSet);
+                }
+
+                // Every remaining selector needs random access to a document's ordinals.
+                RandomAccessOrds randomOrds = sortedSet as RandomAccessOrds;
+                if (randomOrds == null)
+                {
+                    throw new NotSupportedException("codec does not support random access ordinals, cannot use selector: " + outerInstance.selector);
+                }
+
+                switch (outerInstance.selector)
+                {
+                    case Selector.MAX: return new MaxValue(randomOrds);
+                    case Selector.MIDDLE_MIN: return new MiddleMinValue(randomOrds);
+                    case Selector.MIDDLE_MAX: return new MiddleMaxValue(randomOrds);
+                    case Selector.MIN:
+                    default:
+                        // MIN was handled above, so reaching this point means the selector holds an
+                        // unexpected value. Fail loudly in every build configuration instead of
+                        // returning null (Debug.Assert is compiled out of release builds).
+                        Debug.Assert(false);
+                        throw new InvalidOperationException("unexpected selector: " + outerInstance.selector);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Creates the comparator for this sort field. <paramref name="sortPos"/> is not used.
+        /// </summary>
+        public override FieldComparator GetComparator(int numHits, int sortPos)
+        {
+            FieldComparator comparator = new TermOrdValComparatorAnonymousHelper(this, numHits);
+            return comparator;
+        }
+
+        /// <summary>
+        /// Presents a <see cref="SortedSetDocValues"/> as single-valued: for each document it
+        /// reports the first ordinal the wrapped instance yields (min).
+        /// </summary>
+        internal class MinValue : SortedDocValues
+        {
+            internal readonly SortedSetDocValues @in;
+
+            internal MinValue(SortedSetDocValues @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>Positions the wrapped values on the document and returns its first ordinal.</summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = docID;
+                return (int)@in.NextOrd();
+            }
+
+            /// <summary>Delegates to the wrapped values.</summary>
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            /// <summary>The wrapped value count, narrowed to <see cref="int"/>.</summary>
+            public override int ValueCount
+            {
+                get
+                {
+                    return (int)@in.ValueCount;
+                }
+            }
+
+            /// <summary>Delegates to the wrapped values and narrows the result to <see cref="int"/>.</summary>
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+
+        /// <summary>
+        /// Presents a <see cref="RandomAccessOrds"/> as single-valued: for each document it
+        /// reports the ordinal at the last position (max), or -1 when the document has none.
+        /// </summary>
+        internal class MaxValue : SortedDocValues
+        {
+            internal readonly RandomAccessOrds @in;
+
+            internal MaxValue(RandomAccessOrds @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>Returns the document's last ordinal, or -1 if it has no ordinals.</summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = docID;
+                int count = @in.Cardinality();
+                return count == 0 ? -1 : (int)@in.OrdAt(count - 1);
+            }
+
+            /// <summary>Delegates to the wrapped values.</summary>
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            /// <summary>The wrapped value count, narrowed to <see cref="int"/>.</summary>
+            public override int ValueCount
+            {
+                get
+                {
+                    return (int)@in.ValueCount;
+                }
+            }
+
+            /// <summary>Delegates to the wrapped values and narrows the result to <see cref="int"/>.</summary>
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+
+        /// <summary>
+        /// Wraps a <see cref="RandomAccessOrds"/> and returns the middle ordinal of each document
+        /// (the lower of the two middle ordinals when the count is even), or -1 when the
+        /// document has no ordinals.
+        /// </summary>
+        internal class MiddleMinValue : SortedDocValues
+        {
+            internal readonly RandomAccessOrds @in;
+
+            internal MiddleMinValue(RandomAccessOrds @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>Returns the ordinal at position <c>(count - 1) / 2</c>, or -1 if the document has no ordinals.</summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = (docID);
+                int count = @in.Cardinality();
+                if (count == 0)
+                {
+                    return -1;
+                }
+                else
+                {
+                    // Unsigned shift of (count - 1). The shift must happen inside the cast back to
+                    // int: a cast binds tighter than >>, so without the extra parentheses the shift
+                    // would be applied to a signed int.
+                    return (int)@in.OrdAt((int)((uint)(count - 1) >> 1));
+                }
+            }
+
+            /// <summary>Delegates to the wrapped values.</summary>
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            /// <summary>The wrapped value count, narrowed to <see cref="int"/>.</summary>
+            public override int ValueCount
+            {
+                get { return (int)@in.ValueCount; }
+            }
+
+            /// <summary>Delegates to the wrapped values and narrows the result to <see cref="int"/>.</summary>
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+
+        /// <summary>
+        /// Presents a <see cref="RandomAccessOrds"/> as single-valued: for each document it
+        /// reports the middle ordinal (the higher of the two middle ordinals when the count
+        /// is even), or -1 when the document has none.
+        /// </summary>
+        internal class MiddleMaxValue : SortedDocValues
+        {
+            internal readonly RandomAccessOrds @in;
+
+            internal MiddleMaxValue(RandomAccessOrds @in)
+            {
+                this.@in = @in;
+            }
+
+            /// <summary>Returns the ordinal at position <c>count / 2</c>, or -1 if the document has no ordinals.</summary>
+            public override int GetOrd(int docID)
+            {
+                @in.Document = docID;
+                int count = @in.Cardinality();
+                if (count == 0)
+                {
+                    return -1;
+                }
+
+                int middle = (int)((uint)count >> 1);
+                return (int)@in.OrdAt(middle);
+            }
+
+            /// <summary>Delegates to the wrapped values.</summary>
+            public override void LookupOrd(int ord, BytesRef result)
+            {
+                @in.LookupOrd(ord, result);
+            }
+
+            /// <summary>The wrapped value count, narrowed to <see cref="int"/>.</summary>
+            public override int ValueCount
+            {
+                get
+                {
+                    return (int)@in.ValueCount;
+                }
+            }
+
+            /// <summary>Delegates to the wrapped values and narrows the result to <see cref="int"/>.</summary>
+            public override int LookupTerm(BytesRef key)
+            {
+                return (int)@in.LookupTerm(key);
+            }
+        }
+    }
+
+    /// <summary>Selects a value from the document's set to use as the sort value.</summary>
+    public enum Selector
+    {
+        /// <summary>
+        /// Selects the minimum value in the set.
+        /// </summary>
+        MIN,
+        /// <summary>
+        /// Selects the maximum value in the set.
+        /// </summary>
+        MAX,
+        /// <summary>
+        /// Selects the middle value in the set.
+        /// <para/>
+        /// If the set has an even number of values, the lower of the middle two is chosen.
+        /// </summary>
+        MIDDLE_MIN,
+        /// <summary>
+        /// Selects the middle value in the set.
+        /// <para/>
+        /// If the set has an even number of values, the higher of the middle two is chosen.
+        /// </summary>
+        MIDDLE_MAX
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj b/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj
new file mode 100644
index 0000000..f0493af
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Lucene.Net.Tests.Sandbox.csproj
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{7865CBC8-2C6B-462C-ACF5-B2C4D60D93C9}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Sandbox</RootNamespace>
+    <AssemblyName>Lucene.Net.Tests.Sandbox</AssemblyName>
+    <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+    <TargetFrameworkProfile />
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="nunit.framework, Version=2.6.3.13283, Culture=neutral, PublicKeyToken=96d09a1eb7f44a77, processorArchitecture=MSIL">
+      <HintPath>..\packages\NUnit.2.6.3\lib\nunit.framework.dll</HintPath>
+      <Private>True</Private>
+    </Reference>
+    <Reference Include="System" />
+    <Reference Include="System.Core" />
+    <Reference Include="System.Xml.Linq" />
+    <Reference Include="System.Data.DataSetExtensions" />
+    <Reference Include="Microsoft.CSharp" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Net.Http" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="Queries\DuplicateFilterTest.cs" />
+    <Compile Include="Queries\FuzzyLikeThisQueryTest.cs" />
+    <Compile Include="Queries\TestSlowFuzzyQuery.cs" />
+    <Compile Include="Queries\TestSlowFuzzyQuery2.cs" />
+    <Compile Include="Queries\TestSortedSetSortField.cs" />
+    <Compile Include="Queries\TestSortedSetSortFieldDocValues.cs" />
+    <Compile Include="Queries\TestSortedSetSortFieldSelectors.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Lucene.Net.Codecs\Lucene.Net.Codecs.csproj">
+      <Project>{3F79B6D4-4359-4F83-B64F-07F4F6262425}</Project>
+      <Name>Lucene.Net.Codecs</Name>
+    </ProjectReference>
+    <ProjectReference Include="..\Lucene.Net.Core\Lucene.Net.csproj">
+      <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project>
+      <Name>Lucene.Net</Name>
+    </ProjectReference>
+    <ProjectReference Include="..\Lucene.Net.Sandbox\Lucene.Net.Sandbox.csproj">
+      <Project>{13274ba9-9052-4354-8ffe-e3f32593368f}</Project>
+      <Name>Lucene.Net.Sandbox</Name>
+    </ProjectReference>
+    <ProjectReference Include="..\Lucene.Net.TestFramework\Lucene.Net.TestFramework.csproj">
+      <Project>{B2C0D749-CE34-4F62-A15E-00CB2FF5DDB3}</Project>
+      <Name>Lucene.Net.TestFramework</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="packages.config" />
+  </ItemGroup>
+  <ItemGroup>
+    <EmbeddedResource Include="Queries\fuzzyTestData.txt" />
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs b/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..18e0d47
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General information about the Lucene.Net.Tests.Sandbox assembly is controlled
+// through the following set of attributes. Change these attribute values to modify
+// the information associated with the assembly.
+[assembly: AssemblyTitle("Lucene.Net.Tests.Sandbox")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("")]
+[assembly: AssemblyProduct("Lucene.Net.Tests.Sandbox")]
+[assembly: AssemblyCopyright("Copyright ©  2016")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components.  If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("7865cbc8-2c6b-462c-acf5-b2c4d60d93c9")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+[assembly: AssemblyVersion("1.0.0.0")] // All four parts are given explicitly; the '*' form above is not used.
+[assembly: AssemblyFileVersion("1.0.0.0")]

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs b/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs
new file mode 100644
index 0000000..a0e4742
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Queries/DuplicateFilterTest.cs
@@ -0,0 +1,185 @@
+using Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class DuplicateFilterTest : LuceneTestCase // Exercises DuplicateFilter against an index in which several documents share a url.
+    {
+        private static readonly string KEY_FIELD = "url"; // Field name handed to DuplicateFilter as the de-duplication key.
+        private Directory directory;
+        private IndexReader reader;
+        TermQuery tq = new TermQuery(new Term("text", "lucene")); // Query used by every test: term "lucene" in field "text".
+        private IndexSearcher searcher;
+
+        // Indexes eight documents spread over two urls, merges to one segment and opens a searcher.
+        public override void SetUp()
+        {
+            base.SetUp();
+            directory = NewDirectory();
+            RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
+
+            // Add a series of docs with filterable fields: url, text and date.
+            AddDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
+            AddDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
+            AddDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
+            AddDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
+            AddDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
+            AddDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
+            AddDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
+            AddDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
+
+            // Until we fix LUCENE-2348, the index must
+            // have only 1 segment:
+            writer.ForceMerge(1);
+
+            reader = writer.Reader;
+            writer.Dispose();
+            searcher = NewSearcher(reader);
+
+        }
+        // Releases the reader and the directory created in SetUp, then runs the base tear-down.
+        public override void TearDown()
+        {
+            reader.Dispose();
+            directory.Dispose();
+            base.TearDown();
+        }
+        // Adds one document: url as a string field, text and date as text fields, all stored.
+        private void AddDoc(RandomIndexWriter writer, string url, string text, string date)
+        {
+            Document doc = new Document();
+            doc.Add(NewStringField(KEY_FIELD, url, Field.Store.YES));
+            doc.Add(NewTextField("text", text, Field.Store.YES));
+            doc.Add(NewTextField("date", date, Field.Store.YES));
+            writer.AddDocument(doc);
+        }
+        // With a filter left at its defaults, no url may occur more than once among the hits.
+        [Test]
+        public void TestDefaultFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            HashSet<string> results = new HashSet<string>();
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                assertFalse("No duplicate urls should be returned", results.contains(url));
+                results.add(url);
+            }
+        }
+        [Test]
+        public void TestNoFilter() // Control case: with a null filter the same url must show up more than once.
+        {
+            HashSet<string> results = new HashSet<string>();
+            ScoreDoc[] hits = searcher.Search(tq, null, 1000).ScoreDocs;
+            assertTrue("Default searching should have found some matches", hits.Length > 0);
+            bool dupsFound = false;
+
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                if (!dupsFound)
+                    dupsFound = results.contains(url);
+                results.add(url);
+            }
+            assertTrue("Default searching should have found duplicate urls", dupsFound);
+        }
+        // With PM_FAST_INVALIDATION the hits must still carry unique urls, exactly two of them.
+        [Test]
+        public void TestFastFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.ProcessingMode = (ProcessingMode.PM_FAST_INVALIDATION);
+            HashSet<string> results = new HashSet<string>();
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
+
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                assertFalse("No duplicate urls should be returned", results.contains(url));
+                results.add(url);
+            }
+            assertEquals("Two urls found", 2, results.size());
+        }
+        // With KM_USE_LAST_OCCURRENCE each hit must be the last document enumerated for its url term.
+        [Test]
+        public void TestKeepsLastFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.KeepMode = (KeepMode.KM_USE_LAST_OCCURRENCE);
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                DocsEnum td = TestUtil.Docs(Random(), reader,
+                    KEY_FIELD,
+                    new BytesRef(url),
+                    MultiFields.GetLiveDocs(reader),
+                    null,
+                    0);
+                // Exhaust the enumeration so lastDoc ends on the final doc id returned for this url.
+                int lastDoc = 0;
+                while (td.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
+                {
+                    lastDoc = td.DocID();
+                }
+                assertEquals("Duplicate urls should return last doc", lastDoc, hit.Doc);
+            }
+        }
+        // With KM_USE_FIRST_OCCURRENCE each hit must be the first document enumerated for its url term.
+        [Test]
+        public void TestKeepsFirstFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.KeepMode = (KeepMode.KM_USE_FIRST_OCCURRENCE);
+            ScoreDoc[] hits = searcher.Search(tq, df, 1000).ScoreDocs;
+            assertTrue("Filtered searching should have found some matches", hits.Length > 0);
+            foreach (ScoreDoc hit in hits)
+            {
+                Document d = searcher.Doc(hit.Doc);
+                string url = d.Get(KEY_FIELD);
+                DocsEnum td = TestUtil.Docs(Random(), reader,
+                    KEY_FIELD,
+                    new BytesRef(url),
+                    MultiFields.GetLiveDocs(reader),
+                    null,
+                    0);
+                // Only one NextDoc() call is made, so lastDoc (despite its name) holds the first doc id.
+                int lastDoc = 0;
+                td.NextDoc();
+                lastDoc = td.DocID();
+                assertEquals("Duplicate urls should return first doc", lastDoc, hit.Doc);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/87245e31/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs b/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs
new file mode 100644
index 0000000..4b830c6
--- /dev/null
+++ b/src/Lucene.Net.Tests.Sandbox/Queries/FuzzyLikeThisQueryTest.cs
@@ -0,0 +1,159 @@
+using Lucene.Net.Analysis;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Sandbox.Queries
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class FuzzyLikeThisQueryTest : LuceneTestCase // Checks the term variants and the top-ranked hit produced by FuzzyLikeThisQuery.
+    {
+        private Directory directory;
+        private IndexSearcher searcher;
+        private IndexReader reader;
+        private Analyzer analyzer; // Passed to the FuzzyLikeThisQuery instances built in the tests.
+        // Indexes six documents whose "name" values are spelling variants, then opens a searcher.
+        public override void SetUp()
+        {
+            base.SetUp();
+
+            analyzer = new MockAnalyzer(Random()); // Kept for building queries; the writer below gets its own MockAnalyzer.
+            directory = NewDirectory();
+            RandomIndexWriter writer = new RandomIndexWriter(Random(), directory, NewIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(Random())).SetMergePolicy(NewLogMergePolicy()));
+
+            // Add a series of docs with misspelt names
+            AddDoc(writer, "jonathon smythe", "1");
+            AddDoc(writer, "jonathan smith", "2");
+            AddDoc(writer, "johnathon smyth", "3");
+            AddDoc(writer, "johnny smith", "4");
+            AddDoc(writer, "jonny smith", "5");
+            AddDoc(writer, "johnathon smythe", "6");
+            reader = writer.Reader;
+            writer.Dispose();
+            searcher = NewSearcher(reader);
+        }
+        // Releases the reader and the directory created in SetUp, then runs the base tear-down.
+        public override void TearDown()
+        {
+            reader.Dispose();
+            directory.Dispose();
+            base.TearDown();
+        }
+        // Adds one document with two stored text fields, "name" and "id".
+        private void AddDoc(RandomIndexWriter writer, string name, string id)
+        {
+            Document doc = new Document();
+            doc.Add(NewTextField("name", name, Field.Store.YES));
+            doc.Add(NewTextField("id", id, Field.Store.YES));
+            writer.AddDocument(doc);
+        }
+
+
+        // Tests that idf ranking does not favour rare mis-spellings over a strong edit-distance match.
+        [Test]
+        public void TestClosestEditDistanceMatchComesFirst()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("smith", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.IndexReader); // Rewritten form is only used to inspect the extracted terms.
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe")));
+            assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
+            assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar not most rare variant", "2", doc.Get("id"));
+        }
+
+        // Tests that variants are produced for each of multiple input words.
+        [Test]
+        public void TestMultiWord()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("jonathin smoth", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.IndexReader);
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
+            assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar when using 2 words", "2", doc.Get("id"));
+        }
+
+        // LUCENE-4809: terms added for a field that is not in the index must not break the rewrite.
+        [Test]
+        public void TestNonExistingField()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("jonathin smoth", "name", 0.3f, 1);
+            flt.AddTerms("jonathin smoth", "this field does not exist", 0.3f, 1);
+            // don't fail here just because the field doesn't exist
+            Query q = flt.Rewrite(searcher.IndexReader);
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
+            assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar when using 2 words", "2", doc.Get("id"));
+        }
+
+
+        // Regression test for a bug found when the first query word does not match anything.
+        [Test]
+        public void TestNoMatchFirstWordBug()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("fernando smith", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.IndexReader);
+            HashSet<Term> queryTerms = new HashSet<Term>();
+            q.ExtractTerms(queryTerms);
+            assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.ScoreDocs;
+            assertTrue("score docs must match 1 doc", (sd != null) && (sd.Length > 0));
+            Document doc = searcher.Doc(sd[0].Doc);
+            assertEquals("Should match most similar when using 2 words", "2", doc.Get("id"));
+        }
+        // Two queries built from the same analyzer and the same AddTerms arguments must compare equal.
+        [Test]
+        public void TestFuzzyLikeThisQueryEquals()
+        {
+            Analyzer analyzer = new MockAnalyzer(Random()); // Local that hides the analyzer field for this test.
+            FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
+            fltq1.AddTerms("javi", "subject", 0.5f, 2);
+            FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
+            fltq2.AddTerms("javi", "subject", 0.5f, 2);
+            assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1,
+                fltq2);
+        }
+    }
+}


Mime
View raw message