lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d...@apache.org
Subject svn commit: r918703 [1/2] - in /lucene/lucene.net/trunk/C#/contrib/Queries.Net: ./ Queries.Net/ Queries.Net/Properties/ Queries.Net/Similar/ Test/ Test/Properties/ Test/Similar/
Date Wed, 03 Mar 2010 21:31:21 GMT
Author: digy
Date: Wed Mar  3 21:31:20 2010
New Revision: 918703

URL: http://svn.apache.org/viewvc?rev=918703&view=rev
Log:
LUCENENET-347 [Contrib] Port of Queries (Initial Port)

Added:
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThis.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThisQuery.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/SimilarityQueries.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/package.html
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Support.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/TermsFilter.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BooleanFilterTest.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BoostingQueryTest.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/DuplicateFilterTest.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/FuzzyLikeThisQueryTest.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Properties/
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Properties/AssemblyInfo.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Similar/
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Similar/TestMoreLikeThis.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/TermsFilterTest.cs
    lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Test.csproj

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net.sln?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net.sln Wed Mar  3 21:31:20 2010
@@ -0,0 +1,26 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C# Express 2008
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Queries.Net", "Queries.Net\Queries.Net.csproj", "{481CF6E3-52AF-4621-9DEB-022122079AF6}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Test", "Test\Test.csproj", "{8685A826-9B7A-42C8-88F3-EEE6B41D6D81}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{481CF6E3-52AF-4621-9DEB-022122079AF6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{481CF6E3-52AF-4621-9DEB-022122079AF6}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{481CF6E3-52AF-4621-9DEB-022122079AF6}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{481CF6E3-52AF-4621-9DEB-022122079AF6}.Release|Any CPU.Build.0 = Release|Any CPU
+		{8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{8685A826-9B7A-42C8-88F3-EEE6B41D6D81}.Release|Any CPU.Build.0 = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/BooleanFilter.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BooleanFilter.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search
+{
+    /// <summary>
+    /// Placeholder for the BooleanFilter port. The type has no implementation yet,
+    /// so any attempt to instantiate it fails.
+    /// </summary>
+    internal class BooleanFilter
+    {
+        /// <summary>Always throws; the class is not implemented yet.</summary>
+        /// <exception cref="NotImplementedException">Thrown unconditionally.</exception>
+        public BooleanFilter()
+        {
+            String message = "Not implemented yet.";
+            throw new NotImplementedException(message);
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/BoostingQuery.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/BoostingQuery.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+
+namespace Lucene.Net.Search
+{
+    /// <summary>
+    ///  The BoostingQuery class can be used to effectively demote results that match a given query.
+    ///  Unlike the "NOT" clause, this still selects documents that contain undesirable terms,
+    ///  but reduces their overall score:
+    ///
+    ///      Query balancedQuery = new BoostingQuery(positiveQuery, negativeQuery, 0.01f);
+    ///
+    ///  In this scenario the positiveQuery contains the mandatory, desirable criteria which is used to
+    ///  select all matching documents, and the negativeQuery contains the undesirable elements which
+    ///  are simply used to lessen the scores. Documents that match the negativeQuery have their score
+    ///  multiplied by the supplied "boost" parameter, so this should be less than 1 to achieve a
+    ///  demoting effect.
+    ///
+    ///  This code was originally made available here: [WWW] http://marc.theaimsgroup.com/?l=lucene-user&amp;m=108058407130459&amp;w=2
+    ///  and is documented here: http://wiki.apache.org/lucene-java/CommunityContributions
+    /// </summary>
+    public class BoostingQuery : Query
+    {
+        private float boost;                            // factor returned by Coord when both clauses match
+        private Query match;                            // query that selects the documents
+        private Query context;                          // zero-boosted clone of the demoting query
+
+        /// <summary>
+        /// Builds a query that selects with <paramref name="match"/> and multiplies the score of
+        /// documents that also match <paramref name="context"/> by <paramref name="boost"/>.
+        /// </summary>
+        /// <param name="match">query to match</param>
+        /// <param name="context">query whose matches are re-weighted; it is cloned, never modified</param>
+        /// <param name="boost">coordination factor used when both queries match</param>
+        public BoostingQuery(Query match, Query context, float boost)
+        {
+            // Work on a private copy so that zeroing the boost never touches the caller's query.
+            Query demoted = (Query)context.Clone();
+
+            this.match = match;
+            this.context = demoted;
+            this.boost = boost;
+
+            // A boost of zero means context-only matches contribute nothing to the score.
+            this.context.SetBoost(0.0f);
+        }
+
+        /// <summary>
+        /// Rewrites into a two-clause boolean query: <c>match</c> is required (MUST) and
+        /// <c>context</c> is optional (SHOULD). The boolean query supplies its own similarity
+        /// so that the coordination factor carries the boost.
+        /// </summary>
+        public override Query Rewrite(IndexReader reader)
+        {
+            BooleanQuery combined = new AnonymousBooleanQuery(boost);
+            combined.Add(match, BooleanClause.Occur.MUST);
+            combined.Add(context, BooleanClause.Occur.SHOULD);
+            return combined;
+        }
+
+        /// <summary>BooleanQuery whose similarity is an AnonymousDefaultSimilarity built from the boost.</summary>
+        class AnonymousBooleanQuery : BooleanQuery
+        {
+            float boost;
+
+            public AnonymousBooleanQuery(float boost)
+            {
+                this.boost = boost;
+            }
+
+            public override Similarity GetSimilarity(Searcher searcher)
+            {
+                return new AnonymousDefaultSimilarity(boost);
+            }
+        }
+
+        /// <summary>DefaultSimilarity whose Coord depends only on how many clauses matched.</summary>
+        class AnonymousDefaultSimilarity : DefaultSimilarity
+        {
+            float boost;
+
+            public AnonymousDefaultSimilarity(float boost)
+            {
+                this.boost = boost;
+            }
+
+            public override float Coord(int overlap, int max)
+            {
+                if (overlap == 1)
+                {
+                    // matched only one clause: use the score as-is
+                    return 1.0f;
+                }
+                if (overlap == 2)
+                {
+                    // matched both clauses: multiply by boost
+                    return boost;
+                }
+                return 0.0f;
+            }
+        }
+
+        /// <summary>Raw 32-bit pattern of a float, used for hashing and exact comparison.</summary>
+        private static int FloatBits(float value)
+        {
+            return BitConverter.ToInt32(BitConverter.GetBytes(value), 0);
+        }
+
+        /// <summary>Null-tolerant equality between two queries.</summary>
+        private static bool SameQuery(Query left, Query right)
+        {
+            return (left == null) ? (right == null) : left.Equals(right);
+        }
+
+        public override int GetHashCode()
+        {
+            int contextHash = (context == null) ? 0 : context.GetHashCode();
+            int matchHash = (match == null) ? 0 : match.GetHashCode();
+
+            int hash = 31 + FloatBits(boost);
+            hash = 31 * hash + contextHash;
+            hash = 31 * hash + matchHash;
+            return hash;
+        }
+
+        public override bool Equals(Object obj)
+        {
+            if (Object.ReferenceEquals(this, obj))
+            {
+                return true;
+            }
+            if (obj == null || this.GetType() != obj.GetType())
+            {
+                return false;
+            }
+
+            BoostingQuery that = (BoostingQuery)obj;
+            return FloatBits(boost) == FloatBits(that.boost)
+                && SameQuery(context, that.context)
+                && SameQuery(match, that.match);
+        }
+
+        /// <summary>Renders as "match/context" for the given field.</summary>
+        public override String ToString(String field)
+        {
+            String matchText = match.ToString(field);
+            String contextText = context.ToString(field);
+            return matchText + "/" + contextText;
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/DuplicateFilter.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,247 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Search;
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search
+{
+    /// <summary>
+    /// Filter that, for every term of a given field, keeps a single document (the first or
+    /// the last one enumerated for that term) and treats the remaining documents carrying
+    /// the same term as duplicates.
+    /// </summary>
+    public class DuplicateFilter : Filter
+    {
+
+        String fieldName;
+
+        /// <summary>
+        /// KeepMode determines which document id to consider as the master, all others being
+        /// identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
+        /// </summary>
+        int keepMode = KM_USE_FIRST_OCCURRENCE;
+        public const int KM_USE_FIRST_OCCURRENCE = 1;
+        public const int KM_USE_LAST_OCCURRENCE = 2;
+
+        /// <summary>
+        /// "Full" processing mode starts by setting all bits to false and only setting bits
+        /// for documents that contain the given field and are identified as non-duplicates.
+        ///
+        /// "Fast" processing sets all bits to true then unsets all duplicate docs found for the
+        /// given field. This approach avoids the need to read TermDocs for terms that are seen
+        /// to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
+        /// faster approach, the downside is that bitsets produced will include bits set for
+        /// documents that do not actually contain the field given.
+        /// </summary>
+        int processingMode = PM_FULL_VALIDATION;
+        public const int PM_FULL_VALIDATION = 1;
+        public const int PM_FAST_INVALIDATION = 2;
+
+        /// <summary>
+        /// Creates a filter on <paramref name="fieldName"/> that keeps the last occurrence
+        /// and uses full validation.
+        /// </summary>
+        public DuplicateFilter(String fieldName) : this(fieldName, KM_USE_LAST_OCCURRENCE, PM_FULL_VALIDATION)
+        {
+        }
+
+        /// <summary>Creates a filter with explicit keep and processing modes.</summary>
+        /// <param name="fieldName">field whose terms identify duplicates</param>
+        /// <param name="keepMode">KM_USE_FIRST_OCCURRENCE or KM_USE_LAST_OCCURRENCE</param>
+        /// <param name="processingMode">PM_FULL_VALIDATION or PM_FAST_INVALIDATION</param>
+        public DuplicateFilter(String fieldName, int keepMode, int processingMode)
+        {
+            this.fieldName = fieldName;
+            this.keepMode = keepMode;
+            this.processingMode = processingMode;
+        }
+
+        /// <summary>
+        /// Returns the bit set of non-duplicate documents, computed by FastBits when the
+        /// processing mode is PM_FAST_INVALIDATION and by CorrectBits otherwise.
+        /// </summary>
+        public override DocIdSet GetDocIdSet(IndexReader reader)
+        {
+            if (processingMode == PM_FAST_INVALIDATION)
+            {
+                return FastBits(reader);
+            }
+            else
+            {
+                return CorrectBits(reader);
+            }
+        }
+
+        /// <summary>
+        /// Starts from an empty bit set and sets exactly one bit per term of the field:
+        /// the first or the last document enumerated for the term, depending on keepMode.
+        /// </summary>
+        private OpenBitSet CorrectBits(IndexReader reader)
+        {
+            OpenBitSet bits = new OpenBitSet(reader.MaxDoc()); //assume all are INvalid
+            Term startTerm = new Term(fieldName);
+            TermEnum te = reader.Terms(startTerm);
+            if (te == null)
+            {
+                return bits;
+            }
+
+            try
+            {
+                Term currTerm = te.Term();
+                while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) //term fieldnames are interned
+                {
+                    //set non duplicates
+                    TermDocs td = reader.TermDocs(currTerm);
+                    try
+                    {
+                        if (td.Next())
+                        {
+                            if (keepMode == KM_USE_FIRST_OCCURRENCE)
+                            {
+                                bits.Set(td.Doc());
+                            }
+                            else
+                            {
+                                int lastDoc;
+                                do
+                                {
+                                    lastDoc = td.Doc();
+                                } while (td.Next());
+                                bits.Set(lastDoc);
+                            }
+                        }
+                    }
+                    finally
+                    {
+                        // Release the postings enumerator even if reading it throws.
+                        td.Close();
+                    }
+
+                    if (!te.Next())
+                    {
+                        break;
+                    }
+                    currTerm = te.Term();
+                }
+            }
+            finally
+            {
+                // Release the term enumerator on every exit path.
+                te.Close();
+            }
+            return bits;
+        }
+
+        /// <summary>
+        /// Starts from a fully set bit set and clears the duplicates of every term whose
+        /// document frequency is greater than one. Documents that do not contain the field
+        /// keep their bit set.
+        /// </summary>
+        private OpenBitSet FastBits(IndexReader reader)
+        {
+            OpenBitSet bits = new OpenBitSet(reader.MaxDoc());
+            bits.Set(0, reader.MaxDoc()); //assume all are valid
+            Term startTerm = new Term(fieldName);
+            TermEnum te = reader.Terms(startTerm);
+            if (te == null)
+            {
+                return bits;
+            }
+
+            try
+            {
+                Term currTerm = te.Term();
+                while ((currTerm != null) && (currTerm.Field() == startTerm.Field())) //term fieldnames are interned
+                {
+                    if (te.DocFreq() > 1)
+                    {
+                        //unset potential duplicates
+                        TermDocs td = reader.TermDocs(currTerm);
+                        try
+                        {
+                            // Only read Doc() after a successful Next(): the enumeration may yield
+                            // fewer documents than DocFreq() reports.
+                            // NOTE(review): presumably when documents are deleted - confirm.
+                            bool positioned = td.Next();
+                            if (positioned && keepMode == KM_USE_FIRST_OCCURRENCE)
+                            {
+                                // step past the first occurrence so its bit stays set
+                                positioned = td.Next();
+                            }
+                            if (positioned)
+                            {
+                                int lastDoc;
+                                do
+                                {
+                                    lastDoc = td.Doc();
+                                    bits.Clear(lastDoc);
+                                } while (td.Next());
+                                if (keepMode == KM_USE_LAST_OCCURRENCE)
+                                {
+                                    //restore the last bit
+                                    bits.Set(lastDoc);
+                                }
+                            }
+                        }
+                        finally
+                        {
+                            td.Close();
+                        }
+                    }
+                    if (!te.Next())
+                    {
+                        break;
+                    }
+                    currTerm = te.Term();
+                }
+            }
+            finally
+            {
+                te.Close();
+            }
+            return bits;
+        }
+
+
+        public String GetFieldName()
+        {
+            return fieldName;
+        }
+
+
+        public void SetFieldName(String fieldName)
+        {
+            this.fieldName = fieldName;
+        }
+
+
+        public int GetKeepMode()
+        {
+            return keepMode;
+        }
+
+
+        public void SetKeepMode(int keepMode)
+        {
+            this.keepMode = keepMode;
+        }
+
+
+        /// <summary>
+        /// Two filters are equal when they have the same runtime type, keep mode,
+        /// processing mode and field name (a null field name equals only null).
+        /// </summary>
+        public override bool Equals(Object obj)
+        {
+            if (this == obj)
+                return true;
+            if ((obj == null) || (obj.GetType() != this.GetType()))
+                return false;
+            DuplicateFilter other = (DuplicateFilter)obj;
+            return keepMode == other.keepMode &&
+                processingMode == other.processingMode &&
+                String.Equals(fieldName, other.fieldName);
+        }
+
+
+        /// <summary>
+        /// Hash code over keep mode, processing mode and field name. A null field name
+        /// hashes as 0, matching the null handling in Equals.
+        /// </summary>
+        public override int GetHashCode()
+        {
+            int hash = 217;
+            hash = 31 * hash + keepMode;
+            hash = 31 * hash + processingMode;
+            hash = 31 * hash + ((fieldName == null) ? 0 : fieldName.GetHashCode());
+            return hash;
+        }
+
+
+        public int GetProcessingMode()
+        {
+            return processingMode;
+        }
+
+
+        public void SetProcessingMode(int processingMode)
+        {
+            this.processingMode = processingMode;
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/FilterClause.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FilterClause.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search
+{
+    /// <summary>
+    /// Placeholder for the FilterClause port. The type has no implementation yet,
+    /// so any attempt to instantiate it fails.
+    /// </summary>
+    internal class FilterClause
+    {
+        /// <summary>Always throws; the class is not implemented yet.</summary>
+        /// <exception cref="NotImplementedException">Thrown unconditionally.</exception>
+        public FilterClause()
+        {
+            String message = "Not implemented yet.";
+            throw new NotImplementedException(message);
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/FuzzyLikeThisQuery.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,452 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Search;
+using Lucene.Net.Index;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search
+{
+    /// <summary>
+    /// Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
+    /// In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
+    /// of fuzzy scoring factors.
+    /// This generally produces good results for queries where users may provide details in a number of
+    /// fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
+    /// a fast query.
+    ///
+    /// For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
+    /// we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
+    /// TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
+    /// terms, e.g. misspellings. Instead, all variants use the same IDF ranking (the one for the source query
+    /// term) and this is factored into the variant's boost. If the source query term does not exist in the
+    /// index the average IDF of the variants is used.
+    /// </summary>
+    public class FuzzyLikeThisQuery : Query
+    {
+        static Similarity sim = new DefaultSimilarity();
+        Query rewrittenQuery = null;
+        ArrayList fieldVals = new ArrayList();
+        Analyzer analyzer;
+
+        ScoreTermQueue q;
+        int MAX_VARIANTS_PER_TERM = 50;
+        bool ignoreTF = false;
+        private int maxNumTerms;
+
+        /// <summary>
+        /// Hash code over the analyzer, the contents of the field-value list, the ignoreTF
+        /// flag and maxNumTerms - the same state that Equals compares.
+        /// </summary>
+        /// <returns>a hash code that is equal for queries holding equal field values</returns>
+        public override int GetHashCode()
+        {
+            int prime = 31;
+            int result = 1;
+            result = prime * result + ((analyzer == null) ? 0 : analyzer.GetHashCode());
+
+            // ArrayList does not override GetHashCode, so hashing the list object itself is
+            // identity-based: two queries holding equal field values would hash differently even
+            // though Equals compares the lists through EqualsToArrayList (presumably element by
+            // element - that helper is defined outside this file). Fold the elements' own hash
+            // codes, in order, instead.
+            int fieldValsHash = 0;
+            if (fieldVals != null)
+            {
+                fieldValsHash = 1;
+                foreach (Object fieldVal in fieldVals)
+                {
+                    fieldValsHash = prime * fieldValsHash + ((fieldVal == null) ? 0 : fieldVal.GetHashCode());
+                }
+            }
+            result = prime * result + fieldValsHash;
+
+            result = prime * result + (ignoreTF ? 1231 : 1237);
+            result = prime * result + maxNumTerms;
+            return result;
+        }
+
+        /// <summary>
+        /// Two queries are equal when they have the same runtime type and agree on the
+        /// analyzer, the field-value list, the ignoreTF flag and maxNumTerms.
+        /// </summary>
+        public override bool Equals(Object obj)
+        {
+            if (Object.ReferenceEquals(this, obj))
+            {
+                return true;
+            }
+            if (obj == null || GetType() != obj.GetType())
+            {
+                return false;
+            }
+
+            FuzzyLikeThisQuery that = (FuzzyLikeThisQuery)obj;
+
+            // A null member only matches another null member.
+            bool sameAnalyzer = (analyzer == null)
+                ? (that.analyzer == null)
+                : analyzer.Equals(that.analyzer);
+            if (!sameAnalyzer)
+            {
+                return false;
+            }
+
+            bool sameFieldVals = (fieldVals == null)
+                ? (that.fieldVals == null)
+                : fieldVals.EqualsToArrayList(that.fieldVals);
+            if (!sameFieldVals)
+            {
+                return false;
+            }
+
+            return ignoreTF == that.ignoreTF && maxNumTerms == that.maxNumTerms;
+        }
+
+
+        /// <summary>
+        /// Creates an empty query; the strings to fuzzify are supplied afterwards through AddTerms.
+        /// </summary>
+        /// <param name="maxNumTerms">The total number of terms clauses that will appear once rewritten as a BooleanQuery</param>
+        /// <param name="analyzer">analyzer kept on the query and used to tokenize the added strings</param>
+        public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
+        {
+            this.maxNumTerms = maxNumTerms;
+            this.analyzer = analyzer;
+            this.q = new ScoreTermQueue(maxNumTerms);
+        }
+
+        /// <summary>
+        /// Value holder for one AddTerms request: the text to fuzzify, the field it targets,
+        /// and the fuzzy-matching parameters. Equality and hashing cover all four members.
+        /// </summary>
+        class FieldVals
+        {
+            internal String queryString;
+            internal String fieldName;
+            internal float minSimilarity;
+            internal int prefixLength;
+
+            public FieldVals(String name, float similarity, int length, String queryString)
+            {
+                this.queryString = queryString;
+                this.fieldName = name;
+                this.minSimilarity = similarity;
+                this.prefixLength = length;
+            }
+
+            /// <summary>Raw 32-bit pattern of a float (the port of Java's Float.floatToIntBits used here).</summary>
+            private static int FloatBits(float value)
+            {
+                return BitConverter.ToInt32(BitConverter.GetBytes(value), 0);
+            }
+
+            public override int GetHashCode()
+            {
+                int nameHash = (fieldName == null) ? 0 : fieldName.GetHashCode();
+                int textHash = (queryString == null) ? 0 : queryString.GetHashCode();
+
+                int hash = 31 + nameHash;
+                hash = 31 * hash + FloatBits(minSimilarity);
+                hash = 31 * hash + prefixLength;
+                hash = 31 * hash + textHash;
+                return hash;
+            }
+
+            public override bool Equals(Object obj)
+            {
+                if (Object.ReferenceEquals(this, obj))
+                {
+                    return true;
+                }
+                if (obj == null || GetType() != obj.GetType())
+                {
+                    return false;
+                }
+
+                FieldVals that = (FieldVals)obj;
+                // Similarities are compared by bit pattern rather than with ==.
+                return String.Equals(fieldName, that.fieldName)
+                    && FloatBits(minSimilarity) == FloatBits(that.minSimilarity)
+                    && prefixLength == that.prefixLength
+                    && String.Equals(queryString, that.queryString);
+            }
+        }
+
+        /// <summary>
+        /// Adds user input for "fuzzification".
+        /// </summary>
+        /// <param name="queryString">The string which will be parsed by the analyzer and for which fuzzy variants will be parsed</param>
+        /// <param name="fieldName">name of the field the terms belong to</param>
+        /// <param name="minSimilarity">The minimum similarity of the term variants (see FuzzyTermEnum)</param>
+        /// <param name="prefixLength">Length of required common prefix on variant terms (see FuzzyTermEnum)</param>
+        public void AddTerms(String queryString, String fieldName, float minSimilarity, int prefixLength)
+        {
+            // Only record the request here; nothing is analyzed in this method.
+            FieldVals request = new FieldVals(fieldName, minSimilarity, prefixLength, queryString);
+            fieldVals.Add(request);
+        }
+
+
+        /**
+         * Analyzes the query string of a single field registration and, for every
+         * distinct token, queues the best-scoring fuzzy variants found in the index.
+         *
+         * Variants of one token are ranked by FuzzyTermEnum.Difference() in a queue
+         * created with capacity MAX_VARIANTS_PER_TERM. Their scores are then replaced
+         * by (score * score) * idf before they are inserted into the shared queue "q".
+         * The idf uses the document frequency of the exact token when the index
+         * contains it, otherwise the average document frequency of all variants seen.
+         *
+         * The token stream and both term enumerations are closed before returning,
+         * including when an exception is thrown part-way through.
+         *
+         * @param reader Index whose terms are enumerated
+         * @param f Field name, query string and fuzzy settings to expand; nothing is done when its queryString is null
+         */
+        private void AddTerms(IndexReader reader, FieldVals f)
+        {
+            if (f.queryString == null) return;
+            TokenStream ts = analyzer.TokenStream(f.fieldName, new System.IO.StringReader(f.queryString));
+            try
+            {
+                TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));
+
+                int corpusNumDocs = reader.NumDocs();
+                Term internSavingTemplateTerm = new Term(f.fieldName); //optimization to avoid constructing new Term() objects
+                Hashtable processedTerms = new Hashtable();
+                while (ts.IncrementToken())
+                {
+                    String term = termAtt.Term();
+                    if (!processedTerms.Contains(term))
+                    {
+                        processedTerms.Add(term, term);
+                        ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
+                        float minScore = 0;
+                        Term startTerm = internSavingTemplateTerm.CreateTerm(term);
+                        int df = 0;
+                        int numVariants = 0;
+                        int totalVariantDocFreqs = 0;
+                        FuzzyTermEnum fe = new FuzzyTermEnum(reader, startTerm, f.minSimilarity, f.prefixLength);
+                        try
+                        {
+                            TermEnum origEnum = reader.Terms(startTerm);
+                            try
+                            {
+                                if (startTerm.Equals(origEnum.Term()))
+                                {
+                                    df = origEnum.DocFreq(); //store the df so all variants use same idf
+                                }
+                            }
+                            finally
+                            {
+                                // only needed for the single lookup above
+                                origEnum.Close();
+                            }
+
+                            do
+                            {
+                                Term possibleMatch = fe.Term();
+                                if (possibleMatch != null)
+                                {
+                                    numVariants++;
+                                    totalVariantDocFreqs += fe.DocFreq();
+                                    float score = fe.Difference();
+                                    if (variantsQ.Size() < MAX_VARIANTS_PER_TERM || score > minScore)
+                                    {
+                                        ScoreTerm st = new ScoreTerm(possibleMatch, score, startTerm);
+                                        variantsQ.Insert(st);
+                                        minScore = ((ScoreTerm)variantsQ.Top()).score; // maintain minScore
+                                    }
+                                }
+                            }
+                            while (fe.Next());
+                        }
+                        finally
+                        {
+                            // release the underlying term enumeration for this token
+                            fe.Close();
+                        }
+
+                        if (numVariants > 0)
+                        {
+                            int avgDf = totalVariantDocFreqs / numVariants;
+                            if (df == 0)//no direct match we can use as df for all variants 
+                            {
+                                df = avgDf; //use avg df of all variants
+                            }
+
+                            // take the top variants (scored by edit distance) and reset the score
+                            // to include an IDF factor then add to the global queue for ranking
+                            // overall top query terms
+                            int size = variantsQ.Size();
+                            for (int i = 0; i < size; i++)
+                            {
+                                ScoreTerm st = (ScoreTerm)variantsQ.Pop();
+                                st.score = (st.score * st.score) * sim.Idf(df, corpusNumDocs);
+                                q.Insert(st);
+                            }
+                        }
+                    }
+                }
+            }
+            finally
+            {
+                // the analyzer's stream wraps the StringReader created above
+                ts.Close();
+            }
+        }
+
+        /**
+         * Expands every registered field value into fuzzy term variants and combines
+         * them into a single BooleanQuery.
+         * The result is cached: once a rewritten query exists it is returned as-is,
+         * whatever reader is passed in. The list of registered field values is
+         * cleared as part of the first rewrite.
+         * @param reader Index used to look up term variants
+         */
+        public override Query Rewrite(IndexReader reader)
+        {
+            if (rewrittenQuery != null)
+            {
+                return rewrittenQuery;
+            }
+
+            //load up the list of possible terms, then clear the list of fields
+            foreach (FieldVals pending in fieldVals)
+            {
+                AddTerms(reader, pending);
+            }
+            fieldVals.Clear();
+
+            //create BooleanQueries to hold the variants for each token/field pair and ensure it
+            // has no coord factor
+            //Step 1: sort the termqueries by term/field
+            Hashtable variantsBySource = new Hashtable();
+            for (int remaining = q.Size(); remaining > 0; remaining--)
+            {
+                ScoreTerm scored = (ScoreTerm)q.Pop();
+                ArrayList bucket = (ArrayList)variantsBySource[scored.fuzziedSourceTerm];
+                if (bucket == null)
+                {
+                    bucket = new ArrayList();
+                    variantsBySource.Add(scored.fuzziedSourceTerm, bucket);
+                }
+                bucket.Add(scored);
+            }
+
+            //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
+            BooleanQuery result = new BooleanQuery();
+            foreach (ArrayList bucket in variantsBySource.Values)
+            {
+                if (bucket.Count == 1)
+                {
+                    //optimize where only one selected variant
+                    ScoreTerm only = (ScoreTerm)bucket[0];
+                    TermQuery single = new FuzzyTermQuery(only.term, ignoreTF);
+                    single.SetBoost(only.score); // set the boost to a mix of IDF and score
+                    result.Add(single, BooleanClause.Occur.SHOULD);
+                }
+                else
+                {
+                    //disable coord and IDF for these term variants
+                    BooleanQuery termVariants = new BooleanQuery(true);
+                    foreach (ScoreTerm variant in bucket)
+                    {
+                        TermQuery clause = new FuzzyTermQuery(variant.term, ignoreTF);
+                        clause.SetBoost(variant.score); // set the boost using the ScoreTerm's score
+                        termVariants.Add(clause, BooleanClause.Occur.SHOULD);
+                    }
+                    result.Add(termVariants, BooleanClause.Occur.SHOULD);
+                }
+            }
+
+            //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+            // booleans with a minimum-should-match of NumFields-1?
+            result.SetBoost(GetBoost());
+            this.rewrittenQuery = result;
+            return result;
+        }
+
+        //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
+        // term variants) then is reset with IDF for use in ranking against all other
+        // terms/fields
+        private class ScoreTerm
+        {
+            // The variant term found in the index.
+            public Term term;
+            // Ranking score; mutable because the private AddTerms overwrites it with an
+            // IDF-weighted value after the per-token ranking pass.
+            public float score;
+            // The term built from the analyzed input token this variant was derived from;
+            // Rewrite groups variants by this value.
+            internal Term fuzziedSourceTerm;
+
+            // Stores the three values as given; no validation or copying is performed.
+            public ScoreTerm(Term term, float score, Term fuzziedSourceTerm)
+            {
+                this.term = term;
+                this.score = score;
+                this.fuzziedSourceTerm = fuzziedSourceTerm;
+            }
+        }
+
+        // Priority queue of ScoreTerm entries ordered by score, with the term
+        // comparison used to break ties between equal scores.
+        private class ScoreTermQueue : PriorityQueue
+        {
+            // Creates a queue initialized for the given number of entries.
+            public ScoreTermQueue(int size)
+            {
+                Initialize(size);
+            }
+
+            /**
+             * Decides whether entry "a" ranks below entry "b".
+             * Entries with different scores are ordered by score; entries with the
+             * same score are ordered by their terms, the one that compares greater
+             * being treated as the lesser entry.
+             */
+            public override bool LessThan(Object a, Object b)
+            {
+                ScoreTerm left = (ScoreTerm)a;
+                ScoreTerm right = (ScoreTerm)b;
+                if (left.score != right.score)
+                {
+                    return left.score < right.score;
+                }
+                return left.term.CompareTo(right.term) > 0;
+            }
+
+        }
+
+        //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
+        private class FuzzyTermQuery : TermQuery
+        {
+            // When true, term frequency is replaced by a constant 1 during scoring.
+            private readonly bool ignoreTF;
+
+            public FuzzyTermQuery(Term t, bool ignoreTF) : base(t)
+            {
+                this.ignoreTF = ignoreTF;
+            }
+
+            /**
+             * Wraps the similarity supplied by the base TermQuery so that Idf (and
+             * optionally Tf) are neutralized; every other similarity method is left
+             * to the wrapped instance.
+             */
+            public override Similarity GetSimilarity(Searcher searcher)
+            {
+                return new AnonymousSimilarityDelegator(this, base.GetSimilarity(searcher));
+            }
+
+            // Stand-in for the anonymous SimilarityDelegator subclass of the Java original.
+            // Only Tf and Idf are overridden; all remaining methods use the
+            // SimilarityDelegator implementations unchanged.
+            private class AnonymousSimilarityDelegator : SimilarityDelegator
+            {
+                private readonly FuzzyTermQuery parent;
+
+                public AnonymousSimilarityDelegator(FuzzyTermQuery parent, Similarity result) : base(result)
+                {
+                    this.parent = parent;
+                }
+
+                public override float Tf(float freq)
+                {
+                    if (parent.ignoreTF)
+                    {
+                        return 1; //ignore tf
+                    }
+                    return base.Tf(freq);
+                }
+
+                public override float Idf(int docFreq, int numDocs)
+                {
+                    //IDF is already factored into individual term boosts
+                    return 1;
+                }
+            }
+
+        }
+
+
+        /* (non-Javadoc)
+         * @see org.apache.lucene.search.Query#toString(java.lang.String)
+         *
+         * No textual form is produced for this query: the field argument is ignored
+         * and the method always returns null.
+         * NOTE(review): callers that log or concatenate the result must tolerate
+         * null; confirm whether a descriptive string should be returned instead.
+         */
+        public override String ToString(String field)
+        {
+            return null;
+        }
+
+
+        // Returns the current ignoreTF flag. Rewrite passes this flag to every
+        // FuzzyTermQuery it creates, where a true value makes Tf return a constant 1.
+        public bool IsIgnoreTF()
+        {
+            return ignoreTF;
+        }
+
+
+        // Sets the ignoreTF flag. The flag is read when Rewrite builds its term
+        // queries, and Rewrite returns its cached result on later calls, so a change
+        // made after the first rewrite does not alter the cached query.
+        public void SetIgnoreTF(bool ignoreTF)
+        {
+            this.ignoreTF = ignoreTF;
+        }
+
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs
(added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Properties/AssemblyInfo.cs
Wed Mar  3 21:31:20 2010
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// Assembly-level metadata for the Queries.Net contrib library.
+//
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+// NOTE(review): AssemblyTrademark below carries the same text as
+// AssemblyCopyright; confirm that this is intended.
+[assembly: AssemblyTitle("Queries.Net(Apache Lucene.Net)")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Queries.Net")]
+[assembly: AssemblyCopyright("Copyright 2006 - 2010 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2006 - 2010 The Apache Software Foundation")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components.  If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("6107399b-3ded-4abc-ab60-9e41754258e1")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+// Both the assembly version and the file version are pinned to 2.9.2 here.
+[assembly: AssemblyVersion("2.9.2")]
+[assembly: AssemblyFileVersion("2.9.2")]

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Queries.Net.csproj?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Queries.Net.csproj Wed Mar
 3 21:31:20 2010
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProductVersion>9.0.21022</ProductVersion>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{481CF6E3-52AF-4621-9DEB-022122079AF6}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Search</RootNamespace>
+    <AssemblyName>Queries.Net</AssemblyName>
+    <TargetFrameworkVersion>v3.5</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <!-- Assembly references. Lucene.Net is resolved through the HintPath below; the rest are framework assemblies. -->
+    <Reference Include="Lucene.Net, Version=2.9.2.1, Culture=neutral, processorArchitecture=MSIL">
+      <SpecificVersion>False</SpecificVersion>
+      <!-- NOTE(review): this relative path leaves the project tree and points into a "Work for 2.9" directory; it looks specific to one working copy. Confirm it resolves in a clean checkout. -->
+      <HintPath>..\..\..\..\DotNet\Work for 2.9\src\Test\bin\Release\Lucene.Net.dll</HintPath>
+    </Reference>
+    <Reference Include="System" />
+    <Reference Include="System.Core">
+      <RequiredTargetFramework>3.5</RequiredTargetFramework>
+    </Reference>
+    <Reference Include="System.Xml.Linq">
+      <RequiredTargetFramework>3.5</RequiredTargetFramework>
+    </Reference>
+    <Reference Include="System.Data.DataSetExtensions">
+      <RequiredTargetFramework>3.5</RequiredTargetFramework>
+    </Reference>
+    <Reference Include="System.Data" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="BooleanFilter.cs" />
+    <Compile Include="BoostingQuery.cs" />
+    <Compile Include="DuplicateFilter.cs" />
+    <Compile Include="FilterClause.cs" />
+    <Compile Include="FuzzyLikeThisQuery.cs" />
+    <Compile Include="Similar\MoreLikeThis.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="Similar\MoreLikeThisQuery.cs" />
+    <Compile Include="Similar\SimilarityQueries.cs" />
+    <Compile Include="Support.cs" />
+    <Compile Include="TermsFilter.cs" />
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file



Mime
View raw message