lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From d...@apache.org
Subject svn commit: r918703 [2/2] - in /lucene/lucene.net/trunk/C#/contrib/Queries.Net: ./ Queries.Net/ Queries.Net/Properties/ Queries.Net/Similar/ Test/ Test/Properties/ Test/Similar/
Date Wed, 03 Mar 2010 21:31:21 GMT
Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThis.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThis.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThis.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThis.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,1013 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using PriorityQueue = Lucene.Net.Util.PriorityQueue;
+using IndexReader = Lucene.Net.Index.IndexReader;
+using Term = Lucene.Net.Index.Term;
+using TermFreqVector = Lucene.Net.Index.TermFreqVector;
+using BooleanClause = Lucene.Net.Search.BooleanClause;
+using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity;
+using TermQuery = Lucene.Net.Search.TermQuery;
+using BooleanQuery = Lucene.Net.Search.BooleanQuery;
+using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+using Query = Lucene.Net.Search.Query;
+using Hits = Lucene.Net.Search.Hits;
+using Analyzer = Lucene.Net.Analysis.Analyzer;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
+using Document = Lucene.Net.Documents.Document;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Search.Similar
+{
+
+
+    /// <summary> Generate "more like this" similarity queries. 
+    /// Based on this mail:
+    /// <code><pre>
+    /// Lucene does let you access the document frequency of terms, with IndexReader.DocFreq().
+    /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
+    /// is usually fast enough.  But looking up the DocFreq() of every term in the document is
+    /// probably too slow.
+    /// 
+    /// You can use some heuristics to prune the set of terms, to avoid calling DocFreq() too much,
+    /// or at all.  Since you're trying to maximize a tf*idf score, you're probably most interested
+    /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
+    /// reduce the number of terms under consideration.  Another heuristic is that terms with a
+    /// high idf (i.e., a low df) tend to be longer.  So you could threshold the terms by the
+    /// number of characters, not selecting anything less than, e.g., six or seven characters.
+    /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
+    /// that do a pretty good job of characterizing a document.
+    /// 
+    /// It all depends on what you're trying to do.  If you're trying to eke out that last percent
+    /// of precision and recall regardless of computational difficulty so that you can win a TREC
+    /// competition, then the techniques I mention above are useless.  But if you're trying to
+    /// provide a "more like this" button on a search results page that does a decent job and has
+    /// good performance, such techniques might be useful.
+    /// 
+    /// An efficient, effective "more-like-this" query generator would be a great contribution, if
+    /// anyone's interested.  I'd imagine that it would take a Reader or a String (the document's
+    /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those
+    /// above.  The frequency and length thresholds could be parameters, etc.
+    /// 
+    /// Doug
+    /// </pre></code>
+    /// 
+    /// 
+    /// <p>
+    /// <h3>Initial Usage</h3>
+    /// 
+    /// This class has lots of options to try to make it efficient and flexible.
+    /// See the body of {@link #main Main()} below in the source for real code, or
+    /// if you want pseudo code, the simplest possible usage is as follows. The bold
+    /// fragment is specific to this class.
+    /// 
+    /// <code><pre>
+    /// 
+    /// IndexReader ir = ...
+    /// IndexSearcher is = ...
+    /// <b>
+    /// MoreLikeThis mlt = new MoreLikeThis(ir);
+    /// Reader target = ... </b><em>// orig source of doc you want to find similarities to</em><b>
+    /// Query query = mlt.Like( target);
+    /// </b>
+    /// Hits hits = is.Search(query);
+    /// <em>// now the usual iteration thru 'hits' - the only thing to watch for is to make sure
+    /// you ignore the doc if it matches your 'target' document, as it should be similar to itself </em>
+    /// 
+    /// </pre></code>
+    /// 
+    /// Thus you:
+    /// <ol>
+    /// <li> do your normal, Lucene setup for searching,
+    /// <li> create a MoreLikeThis,
+    /// <li> get the text of the doc you want to find similarities to
+    /// <li> then call one of the Like() calls to generate a similarity query
+    /// <li> call the searcher to find the similar docs
+    /// </ol>
+    /// 
+    /// <h3>More Advanced Usage</h3>
+    /// 
+    /// You may want to use {@link #SetFieldNames SetFieldNames(...)} so you can examine
+    /// multiple fields (e.g. body and title) for similarity.
+    /// <p>
+    /// 
+    /// Depending on the size of your index and the size and makeup of your documents you
+    /// may want to call the other set methods to control how the similarity queries are
+    /// generated:
+    /// <ul>
+    /// <li> {@link #SetMinTermFreq SetMinTermFreq(...)} </li>
+    /// <li> {@link #SetMinDocFreq SetMinDocFreq(...)} </li>
+    /// <li> {@link #SetMinWordLen SetMinWordLen(...)} </li>
+    /// <li> {@link #SetMaxWordLen SetMaxWordLen(...)}</li>
+    /// <li> {@link #SetMaxQueryTerms SetMaxQueryTerms(...)}</li>
+    /// <li> {@link #SetMaxNumTokensParsed SetMaxNumTokensParsed(...)}</li>
+    /// <li> {@link #SetStopWords SetStopWords(...)} </li>
+    /// </ul> 
+    /// 
+    /// <hr/>
+    /// <pre>
+    /// Changes: Mark Harwood 29/02/04
+    /// Some bugfixing, some refactoring, some optimisation.
+    /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
+    /// - bugfix: No significant terms being created for fields with a termvector - because 
+    /// was only counting one occurrence per term/field pair in calculations (i.e. not including frequency info from TermVector) 
+    /// - refactor: moved common code into isNoiseWord()
+    /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
+    /// </pre>
+    /// 
+    public sealed class MoreLikeThis
+    {
+
+        /// <summary> Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
+        /// <seealso cref="#getMaxNumTokensParsed">
+        /// </seealso>
+        public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+
+
+        /// <summary> Default analyzer to parse source doc with.</summary>
+        /// <seealso cref="#getAnalyzer">
+        /// </seealso>
+        public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();
+
+        /// <summary> Ignore terms with less than this frequency in the source doc.</summary>
+        /// <seealso cref="#getMinTermFreq">
+        /// </seealso>
+        /// <seealso cref="#setMinTermFreq">
+        /// </seealso>
+        public const int DEFAULT_MIN_TERM_FREQ = 2;
+
+        /// <summary> Ignore words which do not occur in at least this many docs.</summary>
+        /// <seealso cref="#getMinDocFreq">
+        /// </seealso>
+        /// <seealso cref="#setMinDocFreq">
+        /// </seealso>
+        // NOTE(review): "DEFALT" is a misspelling of "DEFAULT" inherited from the
+        // Java original; kept as-is because renaming a public constant would break callers.
+        public const int DEFALT_MIN_DOC_FREQ = 5;
+
+        /// <summary> Boost terms in query based on score.</summary>
+        /// <seealso cref="#isBoost">
+        /// </seealso>
+        /// <seealso cref="#SetBoost">
+        /// </seealso>
+        public const bool DEFAULT_BOOST = false;
+
+        /// <summary> Default field names. Null is used to specify that the field names should be looked
+        /// up at runtime from the provided reader.
+        /// </summary>
+        public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[] { "contents" };
+
+        /// <summary> Ignore words less than this length or if 0 then this has no effect.</summary>
+        /// <seealso cref="#getMinWordLen">
+        /// </seealso>
+        /// <seealso cref="#setMinWordLen">
+        /// </seealso>
+        public const int DEFAULT_MIN_WORD_LENGTH = 0;
+
+        /// <summary> Ignore words greater than this length or if 0 then this has no effect.</summary>
+        /// <seealso cref="#getMaxWordLen">
+        /// </seealso>
+        /// <seealso cref="#setMaxWordLen">
+        /// </seealso>
+        public const int DEFAULT_MAX_WORD_LENGTH = 0;
+
+        /// <summary> Default set of stopwords.
+        /// If null means to allow stop words.
+        /// 
+        /// </summary>
+        /// <seealso cref="#setStopWords">
+        /// </seealso>
+        /// <seealso cref="#getStopWords">
+        /// </seealso>
+        public static readonly System.Collections.Hashtable DEFAULT_STOP_WORDS = null;
+
+        /// <summary> Current set of stop words; null means stop words are allowed.</summary>
+        private System.Collections.Hashtable stopWords = DEFAULT_STOP_WORDS;
+
+        /// <summary> Return a Query with no more than this many terms.
+        /// 
+        /// </summary>
+        /// <seealso cref="BooleanQuery#getMaxClauseCount">
+        /// </seealso>
+        /// <seealso cref="#getMaxQueryTerms">
+        /// </seealso>
+        /// <seealso cref="#setMaxQueryTerms">
+        /// </seealso>
+        public const int DEFAULT_MAX_QUERY_TERMS = 25;
+
+        /// <summary> Analyzer that will be used to parse the doc.</summary>
+        private Analyzer analyzer = DEFAULT_ANALYZER;
+
+        /// <summary> Ignore words less frequent than this.</summary>
+        private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+
+        /// <summary> Ignore words which do not occur in at least this many docs.</summary>
+        private int minDocFreq = DEFALT_MIN_DOC_FREQ;
+
+        /// <summary> Should we apply a boost to the Query based on the scores?</summary>
+        private bool boost = DEFAULT_BOOST;
+
+        /// <summary> Field names we'll analyze; null means they are determined from the reader at query time.</summary>
+        private System.String[] fieldNames = DEFAULT_FIELD_NAMES;
+
+        /// <summary> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support</summary>
+        private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+
+
+
+        /// <summary> Ignore words if less than this len.</summary>
+        private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+
+        /// <summary> Ignore words if greater than this len.</summary>
+        private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+
+        /// <summary> Don't return a query longer than this.</summary>
+        private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+
+        /// <summary> For idf() calculations.</summary>
+        private Lucene.Net.Search.Similarity similarity = null;
+
+        /// <summary> IndexReader to use</summary>
+        private IndexReader ir;
+
+        /// <summary> Boost factor to use when boosting the terms </summary>
+        private float boostFactor = 1;
+
+        /// <summary>
+        /// Returns the boost factor used when boosting terms
+        /// </summary>
+        /// <returns>Returns the boost factor used when boosting terms</returns>
+        /// <seealso cref="#SetBoostFactor">
+        /// </seealso>
+        public float GetBoostFactor()
+        {
+            return boostFactor;
+        }
+
+        /// <summary>
+        /// Sets the boost factor to use when boosting terms. Only takes effect
+        /// when boosting is enabled via {@link #SetBoost}.
+        /// </summary>
+        /// <param name="boostFactor">multiplier applied to each generated term query's boost</param>
+        public void SetBoostFactor(float boostFactor)
+        {
+            this.boostFactor = boostFactor;
+        }
+
+        /// <summary> Constructor requiring an IndexReader. Uses a DefaultSimilarity
+        /// for the idf() calculations.
+        /// </summary>
+        public MoreLikeThis(IndexReader ir) : this(ir,new DefaultSimilarity() )
+        {
+        }
+
+        /// <summary> Constructor specifying both the IndexReader and the Similarity
+        /// to use for idf() calculations.
+        /// </summary>
+        /// <param name="ir">the index to mine terms and document frequencies from</param>
+        /// <param name="sim">the Similarity whose Idf() method scores candidate terms</param>
+        public MoreLikeThis(IndexReader ir, Lucene.Net.Search.Similarity sim)
+        {
+            this.ir = ir;
+            this.similarity = sim;
+        }
+
+        /// <summary> Returns the Similarity used for idf() calculations.</summary>
+        public Lucene.Net.Search.Similarity GetSimilarity()
+        {
+            return similarity;
+        }
+
+        /// <summary> Sets the Similarity used for idf() calculations.</summary>
+        /// <param name="similarity">the Similarity whose Idf() method scores candidate terms</param>
+        public void SetSimilarity(Lucene.Net.Search.Similarity similarity)
+        {
+            this.similarity = similarity;
+        }
+
+        /// <summary> Returns an analyzer that will be used to parse source doc with. The default analyzer
+        /// is the {@link #DEFAULT_ANALYZER}.
+        /// 
+        /// </summary>
+        /// <returns> the analyzer that will be used to parse source doc with.
+        /// </returns>
+        /// <seealso cref="#DEFAULT_ANALYZER">
+        /// </seealso>
+        public Analyzer GetAnalyzer()
+        {
+            return analyzer;
+        }
+
+        /// <summary> Sets the analyzer to use. An analyzer is not required for generating a query with the
+        /// {@link #Like(int)} method, all other 'like' methods require an analyzer.
+        /// 
+        /// </summary>
+        /// <param name="analyzer">the analyzer to use to tokenize text.
+        /// </param>
+        public void SetAnalyzer(Analyzer analyzer)
+        {
+            this.analyzer = analyzer;
+        }
+
+        /// <summary> Returns the frequency below which terms will be ignored in the source doc. The default
+        /// frequency is the {@link #DEFAULT_MIN_TERM_FREQ}.
+        /// 
+        /// </summary>
+        /// <returns> the frequency below which terms will be ignored in the source doc.
+        /// </returns>
+        public int GetMinTermFreq()
+        {
+            return minTermFreq;
+        }
+
+        /// <summary> Sets the frequency below which terms will be ignored in the source doc.
+        /// 
+        /// </summary>
+        /// <param name="minTermFreq">the frequency below which terms will be ignored in the source doc.
+        /// </param>
+        public void SetMinTermFreq(int minTermFreq)
+        {
+            this.minTermFreq = minTermFreq;
+        }
+
+        /// <summary> Returns the frequency at which words will be ignored which do not occur in at least this
+        /// many docs. The default frequency is {@link #DEFALT_MIN_DOC_FREQ}.
+        /// 
+        /// </summary>
+        /// <returns> the frequency at which words will be ignored which do not occur in at least this
+        /// many docs.
+        /// </returns>
+        public int GetMinDocFreq()
+        {
+            return minDocFreq;
+        }
+
+        /// <summary> Sets the frequency at which words will be ignored which do not occur in at least this
+        /// many docs.
+        /// 
+        /// </summary>
+        /// <param name="minDocFreq">the frequency at which words will be ignored which do not occur in at
+        /// least this many docs.
+        /// </param>
+        public void SetMinDocFreq(int minDocFreq)
+        {
+            this.minDocFreq = minDocFreq;
+        }
+
+        /// <summary> Returns whether to boost terms in query based on "score" or not. The default is
+        /// {@link #DEFAULT_BOOST}.
+        /// 
+        /// </summary>
+        /// <returns> whether to boost terms in query based on "score" or not.
+        /// </returns>
+        /// <seealso cref="#SetBoost">
+        /// </seealso>
+        public bool IsBoost()
+        {
+            return boost;
+        }
+
+        /// <summary> Sets whether to boost terms in query based on "score" or not.
+        /// The boost applied to each term is scaled by {@link #SetBoostFactor}.
+        /// 
+        /// </summary>
+        /// <param name="boost">true to boost terms in query based on "score", false otherwise.
+        /// </param>
+        /// <seealso cref="#isBoost">
+        /// </seealso>
+        public void SetBoost(bool boost)
+        {
+            this.boost = boost;
+        }
+
+        /// <summary> Returns the field names that will be used when generating the 'More Like This' query.
+        /// The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}.
+        /// 
+        /// </summary>
+        /// <returns> the field names that will be used when generating the 'More Like This' query.
+        /// </returns>
+        public System.String[] GetFieldNames()
+        {
+            return fieldNames;
+        }
+
+        /// <summary> Sets the field names that will be used when generating the 'More Like This' query.
+        /// Set this to null for the field names to be determined at runtime from the IndexReader
+        /// provided in the constructor.
+        /// 
+        /// </summary>
+        /// <param name="fieldNames">the field names that will be used when generating the 'More Like This'
+        /// query.
+        /// </param>
+        public void SetFieldNames(System.String[] fieldNames)
+        {
+            this.fieldNames = fieldNames;
+        }
+
+        /// <summary> Returns the minimum word length below which words will be ignored. Set this to 0 for no
+        /// minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}.
+        /// 
+        /// </summary>
+        /// <returns> the minimum word length below which words will be ignored.
+        /// </returns>
+        public int GetMinWordLen()
+        {
+            return minWordLen;
+        }
+
+        /// <summary> Sets the minimum word length below which words will be ignored.
+        /// 
+        /// </summary>
+        /// <param name="minWordLen">the minimum word length below which words will be ignored.
+        /// </param>
+        public void SetMinWordLen(int minWordLen)
+        {
+            this.minWordLen = minWordLen;
+        }
+
+        /// <summary> Returns the maximum word length above which words will be ignored. Set this to 0 for no
+        /// maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}.
+        /// 
+        /// </summary>
+        /// <returns> the maximum word length above which words will be ignored.
+        /// </returns>
+        public int GetMaxWordLen()
+        {
+            return maxWordLen;
+        }
+
+        /// <summary> Sets the maximum word length above which words will be ignored.
+        /// 
+        /// </summary>
+        /// <param name="maxWordLen">the maximum word length above which words will be ignored.
+        /// </param>
+        public void SetMaxWordLen(int maxWordLen)
+        {
+            this.maxWordLen = maxWordLen;
+        }
+
+        /// <summary> Set the set of stopwords.
+        /// Any word in this set is considered "uninteresting" and ignored.
+        /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+        /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+        /// 
+        /// </summary>
+        /// <param name="stopWords">set of stopwords, if null it means to allow stop words
+        /// 
+        /// </param>
+        /// <seealso cref="StopFilter.makeStopSet()">
+        /// </seealso>
+        /// <seealso cref="#getStopWords">
+        /// </seealso>
+        public void SetStopWords(System.Collections.Hashtable stopWords)
+        {
+            this.stopWords = stopWords;
+        }
+
+        /// <summary> Get the current stop words being used; null means stop words are allowed.</summary>
+        /// <seealso cref="#setStopWords">
+        /// </seealso>
+        public System.Collections.Hashtable GetStopWords()
+        {
+            return stopWords;
+        }
+
+
+        /// <summary> Returns the maximum number of query terms that will be included in any generated query.
+        /// The default is {@link #DEFAULT_MAX_QUERY_TERMS}.
+        /// 
+        /// </summary>
+        /// <returns> the maximum number of query terms that will be included in any generated query.
+        /// </returns>
+        public int GetMaxQueryTerms()
+        {
+            return maxQueryTerms;
+        }
+
+        /// <summary> Sets the maximum number of query terms that will be included in any generated query.
+        /// 
+        /// </summary>
+        /// <param name="maxQueryTerms">the maximum number of query terms that will be included in any
+        /// generated query.
+        /// </param>
+        public void SetMaxQueryTerms(int maxQueryTerms)
+        {
+            this.maxQueryTerms = maxQueryTerms;
+        }
+
+        /// <returns> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+        /// </returns>
+        /// <seealso cref="#DEFAULT_MAX_NUM_TOKENS_PARSED">
+        /// </seealso>
+        public int GetMaxNumTokensParsed()
+        {
+            return maxNumTokensParsed;
+        }
+
+        /// <summary> Sets the maximum number of tokens to parse in each example doc field
+        /// that is not stored with TermVector support.
+        /// </summary>
+        /// <param name="i">The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
+        /// </param>
+        public void SetMaxNumTokensParsed(int i)
+        {
+            maxNumTokensParsed = i;
+        }
+
+
+
+
+        /// <summary> Return a query that will return docs like the passed lucene document ID.
+        /// 
+        /// </summary>
+        /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This' query for.
+        /// </param>
+        /// <returns> a query that will return docs like the passed lucene document ID.
+        /// </returns>
+        public Query Like(int docNum)
+        {
+            if (fieldNames == null)
+            {
+                // Field names were not set explicitly: lazily gather the list of
+                // all indexed fields from the reader (done once; cached in fieldNames).
+                System.Collections.Generic.ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+                fieldNames = new string[fields.Count];
+                fields.CopyTo(fieldNames, 0);
+            }
+
+            return CreateQuery(RetrieveTerms(docNum));
+        }
+
+        /// <summary> Return a query that will return docs like the passed file.
+        /// The file is read with the platform default encoding.
+        /// </summary>
+        /// <param name="f">the file whose contents are analyzed for representative terms.
+        /// </param>
+        /// <returns> a query that will return docs like the passed file.
+        /// </returns>
+        public Query Like(System.IO.FileInfo f)
+        {
+            if (fieldNames == null)
+            {
+                // gather list of valid fields from lucene
+                System.Collections.Generic.ICollection<string> fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+                fieldNames = new string[fields.Count];
+                fields.CopyTo(fieldNames, 0);
+            }
+
+            // bugfix: dispose the reader (and its file handle) once the query has
+            // been built; the text is fully consumed before Like(TextReader) returns,
+            // so closing here is safe.
+            using (System.IO.StreamReader reader = new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default))
+            {
+                return Like(reader);
+            }
+        }
+
+        /// <summary> Return a query that will return docs like the passed URL.
+        /// The response body is read with the platform default encoding.
+        /// </summary>
+        /// <param name="u">the URL whose content is analyzed for representative terms.
+        /// </param>
+        /// <returns> a query that will return docs like the passed URL.
+        /// </returns>
+        public Query Like(System.Uri u)
+        {
+            // bugfix: dispose the web response and its stream once the query has
+            // been built (previously the connection was leaked). The reader is
+            // fully consumed before Like(TextReader) returns, so closing here is safe.
+            using (System.Net.WebResponse response = System.Net.WebRequest.Create(u).GetResponse())
+            using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
+            {
+                return Like(reader);
+            }
+        }
+
+        /// <summary> Return a query that will return docs like the passed stream.
+        /// The stream is read with the platform default encoding and is not closed
+        /// by this method - the caller retains ownership of it.
+        /// </summary>
+        /// <returns> a query that will return docs like the passed stream.
+        /// </returns>
+        public Query Like(System.IO.Stream is_Renamed)
+        {
+            return Like(new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default));
+        }
+
+        /// <summary> Return a query that will return docs like the passed Reader.
+        /// The text is tokenized to collect term frequencies, so (unlike
+        /// {@link #Like(int)}) this overload requires the configured Analyzer.
+        /// </summary>
+        /// <returns> a query that will return docs like the passed Reader.
+        /// </returns>
+        public Query Like(System.IO.TextReader r)
+        {
+            return CreateQuery(RetrieveTerms(r));
+        }
+
+        /// <summary> Create the More like query from a PriorityQueue of
+        /// [word, fieldName, score, ...] entries (as built by CreateQueue).
+        /// Honors maxQueryTerms and the global BooleanQuery clause limit.
+        /// </summary>
+        private Query CreateQuery(PriorityQueue q)
+        {
+            BooleanQuery query = new BooleanQuery();
+            System.Object cur;
+            int qterms = 0;
+            float bestScore = 0;
+
+            while (((cur = q.Pop()) != null))
+            {
+                System.Object[] ar = (System.Object[])cur;
+                TermQuery tq = new TermQuery(new Term((System.String)ar[1], (System.String)ar[0]));
+
+                if (boost)
+                {
+                    // The first popped (best) term defines the normalization
+                    // baseline, so it receives a boost of exactly boostFactor.
+                    // Note: System.Single == float, so a single unbox cast suffices.
+                    if (qterms == 0)
+                    {
+                        bestScore = (float)ar[2];
+                    }
+                    float myScore = (float)ar[2];
+
+                    tq.SetBoost(boostFactor * myScore / bestScore);
+                }
+
+                try
+                {
+                    query.Add(tq, BooleanClause.Occur.SHOULD);
+                }
+                catch (BooleanQuery.TooManyClauses)
+                {
+                    // Clause limit reached - the query built so far is still usable,
+                    // so stop adding terms rather than propagate the exception.
+                    // (Exception variable removed: it was unused - compiler warning CS0168.)
+                    break;
+                }
+
+                qterms++;
+                if (maxQueryTerms > 0 && qterms >= maxQueryTerms)
+                {
+                    break;
+                }
+            }
+
+            return query;
+        }
+
+        /// <summary> Create a PriorityQueue from a word->tf map.
+        /// Words failing the minTermFreq or minDocFreq thresholds are dropped;
+        /// surviving words are scored tf*idf and ordered by the FreqQ queue.
+        /// </summary>
+        /// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
+        /// </param>
+        private PriorityQueue CreateQueue(System.Collections.IDictionary words)
+        {
+            // have collected all words in doc and their freqs
+            int numDocs = ir.NumDocs();
+            FreqQ res = new FreqQ(words.Count); // will order words by score
+
+            System.Collections.IEnumerator it = words.Keys.GetEnumerator();
+            while (it.MoveNext())
+            {
+                // for every word
+                System.String word = (System.String)it.Current;
+
+                int tf = ((Int)words[word]).x; // term freq in the source doc
+                if (minTermFreq > 0 && tf < minTermFreq)
+                {
+                    continue; // filter out words that don't occur enough times in the source
+                }
+
+                // go through all the fields and find the largest document frequency
+                System.String topField = fieldNames[0];
+                int docFreq = 0;
+                for (int i = 0; i < fieldNames.Length; i++)
+                {
+                    int freq = ir.DocFreq(new Term(fieldNames[i], word));
+                    topField = (freq > docFreq) ? fieldNames[i] : topField;
+                    docFreq = (freq > docFreq) ? freq : docFreq;
+                }
+
+                if (minDocFreq > 0 && docFreq < minDocFreq)
+                {
+                    continue; // filter out words that don't occur in enough docs
+                }
+
+                if (docFreq == 0)
+                {
+                    continue; // index update problem?
+                }
+
+                float idf = similarity.Idf(docFreq, numDocs);
+                float score = tf * idf;
+
+                // only really need 1st 3 entries, other ones are for troubleshooting
+                res.Insert(new System.Object[] { word, topField, (float)score, (float)idf, (System.Int32)docFreq, (System.Int32)tf });
+            }
+            return res;
+        }
+
+        /// <summary> Describe the parameters that control how the "more like this" query is formed.</summary>
+        /// <returns> a human-readable, multi-line description of the current settings.
+        /// </returns>
+        public System.String DescribeParams()
+        {
+            System.Text.StringBuilder sb = new System.Text.StringBuilder();
+            sb.Append("\t" + "maxQueryTerms  : " + maxQueryTerms + "\n");
+            sb.Append("\t" + "minWordLen     : " + minWordLen + "\n");
+            sb.Append("\t" + "maxWordLen     : " + maxWordLen + "\n");
+            sb.Append("\t" + "fieldNames     : \"");
+            System.String delim = "";
+            for (int i = 0; i < fieldNames.Length; i++)
+            {
+                System.String fieldName = fieldNames[i];
+                sb.Append(delim).Append(fieldName);
+                delim = ", ";
+            }
+            // bugfix: close the quote opened after "fieldNames     : " so the
+            // description output is balanced.
+            sb.Append("\"");
+            sb.Append("\n");
+            sb.Append("\t" + "boost          : " + boost + "\n");
+            sb.Append("\t" + "minTermFreq    : " + minTermFreq + "\n");
+            sb.Append("\t" + "minDocFreq     : " + minDocFreq + "\n");
+            return sb.ToString();
+        }
+
+        /// <summary> Test driver.
+        /// Pass in "-i INDEX" and then either "-f FILE" or "-url URL".
+        /// (The option parsed below is "-f", not "-fn" as an earlier comment said.)
+        /// </summary>
+        [STAThread]
+        public static void Main(System.String[] a)
+        {
+            // Defaults used when no command-line arguments are supplied.
+            System.String indexName = "localhost_index";
+            System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
+            System.Uri url = null;
+            for (int i = 0; i < a.Length; i++)
+            {
+                if (a[i].Equals("-i"))
+                {
+                    indexName = a[++i];
+                }
+                else if (a[i].Equals("-f"))
+                {
+                    fn = a[++i];
+                }
+                else if (a[i].Equals("-url"))
+                {
+                    url = new System.Uri(a[++i]);
+                }
+            }
+
+            // Auto-flushing console writer so output appears immediately.
+            System.IO.StreamWriter temp_writer;
+            temp_writer = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
+            temp_writer.AutoFlush = true;
+            System.IO.StreamWriter o = temp_writer;
+            // NOTE(review): the reader and searcher below are never closed;
+            // acceptable for a short-lived test driver.
+            IndexReader r = IndexReader.Open(indexName);
+            o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");
+
+            MoreLikeThis mlt = new MoreLikeThis(r);
+
+            o.WriteLine("Query generation parameters:");
+            o.WriteLine(mlt.DescribeParams());
+            o.WriteLine();
+
+            Query query = null;
+            if (url != null)
+            {
+                o.WriteLine("Parsing URL: " + url);
+                query = mlt.Like(url);
+            }
+            else if (fn != null)
+            {
+                o.WriteLine("Parsing file: " + fn);
+                query = mlt.Like(new System.IO.FileInfo(fn));
+            }
+
+            o.WriteLine("q: " + query);
+            o.WriteLine();
+            IndexSearcher searcher = new IndexSearcher(indexName);
+
+            Hits hits = searcher.Search(query);
+            int len = hits.Length();
+            o.WriteLine("found: " + len + " documents matching");
+            o.WriteLine();
+            // Print at most the top 25 matches (the target doc itself may appear).
+            for (int i = 0; i < System.Math.Min(25, len); i++)
+            {
+                Document d = hits.Doc(i);
+                System.String summary = d.Get("summary");
+                o.WriteLine("score  : " + hits.Score(i));
+                o.WriteLine("url    : " + d.Get("url"));
+                o.WriteLine("\ttitle  : " + d.Get("title"));
+                if (summary != null)
+                    o.WriteLine("\tsummary: " + d.Get("summary"));
+                o.WriteLine();
+            }
+        }
+
        /// <summary> Find words for a more-like-this query former.
        /// Collects term frequencies for every configured field of the given document:
        /// when the field has a stored term vector it is used directly; otherwise the
        /// stored field text is re-analyzed (which requires the field to be stored).
        /// </summary>
        /// <param name="docNum">the id of the lucene document from which to find terms
        /// </param>
        /// <returns> a priority queue of the document's terms ordered by score </returns>
        private PriorityQueue RetrieveTerms(int docNum)
        {
            System.Collections.IDictionary termFreqMap = new System.Collections.Hashtable();
            for (int i = 0; i < fieldNames.Length; i++)
            {
                System.String fieldName = fieldNames[i];
                TermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);

                // field does not store term vector info — fall back to re-analyzing
                // the stored field values; if the field is neither vectored nor
                // stored, no terms are gathered for it
                if (vector == null)
                {
                    Document d = ir.Document(docNum);
                    System.String[] text = d.GetValues(fieldName);
                    if (text != null)
                    {
                        for (int j = 0; j < text.Length; j++)
                        {
                            AddTermFrequencies(new System.IO.StringReader(text[j]), termFreqMap, fieldName);
                        }
                    }
                }
                else
                {
                    // term vector available: read term/frequency pairs directly
                    AddTermFrequencies(termFreqMap, vector);
                }
            }

            return CreateQueue(termFreqMap);
        }
+
+        /// <summary> Adds terms and frequencies found in vector into the Map termFreqMap</summary>
+        /// <param name="termFreqMap">a Map of terms and their frequencies
+        /// </param>
+        /// <param name="vector">List of terms and their frequencies for a doc/field
+        /// </param>
+        private void AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
+        {
+            System.String[] terms = vector.GetTerms();
+            int[] freqs = vector.GetTermFrequencies();
+            for (int j = 0; j < terms.Length; j++)
+            {
+                System.String term = terms[j];
+
+                if (IsNoiseWord(term))
+                {
+                    continue;
+                }
+                // increment frequency
+                Int cnt = (Int)termFreqMap[term];
+                if (cnt == null)
+                {
+                    cnt = new Int();
+                    termFreqMap[term] = cnt;
+                    cnt.x = freqs[j];
+                }
+                else
+                {
+                    cnt.x += freqs[j];
+                }
+            }
+        }
+        /// <summary> Adds term frequencies found by tokenizing text from reader into the Map words</summary>
+        /// <param name="r">a source of text to be tokenized
+        /// </param>
+        /// <param name="termFreqMap">a Map of terms and their frequencies
+        /// </param>
+        /// <param name="fieldName">Used by analyzer for any special per-field analysis
+        /// </param>
+        private void AddTermFrequencies(System.IO.TextReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
+        {
+            TokenStream ts = analyzer.TokenStream(fieldName, r);
+			int tokenCount=0;
+			// for every token
+			TermAttribute termAtt = (TermAttribute) ts.AddAttribute(typeof(TermAttribute));
+			
+			while (ts.IncrementToken()) {
+				string word = termAtt.Term();
+				tokenCount++;
+				if(tokenCount>maxNumTokensParsed)
+				{
+					break;
+				}
+				if(IsNoiseWord(word)){
+					continue;
+				}
+				
+				// increment frequency
+				Int cnt = (Int) termFreqMap[word];
+				if (cnt == null) {
+                    termFreqMap[word] = new Int();
+				}
+				else {
+					cnt.x++;
+				}
+			}
+        }
+
+
+        /// <summary>determines if the passed term is likely to be of interest in "more like" comparisons 
+        /// 
+        /// </summary>
+        /// <param name="term">The word being considered
+        /// </param>
+        /// <returns> true if should be ignored, false if should be used in further analysis
+        /// </returns>
+        private bool IsNoiseWord(System.String term)
+        {
+            int len = term.Length;
+            if (minWordLen > 0 && len < minWordLen)
+            {
+                return true;
+            }
+            if (maxWordLen > 0 && len > maxWordLen)
+            {
+                return true;
+            }
+            if (stopWords != null && stopWords.Contains(term))
+            {
+                return true;
+            }
+            return false;
+        }
+
+
+        /// <summary> Find words for a more-like-this query former.
+        /// The result is a priority queue of arrays with one entry for <b>every word</b> in the document.
+        /// Each array has 6 elements.
+        /// The elements are:
+        /// <ol>
+        /// <li> The word (String)
+        /// <li> The top field that this word comes from (String)
+        /// <li> The score for this word (Float)
+        /// <li> The IDF value (Float)
+        /// <li> The frequency of this word in the index (Integer)
+        /// <li> The frequency of this word in the source document (Integer)	 	 
+        /// </ol>
+        /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
+        /// This method is exposed so that you can identify the "interesting words" in a document.
+        /// For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
+        /// 
+        /// </summary>
+        /// <param name="r">the reader that has the content of the document
+        /// </param>
+        /// <returns> the most intresting words in the document ordered by score, with the highest scoring, or best entry, first
+        /// 
+        /// </returns>
+        /// <seealso cref="#retrieveInterestingTerms">
+        /// </seealso>
+        public PriorityQueue RetrieveTerms(System.IO.TextReader r)
+        {
+            System.Collections.IDictionary words = new System.Collections.Hashtable();
+            for (int i = 0; i < fieldNames.Length; i++)
+            {
+                System.String fieldName = fieldNames[i];
+                AddTermFrequencies(r, words, fieldName);
+            }
+            return CreateQueue(words);
+        }
+
+
+        public System.String[] RetrieveInterestingTerms(int docNum)
+        {
+            System.Collections.ArrayList al = new System.Collections.ArrayList(maxQueryTerms);
+            PriorityQueue pq = RetrieveTerms(docNum);
+            System.Object cur;
+            int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
+            // we just want to return the top words
+            while (((cur = pq.Pop()) != null) && lim-- > 0)
+            {
+                System.Object[] ar = (System.Object[])cur;
+                al.Add(ar[0]); // the 1st entry is the interesting word
+            }
+            System.String[] res = new System.String[al.Count];
+            // return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res);
+            return (System.String[])al.ToArray(typeof(System.String));
+        }
+
+        /// <summary> Convenience routine to make it easy to return the most interesting words in a document.
+        /// More advanced users will call {@link #RetrieveTerms(java.io.Reader) retrieveTerms()} directly.
+        /// </summary>
+        /// <param name="r">the source document
+        /// </param>
+        /// <returns> the most interesting words in the document
+        /// 
+        /// </returns>
+        /// <seealso cref="#RetrieveTerms(java.io.Reader)">
+        /// </seealso>
+        /// <seealso cref="#setMaxQueryTerms">
+        /// </seealso>
+        public System.String[] RetrieveInterestingTerms(System.IO.TextReader r)
+        {
+            System.Collections.ArrayList al = new System.Collections.ArrayList(maxQueryTerms);
+            PriorityQueue pq = RetrieveTerms(r);
+            System.Object cur;
+            int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
+            // we just want to return the top words
+            while (((cur = pq.Pop()) != null) && lim-- > 0)
+            {
+                System.Object[] ar = (System.Object[])cur;
+                al.Add(ar[0]); // the 1st entry is the interesting word
+            }
+            System.String[] res = new System.String[al.Count];
+            // return (System.String[]) SupportClass.ICollectionSupport.ToArray(al, res);
+            return (System.String[])al.ToArray(typeof(System.String));
+        }
+
+        /// <summary> PriorityQueue that orders words by score.</summary>
+        private class FreqQ : PriorityQueue
+        {
+            internal FreqQ(int s)
+            {
+                Initialize(s);
+            }
+
+            override public bool LessThan(System.Object a, System.Object b)
+            {
+                System.Object[] aa = (System.Object[])a;
+                System.Object[] bb = (System.Object[])b;
+                System.Single fa = (System.Single)aa[2];
+                System.Single fb = (System.Single)bb[2];
+                return (float)fa > (float)fb;
+            }
+        }
+
        /// <summary> Use for frequencies and to avoid renewing Integers.
        /// Boxed System.Int32 values are immutable, so this mutable box lets the
        /// frequency maps increment counts in place without re-boxing.</summary>
        private class Int
        {
            internal int x; // the running frequency count

            internal Int()
            {
                // a counter is created on a term's first sighting, hence starts at 1
                x = 1;
            }
        }
+    }
+}
\ No newline at end of file

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThisQuery.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThisQuery.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThisQuery.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/MoreLikeThisQuery.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,163 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Search;
+using Lucene.Net.Analysis;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Search.Similar
+{
    /// <summary>
    /// A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required eg
    /// in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
    /// actual MoreLikeThis object and obtain the real Query object.
    /// </summary>
    public class MoreLikeThisQuery : Query
    {


        private String likeText;            // source text the rewritten query finds similar docs to
        private String[] moreLikeFields;    // index fields terms are generated from
        private Analyzer analyzer;          // analyzer used to tokenize likeText
        float percentTermsToMatch = 0.3f;   // fraction of generated terms a match must contain
        int minTermFrequency = 1;
        int maxQueryTerms = 5;
        System.Collections.Hashtable stopWords = null;
        int minDocFreq = -1;                // negative means "keep MoreLikeThis' default"


        /// <summary> Creates a query that, at Rewrite time, finds documents similar to likeText.</summary>
        /// <param name="likeText">the source text to find similar documents for</param>
        /// <param name="moreLikeFields">the index fields terms are generated from</param>
        /// <param name="analyzer">the analyzer used to tokenize likeText</param>
        public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
        {
            this.likeText = likeText;
            this.moreLikeFields = moreLikeFields;
            this.analyzer = analyzer;
        }

        /// <summary> Builds the real MoreLikeThis query against the given reader,
        /// applying the configured parameters.</summary>
        public override Query Rewrite(IndexReader reader)
        {
            MoreLikeThis mlt = new MoreLikeThis(reader);

            mlt.SetFieldNames(moreLikeFields);
            mlt.SetAnalyzer(analyzer);
            mlt.SetMinTermFreq(minTermFrequency);
            if (minDocFreq >= 0)
            {
                mlt.SetMinDocFreq(minDocFreq);
            }
            mlt.SetMaxQueryTerms(maxQueryTerms);
            mlt.SetStopWords(stopWords);
            BooleanQuery bq = (BooleanQuery)mlt.Like( new System.IO.StringReader(likeText));
            BooleanClause[] clauses = bq.GetClauses();
            // require a percentTermsToMatch fraction of the generated terms to match
            // (defaults to 0.3 — an older comment claimed "half", which was stale)
            bq.SetMinimumNumberShouldMatch((int)(clauses.Length * percentTermsToMatch));
            return bq;
        }
        /* (non-Javadoc)
         * @see org.apache.lucene.search.Query#toString(java.lang.String)
         */
        public override String ToString(String field)
        {
            return "like:" + likeText;
        }

        public float GetPercentTermsToMatch()
        {
            return percentTermsToMatch;
        }
        /// <summary> Sets the fraction (0..1) of generated terms a match must contain.</summary>
        public void SetPercentTermsToMatch(float percentTermsToMatch)
        {
            this.percentTermsToMatch = percentTermsToMatch;
        }

        public  Analyzer GetAnalyzer()
        {
            return analyzer;
        }

        public void SetAnalyzer(Analyzer analyzer)
        {
            this.analyzer = analyzer;
        }

        public String GetLikeText()
        {
            return likeText;
        }

        public void SetLikeText(String likeText)
        {
            this.likeText = likeText;
        }

        public int GetMaxQueryTerms()
        {
            return maxQueryTerms;
        }

        public void SetMaxQueryTerms(int maxQueryTerms)
        {
            this.maxQueryTerms = maxQueryTerms;
        }

        public int GetMinTermFrequency()
        {
            return minTermFrequency;
        }

        public void SetMinTermFrequency(int minTermFrequency)
        {
            this.minTermFrequency = minTermFrequency;
        }

        public String[] GetMoreLikeFields()
        {
            return moreLikeFields;
        }

        public void SetMoreLikeFields(String[] moreLikeFields)
        {
            this.moreLikeFields = moreLikeFields;
        }
        public System.Collections.Hashtable GetStopWords()
        {
            return stopWords;
        }
        public void SetStopWords(System.Collections.Hashtable stopWords)
        {
            this.stopWords = stopWords;
        }

        public int GetMinDocFreq()
        {
            return minDocFreq;
        }

        /// <summary> Sets the minimum document frequency; a negative value leaves
        /// the MoreLikeThis default in effect (see Rewrite).</summary>
        public void SetMinDocFreq(int minDocFreq)
        {
            this.minDocFreq = minDocFreq;
        }
    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/SimilarityQueries.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Similar/SimilarityQueries.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/SimilarityQueries.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/SimilarityQueries.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using Analyzer = Lucene.Net.Analysis.Analyzer;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+using Term = Lucene.Net.Index.Term;
+using BooleanQuery = Lucene.Net.Search.BooleanQuery;
+using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+using Query = Lucene.Net.Search.Query;
+using TermQuery = Lucene.Net.Search.TermQuery;
+using BooleanClause = Lucene.Net.Search.BooleanClause;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Similarity.Net
+{
+
    /// <summary> Simple similarity measures.
    /// </summary>
    /// <seealso cref="MoreLikeThis">
    /// </seealso>
    public sealed class SimilarityQueries
    {
        /// <summary> Static helper class; not instantiable.</summary>
        private SimilarityQueries()
        {
        }

        /// <summary> Simple similarity query generators.
        /// Takes every unique word and forms a boolean query where all words are optional.
        /// After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
        /// need to then ignore that.
        /// 
        /// <p>
        /// 
        /// So, if you have a code fragment like this:
        /// <br>
        /// <code>
        /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
        /// </code>
        /// 
        /// <p>
        /// 
        /// The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
        /// 
        /// <p>
        /// The philosophy behind this method is "two documents are similar if they share lots of words".
        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a
        /// higher similarity score if they share more uncommon words.
        /// 
        /// <P>
        /// This method is fail-safe in that if a long 'body' is passed in and
        /// {@link BooleanQuery#add BooleanQuery.add()} (used internally)
        /// throws
        /// {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
        /// query as it is will be returned.
        /// </summary>
        /// <param name="body">the body of the document you want to find similar documents to
        /// </param>
        /// <param name="a">the analyzer to use to parse the body
        /// </param>
        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
        /// </param>
        /// <param name="stop">optional set of stop words to ignore
        /// </param>
        /// <returns> a query with all unique words in 'body'
        /// </returns>
        /// <throws>  IOException this can't happen... </throws>
        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
        {
            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
            TermAttribute termAtt = (TermAttribute)ts.AddAttribute(typeof(TermAttribute));

            // NOTE(review): ts is never End()'d/Close()'d here — presumably harmless for a
            // StringReader-backed stream, but confirm against the analyzers actually used.
            BooleanQuery tmp = new BooleanQuery();
            System.Collections.Hashtable already = new System.Collections.Hashtable(); // ignore dups
            while (ts.IncrementToken())
            {
                String word = termAtt.Term();
                // ignore opt stop words
                if (stop != null && stop.Contains(word))
                    continue;
                // ignore dups
                if (already.Contains(word) == true)
                    continue;
                already.Add(word, word);
                // add to query
                TermQuery tq = new TermQuery(new Term(field, word));
                try
                {
                    tmp.Add(tq, BooleanClause.Occur.SHOULD);
                }
                catch (BooleanQuery.TooManyClauses)
                {
                    // fail-safe, just return what we have, not the end of the world
                    break;
                }
            }
            return tmp;
        }
    }
+}
\ No newline at end of file

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/package.html
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Similar/package.html?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/package.html (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Similar/package.html Wed Mar  3 21:31:20 2010
@@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<body>
+Document similarity query generators.
+</body>
+</html>
\ No newline at end of file

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Support.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/Support.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Support.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/Support.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,21 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search
+{
+    static class Extensions
+    {
+        internal static bool EqualsToArrayList(this ArrayList me, ArrayList other)
+        {
+            if (me.Count != other.Count) return false;
+            for (int i = 0; i < me.Count; i++)
+            {
+                if (me[i].Equals(other[i]) == false) return false;
+            }
+            return true;
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/TermsFilter.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Queries.Net/TermsFilter.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/TermsFilter.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Queries.Net/TermsFilter.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search
+{
    /// <summary> Placeholder for a port of the Java contrib TermsFilter.
    /// NOTE(review): not yet implemented — the constructor deliberately throws
    /// so any accidental use fails fast rather than silently filtering nothing.</summary>
    class TermsFilter
    {
        public TermsFilter()
        {
            throw new NotImplementedException("Not implemented yet.");
        }
    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BooleanFilterTest.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/BooleanFilterTest.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BooleanFilterTest.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BooleanFilterTest.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Search
+{
    // Placeholder test class: no tests ported yet from the Java contrib
    // BooleanFilterTest (note it carries no [TestFixture] attribute, so NUnit
    // ignores it as-is).
    class BooleanFilterTest
    {
    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BoostingQueryTest.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/BoostingQueryTest.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BoostingQueryTest.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/BoostingQueryTest.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Index;
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Search
+{
    /// <summary> Tests for BoostingQuery equality semantics.</summary>
    [TestFixture]
    public class BoostingQueryTest : Lucene.Net.TestCase
    {
        /// <summary> Two BoostingQueries built from equal sub-queries and the same
        /// boost must compare equal.</summary>
        [Test]
        public void TestBoostingQueryEquals()
        {
            // NOTE(review): the field name "subject:" includes a trailing ':' —
            // presumably carried over from the Java test; confirm it is intentional.
            TermQuery q1 = new TermQuery(new Term("subject:", "java"));
            TermQuery q2 = new TermQuery(new Term("subject:", "java"));
            Assert.AreEqual(q1, q2, "Two TermQueries with same attributes should be equal");
            BoostingQuery bq1 = new BoostingQuery(q1, q2, 0.1f);
            BoostingQuery bq2 = new BoostingQuery(q1, q2, 0.1f);
            Assert.AreEqual(bq1, bq2, "BoostingQuery with same attributes is not equal");
        }
    }

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/DuplicateFilterTest.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/DuplicateFilterTest.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/DuplicateFilterTest.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/DuplicateFilterTest.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,173 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Documents;
+using Lucene.Net.Store;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Index;
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Search
+{
+    /// <summary>
+    /// Tests for <see cref="DuplicateFilter"/>: indexes several documents sharing
+    /// the same key field ("url") and verifies that the filter collapses
+    /// duplicates according to its keep-mode / processing-mode settings.
+    /// </summary>
+    [TestFixture]
+    public class DuplicateFilterTest : TestCase
+    {
+        // Field whose value identifies duplicates in every test below.
+        private static String KEY_FIELD = "url";
+        private RAMDirectory directory;
+        private IndexReader reader;
+        // All tests search for documents containing "lucene" in the text field.
+        private TermQuery tq = new TermQuery(new Term("text", "lucene"));
+        private IndexSearcher searcher;
+
+        [SetUp]
+        public void SetUp()
+        {
+            directory = new RAMDirectory();
+            IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(), true);
+
+            // Add series of docs with filterable fields: url, text and date flags.
+            // Two distinct urls, several docs each, so duplicates exist.
+            AddDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
+            AddDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
+            AddDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
+            AddDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
+            AddDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
+            AddDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
+            AddDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
+            AddDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
+
+            writer.Close();
+            reader = IndexReader.Open(directory, true);
+            searcher = new IndexSearcher(reader);
+        }
+
+        [TearDown]
+        public void TearDown()
+        {
+            // Close the searcher before the reader it wraps, then the directory.
+            searcher.Close();
+            reader.Close();
+            directory.Close();
+        }
+
+        // Adds one document with the duplicate key (url), searchable text and a date.
+        private void AddDoc(IndexWriter writer, String url, String text, String date)
+        {
+            Document doc = new Document();
+            doc.Add(new Field(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
+            doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
+            doc.Add(new Field("date", date, Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+        }
+
+        [Test]
+        public void TestDefaultFilter()
+        {
+            // With default settings each url must appear at most once in the hits.
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            Hashtable results = new Hashtable();
+            Hits h = searcher.Search(tq, df);
+            for (int i = 0; i < h.Length(); i++)
+            {
+                Document d = h.Doc(i);
+                String url = d.Get(KEY_FIELD);
+                Assert.IsFalse(results.Contains(url), "No duplicate urls should be returned");
+                results.Add(url, url);
+            }
+        }
+
+        [Test]
+        public void TestNoFilter()
+        {
+            // Control test: without the filter, duplicate urls must be present,
+            // otherwise the filtered tests would pass vacuously.
+            Hashtable results = new Hashtable();
+            Hits h = searcher.Search(tq);
+            Assert.IsTrue(h.Length() > 0, "Default searching should have found some matches");
+            bool dupsFound = false;
+            for (int i = 0; i < h.Length(); i++)
+            {
+                Document d = h.Doc(i);
+                String url = d.Get(KEY_FIELD);
+                if (!dupsFound)
+                    dupsFound = results.Contains(url);
+                results[url] = url;
+            }
+            Assert.IsTrue(dupsFound, "Default searching should have found duplicate urls");
+        }
+
+        [Test]
+        public void TestFastFilter()
+        {
+            // Fast-invalidation mode must still return each url exactly once.
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.SetProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
+            Hashtable results = new Hashtable();
+            Hits h = searcher.Search(tq, df);
+            Assert.IsTrue(h.Length() > 0, "Filtered searching should have found some matches");
+            for (int i = 0; i < h.Length(); i++)
+            {
+                Document d = h.Doc(i);
+                String url = d.Get(KEY_FIELD);
+                Assert.IsFalse(results.Contains(url), "No duplicate urls should be returned");
+                results.Add(url, url);
+            }
+            Assert.AreEqual(2, results.Count, "Two urls found");
+        }
+
+        [Test]
+        public void TestKeepsLastFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.SetKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
+            Hits h = searcher.Search(tq, df);
+            Assert.IsTrue(h.Length() > 0, "Filtered searching should have found some matches");
+            for (int i = 0; i < h.Length(); i++)
+            {
+                Document d = h.Doc(i);
+                String url = d.Get(KEY_FIELD);
+                // Walk the postings to find the highest doc id for this url;
+                // close the TermDocs enumerator even if an assertion throws.
+                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
+                int lastDoc = 0;
+                try
+                {
+                    while (td.Next())
+                    {
+                        lastDoc = td.Doc();
+                    }
+                }
+                finally
+                {
+                    td.Close();
+                }
+                Assert.AreEqual(lastDoc, h.Id(i), "Duplicate urls should return last doc");
+            }
+        }
+
+        [Test]
+        public void TestKeepsFirstFilter()
+        {
+            DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
+            df.SetKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
+            Hits h = searcher.Search(tq, df);
+            Assert.IsTrue(h.Length() > 0, "Filtered searching should have found some matches");
+            for (int i = 0; i < h.Length(); i++)
+            {
+                Document d = h.Doc(i);
+                String url = d.Get(KEY_FIELD);
+                // The first posting is the lowest doc id for this url; close the
+                // TermDocs enumerator even if an assertion throws.
+                TermDocs td = reader.TermDocs(new Term(KEY_FIELD, url));
+                int firstDoc = 0;
+                try
+                {
+                    Assert.IsTrue(td.Next(), "Url returned as a hit should have postings");
+                    firstDoc = td.Doc();
+                }
+                finally
+                {
+                    td.Close();
+                }
+                Assert.AreEqual(firstDoc, h.Id(i), "Duplicate urls should return first doc");
+            }
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/FuzzyLikeThisQueryTest.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/FuzzyLikeThisQueryTest.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/FuzzyLikeThisQueryTest.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/FuzzyLikeThisQueryTest.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Documents;
+using Lucene.Net.Store;
+using Lucene.Net.Search;
+using Lucene.Net.Analysis;
+using Lucene.Net.Index;
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Search
+{
+    /// <summary>
+    /// Tests for <see cref="FuzzyLikeThisQuery"/>: indexes a set of misspelt
+    /// names and verifies which fuzzy variants are produced and ranked first.
+    /// </summary>
+    [TestFixture]
+    public class FuzzyLikeThisQueryTest : Lucene.Net.TestCase
+    {
+        private RAMDirectory directory;
+        private IndexSearcher searcher;
+        private Analyzer analyzer = new WhitespaceAnalyzer();
+
+        [SetUp]
+        public void SetUp()
+        {
+            directory = new RAMDirectory();
+            IndexWriter writer = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
+
+            // Add series of docs with misspelt names
+            AddDoc(writer, "jonathon smythe", "1");
+            AddDoc(writer, "jonathan smith", "2");
+            AddDoc(writer, "johnathon smyth", "3");
+            AddDoc(writer, "johnny smith", "4");
+            AddDoc(writer, "jonny smith", "5");
+            AddDoc(writer, "johnathon smythe", "6");
+
+            writer.Close();
+            searcher = new IndexSearcher(directory, true);
+        }
+
+        [TearDown]
+        public void TearDown()
+        {
+            // Release the searcher and the in-memory directory between tests
+            // (matches the sibling fixtures in this test assembly).
+            searcher.Close();
+            directory.Close();
+        }
+
+        // Adds one document with an analyzed name and id field.
+        private void AddDoc(IndexWriter writer, String name, String id)
+        {
+            Document doc = new Document();
+            doc.Add(new Field("name", name, Field.Store.YES, Field.Index.ANALYZED));
+            doc.Add(new Field("id", id, Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+        }
+
+
+        // Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
+        [Test]
+        public void TestClosestEditDistanceMatchComesFirst()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("smith", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.GetIndexReader());
+            Hashtable queryTerms = new Hashtable();
+            q.ExtractTerms(queryTerms);
+            Assert.IsTrue(queryTerms.Contains(new Term("name", "smythe")), "Should have variant smythe");
+            Assert.IsTrue(queryTerms.Contains(new Term("name", "smith")), "Should have variant smith");
+            Assert.IsTrue(queryTerms.Contains(new Term("name", "smyth")), "Should have variant smyth");
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.scoreDocs;
+            Assert.IsTrue((sd != null) && (sd.Length > 0), "score docs must match 1 doc");
+            Document doc = searcher.Doc(sd[0].doc);
+            Assert.AreEqual("2", doc.Get("id"), "Should match most similar not most rare variant");
+        }
+
+        // Test multiple input words are having variants produced
+        [Test]
+        public void TestMultiWord()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("jonathin smoth", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.GetIndexReader());
+            Hashtable queryTerms = new Hashtable();
+            q.ExtractTerms(queryTerms);
+            Assert.IsTrue(queryTerms.Contains(new Term("name", "jonathan")), "Should have variant jonathan");
+            Assert.IsTrue(queryTerms.Contains(new Term("name", "smith")), "Should have variant smith");
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.scoreDocs;
+            Assert.IsTrue((sd != null) && (sd.Length > 0), "score docs must match 1 doc");
+            Document doc = searcher.Doc(sd[0].doc);
+            Assert.AreEqual("2", doc.Get("id"), "Should match most similar when using 2 words");
+        }
+
+        // Test bug found when first query word does not match anything
+        [Test]
+        public void TestNoMatchFirstWordBug()
+        {
+            FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+            flt.AddTerms("fernando smith", "name", 0.3f, 1);
+            Query q = flt.Rewrite(searcher.GetIndexReader());
+            Hashtable queryTerms = new Hashtable();
+            q.ExtractTerms(queryTerms);
+            Assert.IsTrue(queryTerms.Contains(new Term("name", "smith")), "Should have variant smith");
+            TopDocs topDocs = searcher.Search(flt, 1);
+            ScoreDoc[] sd = topDocs.scoreDocs;
+            Assert.IsTrue((sd != null) && (sd.Length > 0), "score docs must match 1 doc");
+            Document doc = searcher.Doc(sd[0].doc);
+            Assert.AreEqual("2", doc.Get("id"), "Should match most similar when using 2 words");
+        }
+
+        [Test]
+        public void TestFuzzyLikeThisQueryEquals()
+        {
+            // Two queries built with identical settings and terms must be equal.
+            Analyzer analyzer = new WhitespaceAnalyzer();
+            FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
+            fltq1.AddTerms("javi", "subject", 0.5f, 2);
+            FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
+            fltq2.AddTerms("javi", "subject", 0.5f, 2);
+            Assert.AreEqual(fltq1, fltq2, "FuzzyLikeThisQuery with same attributes is not equal");
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Properties/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/Properties/AssemblyInfo.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Properties/AssemblyInfo.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Properties/AssemblyInfo.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,36 @@
+// Assembly metadata for the Queries.Net contrib test assembly.
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following 
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Test for Queries.Net(Apache Lucene.Net)")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Test for Queries.Net")]
+[assembly: AssemblyCopyright("Copyright 2006 - 2010 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2006 - 2010 The Apache Software Foundation")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible 
+// to COM components.  If you need to access a type in this assembly from 
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("9feace6d-c1c2-410b-b58c-8f16378aa9fd")]
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version 
+//      Build Number
+//      Revision
+//
+// You can specify all the values or you can default the Build and Revision Numbers 
+// by using the '*' as shown below:
+// [assembly: AssemblyVersion("1.0.*")]
+// Version tracks the Lucene.Net release this contrib test assembly targets.
+[assembly: AssemblyVersion("2.9.2")]
+[assembly: AssemblyFileVersion("2.9.2")]

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Similar/TestMoreLikeThis.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/Similar/TestMoreLikeThis.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Similar/TestMoreLikeThis.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Similar/TestMoreLikeThis.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Standard;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Search.Similar
+{
+    /// <summary>
+    /// Tests for <see cref="MoreLikeThis"/>: verifies that the boost factor is
+    /// applied multiplicatively to every generated term query's boost.
+    /// </summary>
+    [TestFixture]
+    public class TestMoreLikeThis : LuceneTestCase
+    {
+        private RAMDirectory directory;
+        private IndexReader reader;
+        private IndexSearcher searcher;
+
+        [SetUp]
+        public new void SetUp()
+        {
+            base.SetUp();
+            directory = new RAMDirectory();
+            IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);
+
+            // Add series of docs with specific information for MoreLikeThis
+            AddDoc(writer, "lucene");
+            AddDoc(writer, "lucene release");
+
+            writer.Close();
+            reader = IndexReader.Open(directory, true);
+            searcher = new IndexSearcher(reader);
+        }
+
+        [TearDown]
+        public new void TearDown()
+        {
+            // Close the searcher before the reader it wraps, then the directory.
+            searcher.Close();
+            reader.Close();
+            directory.Close();
+            base.TearDown();
+        }
+
+        // Adds one document with a single analyzed text field.
+        private void AddDoc(IndexWriter writer, String text)
+        {
+            Document doc = new Document();
+            doc.Add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
+            writer.AddDocument(doc);
+        }
+
+        // Builds a MoreLikeThis over the test index with the settings shared by
+        // all tests (min frequencies relaxed so terms in the tiny index qualify).
+        private MoreLikeThis NewMoreLikeThis()
+        {
+            MoreLikeThis mlt = new MoreLikeThis(reader);
+            mlt.SetMinDocFreq(1);
+            mlt.SetMinTermFreq(1);
+            mlt.SetMinWordLen(1);
+            mlt.SetFieldNames(new String[] { "text" });
+            mlt.SetBoost(true);
+            return mlt;
+        }
+
+        [Test]
+        public void TestBoostFactor()
+        {
+            Hashtable originalValues = GetOriginalValues();
+
+            MoreLikeThis mlt = NewMoreLikeThis();
+
+            // this means that every term boost factor will be multiplied by this
+            // number
+            float boostFactor = 5;
+            mlt.SetBoostFactor(boostFactor);
+
+            BooleanQuery query = (BooleanQuery)mlt.Like(new System.IO.StringReader("lucene release"));
+            IList clauses = query.Clauses();
+
+            Assert.AreEqual(originalValues.Count, clauses.Count, "Expected " + originalValues.Count + " clauses.");
+
+            for (int i = 0; i < clauses.Count; i++)
+            {
+                BooleanClause clause = (BooleanClause)clauses[i];
+                TermQuery tq = (TermQuery)clause.GetQuery();
+                // Check for the term's presence BEFORE unboxing: casting a missing
+                // (null) Hashtable entry to float would throw, and IsNotNull on a
+                // value-type float could never fail.
+                object originalBoost = originalValues[tq.GetTerm().Text()];
+                Assert.IsNotNull(originalBoost, "Expected term " + tq.GetTerm().Text());
+                float termBoost = (float)originalBoost;
+
+                float totalBoost = termBoost * boostFactor;
+                Assert.AreEqual(totalBoost, tq.GetBoost(), 0.0001, "Expected boost of " + totalBoost + " for term '"
+                                 + tq.GetTerm().Text() + "' got " + tq.GetBoost());
+            }
+        }
+
+        // Runs the same query with the default boost factor (1) and records each
+        // generated term's baseline boost, keyed by term text.
+        private Hashtable GetOriginalValues()
+        {
+            Hashtable originalValues = new Hashtable();
+            MoreLikeThis mlt = NewMoreLikeThis();
+            BooleanQuery query = (BooleanQuery)mlt.Like(new System.IO.StringReader("lucene release"));
+            IList clauses = query.Clauses();
+
+            for (int i = 0; i < clauses.Count; i++)
+            {
+                BooleanClause clause = (BooleanClause)clauses[i];
+                TermQuery tq = (TermQuery)clause.GetQuery();
+                originalValues.Add(tq.GetTerm().Text(), tq.GetBoost());
+            }
+            return originalValues;
+        }
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/TermsFilterTest.cs
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/TermsFilterTest.cs?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/TermsFilterTest.cs (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/TermsFilterTest.cs Wed Mar  3 21:31:20 2010
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+using NUnit.Framework;
+
+namespace Lucene.Net.Search
+{
+    // Placeholder fixture: no tests implemented yet for TermsFilter.
+    // NOTE(review): class is not marked [TestFixture], so NUnit will not run it
+    // even once tests are added — presumably intentional until then; confirm.
+    class TermsFilterTest
+    {
+    }
+}

Added: lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Test.csproj
URL: http://svn.apache.org/viewvc/lucene/lucene.net/trunk/C%23/contrib/Queries.Net/Test/Test.csproj?rev=918703&view=auto
==============================================================================
--- lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Test.csproj (added)
+++ lucene/lucene.net/trunk/C#/contrib/Queries.Net/Test/Test.csproj Wed Mar  3 21:31:20 2010
@@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="3.5" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProductVersion>9.0.21022</ProductVersion>
+    <SchemaVersion>2.0</SchemaVersion>
+    <ProjectGuid>{8685A826-9B7A-42C8-88F3-EEE6B41D6D81}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Search</RootNamespace>
+    <AssemblyName>Test</AssemblyName>
+    <TargetFrameworkVersion>v3.5</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="Lucene.Net, Version=2.9.2.1, Culture=neutral, processorArchitecture=MSIL">
+      <SpecificVersion>False</SpecificVersion>
+      <HintPath>..\..\..\..\DotNet\Work for 2.9\src\Test\bin\Release\Lucene.Net.dll</HintPath>
+    </Reference>
+    <Reference Include="Lucene.Net.Test, Version=2.9.2.1, Culture=neutral, processorArchitecture=MSIL">
+      <SpecificVersion>False</SpecificVersion>
+      <HintPath>..\..\..\..\DotNet\Work for 2.9\src\Test\bin\Release\Lucene.Net.Test.dll</HintPath>
+    </Reference>
+    <Reference Include="nunit.framework, Version=2.5.2.9222, Culture=neutral, PublicKeyToken=96d09a1eb7f44a77, processorArchitecture=MSIL">
+      <SpecificVersion>False</SpecificVersion>
+      <HintPath>..\..\..\..\DotNet\Work for 2.9\src\Test\bin\Release\nunit.framework.dll</HintPath>
+    </Reference>
+    <Reference Include="System" />
+    <Reference Include="System.Core">
+      <RequiredTargetFramework>3.5</RequiredTargetFramework>
+    </Reference>
+    <Reference Include="System.Xml.Linq">
+      <RequiredTargetFramework>3.5</RequiredTargetFramework>
+    </Reference>
+    <Reference Include="System.Data.DataSetExtensions">
+      <RequiredTargetFramework>3.5</RequiredTargetFramework>
+    </Reference>
+    <Reference Include="System.Data" />
+    <Reference Include="System.Xml" />
+    <Reference Include="Test, Version=1.0.0.0, Culture=neutral, processorArchitecture=MSIL" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="BooleanFilterTest.cs" />
+    <Compile Include="BoostingQueryTest.cs" />
+    <Compile Include="DuplicateFilterTest.cs" />
+    <Compile Include="FuzzyLikeThisQueryTest.cs" />
+    <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="TermsFilterTest.cs" />
+    <Compile Include="Similar\TestMoreLikeThis.cs" />
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference Include="..\Queries.Net\Queries.Net.csproj">
+      <Project>{481CF6E3-52AF-4621-9DEB-022122079AF6}</Project>
+      <Name>Queries.Net</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file



Mime
View raw message