lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From aro...@apache.org
Subject svn commit: r516769 - in /incubator/lucene.net/trunk/C#/contrib: ./ Similarity.Net/ Similarity.Net/Similarity.Net/ Similarity.Net/Similarity.Net/Similar/
Date Sat, 10 Mar 2007 19:26:34 GMT
Author: aroush
Date: Sat Mar 10 11:26:32 2007
New Revision: 516769

URL: http://svn.apache.org/viewvc?view=rev&rev=516769
Log:
Added Similarity.Net

Added:
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj
    incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln
Modified:
    incubator/lucene.net/trunk/C#/contrib/README.txt

Modified: incubator/lucene.net/trunk/C#/contrib/README.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/README.txt?view=diff&rev=516769&r1=516768&r2=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/README.txt (original)
+++ incubator/lucene.net/trunk/C#/contrib/README.txt Sat Mar 10 11:26:32 2007
@@ -8,6 +8,7 @@
 Snowball.Net
 SpellChecker.Net
 WordNet.Net
+Similarity.Net
 
 
 Contributed code:

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/ABOUT.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/ABOUT.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1 @@
+Similarity.Net is a port of Java Similarity to C#.  The port from Java to C# of version 2.0.0 is done by George Aroush.  To contact George Aroush please visit http://www.aroush.net/

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/HISTORY.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/HISTORY.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1,6 @@
+Similarity.Net History
+----------------------
+
+
+10Mar07:
+	- Release: Similarity.Net 2.0.0 build 001

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/AssemblyInfo.cs Sat Mar 10 11:26:32 2007
@@ -0,0 +1,64 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+
+//
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+//
+[assembly: AssemblyTitle("Apache Lucene.Net (Similarity)")]
+[assembly: AssemblyDescription("The Apache Software Foundation Lucene.Net a full-text search engine library")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("The Apache Software Foundation")]
+[assembly: AssemblyProduct("Similarity.Net")]
+[assembly: AssemblyCopyright("Copyright 2007 The Apache Software Foundation")]
+[assembly: AssemblyTrademark("Copyright 2007 The Apache Software Foundation")]
+[assembly: AssemblyDefaultAlias("Lucene.Net.Similarity")]
+[assembly: AssemblyCulture("")]
+
+[assembly: AssemblyInformationalVersionAttribute("2.0")]
+
+
+// Version information for an assembly consists of the following four values:
+//
+//      Major Version
+//      Minor Version
+//      Revision
+//      Build Number
+//
+// You can specify all the values or you can default the Revision and Build Numbers
+// by using the '*' (for example "2.0.*"); all four values are given explicitly below:
+
+[assembly: AssemblyVersion("2.0.0.1")]
+
+//
+// In order to sign your assembly you must specify a key to use. Refer to the 
+// Microsoft .NET Framework documentation for more information on assembly signing.
+//
+// Use the attributes below to control which key is used for signing. 
+//
+// Notes: 
+//   (*) If no key is specified, the assembly is not signed.
+//   (*) KeyName refers to a key that has been installed in the Crypto Service
+//       Provider (CSP) on your machine. KeyFile refers to a file which contains
+//       a key.
+//   (*) If the KeyFile and the KeyName values are both specified, the 
+//       following processing occurs:
+//       (1) If the KeyName can be found in the CSP, that key is used.
+//       (2) If the KeyName does not exist and the KeyFile does exist, the key 
+//           in the KeyFile is installed into the CSP and used.
+//   (*) In order to create a KeyFile, you can use the sn.exe (Strong Name) utility.
+//       When specifying the KeyFile, the location of the KeyFile should be
+//       relative to the project output directory which is
+//       %Project Directory%\obj\<configuration>. For example, if your KeyFile is
+//       located in the project directory, you would specify the AssemblyKeyFile 
+//       attribute as [assembly: AssemblyKeyFile("..\..\mykey.snk")]
+//   (*) Delay Signing is an advanced option - see the Microsoft .NET Framework
+//       documentation for more information on this.
+//
+
+[assembly: AssemblyDelaySign(false)]
+[assembly: AssemblyKeyFile("")]
+[assembly: AssemblyKeyName("")]
+
+

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Build.xml?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Build.xml Sat Mar 10 11:26:32 2007
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+
+<project name="similarity" default="default">
+
+  <description>
+    Similarity - MoreLikeThis
+  </description>
+
+  <import file="../contrib-build.xml"/>
+</project>

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/HISTORY.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/HISTORY.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1,6 @@
+Similarity.Net History
+----------------------
+
+
+11Mar07:
+	- Release: Similarity.Net 2.0.0 build 001

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/README.txt?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/README.txt Sat Mar 10 11:26:32 2007
@@ -0,0 +1,3 @@
+Document similarity measures. 
+The most significant contribution here is MoreLikeThis,
+in /src/java/org/apache/lucene/search/similar.

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/MoreLikeThis.cs Sat Mar 10 11:26:32 2007
@@ -0,0 +1,975 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using PriorityQueue = Lucene.Net.Util.PriorityQueue;
+using IndexReader = Lucene.Net.Index.IndexReader;
+using Term = Lucene.Net.Index.Term;
+using TermFreqVector = Lucene.Net.Index.TermFreqVector;
+using BooleanClause = Lucene.Net.Search.BooleanClause;
+using DefaultSimilarity = Lucene.Net.Search.DefaultSimilarity;
+using TermQuery = Lucene.Net.Search.TermQuery;
+using BooleanQuery = Lucene.Net.Search.BooleanQuery;
+using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+using Query = Lucene.Net.Search.Query;
+using Hits = Lucene.Net.Search.Hits;
+using Analyzer = Lucene.Net.Analysis.Analyzer;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+using StandardAnalyzer = Lucene.Net.Analysis.Standard.StandardAnalyzer;
+using Document = Lucene.Net.Documents.Document;
+
+namespace Similarity.Net
+{
+	
+	
+    /// <summary> Generate "more like this" similarity queries. 
+    /// Based on this mail:
+    /// <code><pre>
+    /// Lucene does let you access the document frequency of terms, with IndexReader.DocFreq().
+    /// Term frequencies can be computed by re-tokenizing the text, which, for a single document,
+    /// is usually fast enough.  But looking up the DocFreq() of every term in the document is
+    /// probably too slow.
+    /// 
+    /// You can use some heuristics to prune the set of terms, to avoid calling DocFreq() too much,
+    /// or at all.  Since you're trying to maximize a tf*idf score, you're probably most interested
+    /// in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
+    /// reduce the number of terms under consideration.  Another heuristic is that terms with a
+    /// high idf (i.e., a low df) tend to be longer.  So you could threshold the terms by the
+    /// number of characters, not selecting anything less than, e.g., six or seven characters.
+    /// With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
+    /// that do a pretty good job of characterizing a document.
+    /// 
+    /// It all depends on what you're trying to do.  If you're trying to eek out that last percent
+    /// of precision and recall regardless of computational difficulty so that you can win a TREC
+    /// competition, then the techniques I mention above are useless.  But if you're trying to
+    /// provide a "more like this" button on a search results page that does a decent job and has
+    /// good performance, such techniques might be useful.
+    /// 
+    /// An efficient, effective "more-like-this" query generator would be a great contribution, if
+    /// anyone's interested.  I'd imagine that it would take a Reader or a String (the document's
+    /// text), analyzer Analyzer, and return a set of representative terms using heuristics like those
+    /// above.  The frequency and length thresholds could be parameters, etc.
+    /// 
+    /// Doug
+    /// </pre></code>
+    /// 
+    /// 
+    /// <p>
+    /// <h3>Initial Usage</h3>
+    /// 
+    /// This class has lots of options to try to make it efficient and flexible.
+    /// See the body of {@link #main Main()} below in the source for real code, or
+    /// if you want pseudo code, the simplest possible usage is as follows. The bold
+    /// fragment is specific to this class.
+    /// 
+    /// <code><pre>
+    /// 
+    /// IndexReader ir = ...
+    /// IndexSearcher is = ...
+    /// <b>
+    /// MoreLikeThis mlt = new MoreLikeThis(ir);
+    /// Reader target = ... </b><em>// orig source of doc you want to find similarities to</em><b>
+    /// Query query = mlt.Like( target);
+    /// </b>
+    /// Hits hits = is.Search(query);
+    /// <em>// now the usual iteration thru 'hits' - the only thing to watch for is to make sure
+    /// you ignore the doc if it matches your 'target' document, as it should be similar to itself </em>
+    /// 
+    /// </pre></code>
+    /// 
+    /// Thus you:
+    /// <ol>
+    /// <li> do your normal, Lucene setup for searching,
+    /// <li> create a MoreLikeThis,
+    /// <li> get the text of the doc you want to find similarities to
+    /// <li> then call one of the Like() calls to generate a similarity query
+    /// <li> call the searcher to find the similar docs
+    /// </ol>
+    /// 
+    /// <h3>More Advanced Usage</h3>
+    /// 
+    /// You may want to use {@link #SetFieldNames SetFieldNames(...)} so you can examine
+    /// multiple fields (e.g. body and title) for similarity.
+    /// <p>
+    /// 
+    /// Depending on the size of your index and the size and makeup of your documents you
+    /// may want to call the other set methods to control how the similarity queries are
+    /// generated:
+    /// <ul>
+    /// <li> {@link #SetMinTermFreq SetMinTermFreq(...)}
+    /// <li> {@link #SetMinDocFreq SetMinDocFreq(...)}
+    /// <li> {@link #SetMinWordLen SetMinWordLen(...)}
+    /// <li> {@link #SetMaxWordLen SetMaxWordLen(...)}
+    /// <li> {@link #SetMaxQueryTerms SetMaxQueryTerms(...)}
+    /// <li> {@link #SetMaxNumTokensParsed SetMaxNumTokensParsed(...)}
+    /// <li> {@link #SetStopWords SetStopWords(...)} 
+    /// </ul> 
+    /// 
+    /// <hr>
+    /// <pre>
+    /// Changes: Mark Harwood 29/02/04
+    /// Some bugfixing, some refactoring, some optimisation.
+    /// - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
+    /// - bugfix: No significant terms being created for fields with a termvector - because 
+    /// was only counting one occurrence per term/field pair in calculations (i.e. not including frequency info from TermVector) 
+    /// - refactor: moved common code into isNoiseWord()
+    /// - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
+    /// </pre>
+    /// 
+    /// </summary>
+    /// <author>  David Spencer
+    /// </author>
+    /// <author>  Bruce Ritchie
+    /// </author>
+    /// <author>  Mark Harwood
+    /// </author>
+    public sealed class MoreLikeThis
+    {
+		
+        /// <summary> Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.</summary>
+        /// <seealso cref="GetMaxNumTokensParsed">
+        /// </seealso>
+        public const int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
+		
+		
+        /// <summary> Default analyzer to parse source doc with.</summary>
+        /// <seealso cref="GetAnalyzer">
+        /// </seealso>
+        public static readonly Analyzer DEFAULT_ANALYZER = new StandardAnalyzer();
+		
+        /// <summary> Ignore terms with less than this frequency in the source doc.</summary>
+        /// <seealso cref="GetMinTermFreq">
+        /// </seealso>
+        /// <seealso cref="SetMinTermFreq">
+        /// </seealso>
+        public const int DEFAULT_MIN_TERM_FREQ = 2;
+		
+        /// <summary> Ignore words which do not occur in at least this many docs. NOTE: the identifier is spelled DEFALT (sic); it is public, so renaming it would break callers.</summary>
+        /// <seealso cref="GetMinDocFreq">
+        /// </seealso>
+        /// <seealso cref="SetMinDocFreq">
+        /// </seealso>
+        public const int DEFALT_MIN_DOC_FREQ = 5;
+		
+        /// <summary> Boost terms in query based on score.</summary>
+        /// <seealso cref="IsBoost">
+        /// </seealso>
+        /// <seealso cref="SetBoost">
+        /// </seealso>
+        public const bool DEFAULT_BOOST = false;
+		
+        /// <summary> Default field names (the single field "contents"). Null is used to specify that the
+        /// field names should be looked up at runtime from the provided reader.
+        /// </summary>
+        public static readonly System.String[] DEFAULT_FIELD_NAMES = new System.String[]{"contents"};
+		
+        /// <summary> Ignore words less than this length or if 0 then this has no effect.</summary>
+        /// <seealso cref="GetMinWordLen">
+        /// </seealso>
+        /// <seealso cref="SetMinWordLen">
+        /// </seealso>
+        public const int DEFAULT_MIN_WORD_LENGTH = 0;
+		
+        /// <summary> Ignore words greater than this length or if 0 then this has no effect.</summary>
+        /// <seealso cref="GetMaxWordLen">
+        /// </seealso>
+        /// <seealso cref="SetMaxWordLen">
+        /// </seealso>
+        public const int DEFAULT_MAX_WORD_LENGTH = 0;
+		
+        /// <summary> Default set of stopwords.
+        /// A null value means that stop words are allowed.
+        /// 
+        /// </summary>
+        /// <seealso cref="SetStopWords">
+        /// </seealso>
+        /// <seealso cref="GetStopWords">
+        /// </seealso>
+        public static readonly System.Collections.Hashtable DEFAULT_STOP_WORDS = null;
+		
+        /// <summary> Current set of stop words.</summary>
+        private System.Collections.Hashtable stopWords = DEFAULT_STOP_WORDS;
+		
+        /// <summary> Return a Query with no more than this many terms.
+        /// 
+        /// </summary>
+        /// <seealso cref="BooleanQuery#getMaxClauseCount">
+        /// </seealso>
+        /// <seealso cref="GetMaxQueryTerms">
+        /// </seealso>
+        /// <seealso cref="SetMaxQueryTerms">
+        /// </seealso>
+        public const int DEFAULT_MAX_QUERY_TERMS = 25;
+		
+        /// <summary> Analyzer that will be used to parse the doc.</summary>
+        private Analyzer analyzer = DEFAULT_ANALYZER;
+		
+        /// <summary> Ignore words less frequent than this.</summary>
+        private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
+		
+        /// <summary> Ignore words which do not occur in at least this many docs.</summary>
+        private int minDocFreq = DEFALT_MIN_DOC_FREQ;
+		
+        /// <summary> Should we apply a boost to the Query based on the scores?</summary>
+        private bool boost = DEFAULT_BOOST;
+		
+        /// <summary> Field names we'll analyze.</summary>
+        private System.String[] fieldNames = DEFAULT_FIELD_NAMES;
+		
+        /// <summary> The maximum number of tokens to parse in each example doc field that is not stored with TermVector support</summary>
+        private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
+		
+		
+		
+        /// <summary> Ignore words if less than this len.</summary>
+        private int minWordLen = DEFAULT_MIN_WORD_LENGTH;
+		
+        /// <summary> Ignore words if greater than this len.</summary>
+        private int maxWordLen = DEFAULT_MAX_WORD_LENGTH;
+		
+        /// <summary> Don't return a query longer than this.</summary>
+        private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS;
+		
+        /// <summary> For idf() calculations.</summary>
+        private Lucene.Net.Search.Similarity similarity = new DefaultSimilarity();
+		
+        /// <summary> IndexReader to use</summary>
+        private IndexReader ir;
+		
+        /// <summary> Constructor requiring an IndexReader.</summary>
+        public MoreLikeThis(IndexReader ir)
+        {
+            this.ir = ir;
+        }
+		
+        /// <summary> Gets the analyzer used to parse the source doc. Unless replaced through
+        /// <see cref="SetAnalyzer"/>, this is <see cref="DEFAULT_ANALYZER"/>.
+        /// 
+        /// </summary>
+        /// <returns> the analyzer currently configured on this instance.
+        /// </returns>
+        /// <seealso cref="DEFAULT_ANALYZER">
+        /// </seealso>
+        public Analyzer GetAnalyzer()
+        {
+            return this.analyzer;
+        }
+		
+        /// <summary> Sets the analyzer to use. An analyzer is not required for generating a query with the
+        /// <see cref="Like(int)"/> method, all other 'like' methods require an analyzer.
+        /// 
+        /// </summary>
+        /// <param name="analyzer">the analyzer to use to tokenize text.
+        /// </param>
+        public void  SetAnalyzer(Analyzer analyzer)
+        {
+            this.analyzer = analyzer;
+        }
+		
+        /// <summary> Gets the minimum term frequency: terms occurring less often than this in the
+        /// source doc are ignored. Defaults to <see cref="DEFAULT_MIN_TERM_FREQ"/>.
+        /// 
+        /// </summary>
+        /// <returns> the frequency below which terms will be ignored in the source doc.
+        /// </returns>
+        public int GetMinTermFreq()
+        {
+            return this.minTermFreq;
+        }
+		
+        /// <summary> Sets the frequency below which terms will be ignored in the source doc.
+        /// 
+        /// </summary>
+        /// <param name="minTermFreq">the frequency below which terms will be ignored in the source doc.
+        /// </param>
+        public void  SetMinTermFreq(int minTermFreq)
+        {
+            this.minTermFreq = minTermFreq;
+        }
+		
+        /// <summary> Gets the minimum document frequency: words that do not occur in at least this
+        /// many docs are ignored. Defaults to <see cref="DEFALT_MIN_DOC_FREQ"/>.
+        /// 
+        /// </summary>
+        /// <returns> the number of docs a word must occur in, below which the word is
+        /// ignored.
+        /// </returns>
+        public int GetMinDocFreq()
+        {
+            return this.minDocFreq;
+        }
+		
+        /// <summary> Sets the minimum document frequency: words which do not occur in at least this
+        /// many docs will be ignored.
+        /// 
+        /// </summary>
+        /// <param name="minDocFreq">the number of docs a word must occur in; words occurring in
+        /// fewer docs will be ignored.
+        /// </param>
+        public void  SetMinDocFreq(int minDocFreq)
+        {
+            this.minDocFreq = minDocFreq;
+        }
+		
+        /// <summary> Tells whether terms in the generated query are boosted based on "score".
+        /// Defaults to <see cref="DEFAULT_BOOST"/>.
+        /// 
+        /// </summary>
+        /// <returns> true if terms in the query are boosted based on "score", false otherwise.
+        /// </returns>
+        /// <seealso cref="SetBoost">
+        /// </seealso>
+        public bool IsBoost()
+        {
+            return this.boost;
+        }
+		
+        /// <summary> Sets whether to boost terms in query based on "score" or not.
+        /// 
+        /// </summary>
+        /// <param name="boost">true to boost terms in query based on "score", false otherwise.
+        /// </param>
+        /// <seealso cref="IsBoost">
+        /// </seealso>
+        public void  SetBoost(bool boost)
+        {
+            this.boost = boost;
+        }
+		
+        /// <summary> Gets the field names used when generating the 'More Like This' query.
+        /// Initially this is <see cref="DEFAULT_FIELD_NAMES"/>; the same array instance held by this object is returned.
+        /// 
+        /// </summary>
+        /// <returns> the field names that will be used when generating the 'More Like This' query.
+        /// </returns>
+        public System.String[] GetFieldNames()
+        {
+            return this.fieldNames;
+        }
+		
+        /// <summary> Sets the field names that will be used when generating the 'More Like This' query.
+        /// Set this to null for the field names to be determined at runtime from the IndexReader
+        /// provided in the constructor.
+        /// 
+        /// </summary>
+        /// <param name="fieldNames">the field names that will be used when generating the 'More Like This'
+        /// query, or null to have them looked up from the IndexReader.
+        /// </param>
+        public void  SetFieldNames(System.String[] fieldNames)
+        {
+            this.fieldNames = fieldNames;
+        }
+		
+        /// <summary> Gets the minimum word length; shorter words are ignored. A value of 0 means
+        /// no minimum word length. Defaults to <see cref="DEFAULT_MIN_WORD_LENGTH"/>.
+        /// 
+        /// </summary>
+        /// <returns> the minimum word length below which words will be ignored.
+        /// </returns>
+        public int GetMinWordLen()
+        {
+            return this.minWordLen;
+        }
+		
+        /// <summary> Sets the minimum word length below which words will be ignored.
+        /// 
+        /// </summary>
+        /// <param name="minWordLen">the minimum word length below which words will be ignored.
+        /// </param>
+        public void  SetMinWordLen(int minWordLen)
+        {
+            this.minWordLen = minWordLen;
+        }
+		
+        /// <summary> Gets the maximum word length; longer words are ignored. A value of 0 means
+        /// no maximum word length. Defaults to <see cref="DEFAULT_MAX_WORD_LENGTH"/>.
+        /// 
+        /// </summary>
+        /// <returns> the maximum word length above which words will be ignored.
+        /// </returns>
+        public int GetMaxWordLen()
+        {
+            return this.maxWordLen;
+        }
+		
+        /// <summary> Sets the maximum word length above which words will be ignored.
+        /// 
+        /// </summary>
+        /// <param name="maxWordLen">the maximum word length above which words will be ignored.
+        /// </param>
+        public void  SetMaxWordLen(int maxWordLen)
+        {
+            this.maxWordLen = maxWordLen;
+        }
+		
+        /// <summary> Set the set of stopwords.
+        /// Any word in this set is considered "uninteresting" and ignored.
+        /// Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as
+        /// for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
+        /// 
+        /// </summary>
+        /// <param name="stopWords">set of stopwords; null means that stop words are allowed
+        /// 
+        /// </param>
+        /// <seealso cref="StopFilter.makeStopSet()">
+        /// </seealso>
+        /// <seealso cref="GetStopWords">
+        /// </seealso>
+        public void  SetStopWords(System.Collections.Hashtable stopWords)
+        {
+            this.stopWords = stopWords;
+        }
+		
+        /// <summary> Gets the stop-word set currently in effect (null when none has been set).</summary>
+        /// <seealso cref="SetStopWords">
+        /// </seealso>
+        public System.Collections.Hashtable GetStopWords()
+        {
+            return this.stopWords;
+        }
+		
+		
+        /// <summary> Gets the upper bound on the number of query terms included in any generated query.
+        /// Defaults to <see cref="DEFAULT_MAX_QUERY_TERMS"/>.
+        /// 
+        /// </summary>
+        /// <returns> the maximum number of query terms that will be included in any generated query.
+        /// </returns>
+        public int GetMaxQueryTerms()
+        {
+            return this.maxQueryTerms;
+        }
+		
+        /// <summary> Sets the maximum number of query terms that will be included in any generated query.
+        /// 
+        /// </summary>
+        /// <param name="maxQueryTerms">the maximum number of query terms that will be included in any
+        /// generated query.
+        /// </param>
+        public void  SetMaxQueryTerms(int maxQueryTerms)
+        {
+            this.maxQueryTerms = maxQueryTerms;
+        }
+		
+        /// <summary> Gets the token limit applied to example doc fields that are not stored with TermVector support.</summary>
+        /// <returns> The maximum number of tokens to parse in each such field
+        /// </returns>
+        /// <seealso cref="DEFAULT_MAX_NUM_TOKENS_PARSED">
+        /// </seealso>
+        public int GetMaxNumTokensParsed()
+        {
+            return this.maxNumTokensParsed;
+        }
+		
+        /// <summary> Sets the token limit applied to example doc fields that are not stored with TermVector support.</summary>
+        /// <param name="i">The maximum number of tokens to parse in each such field
+        /// </param>
+        public void  SetMaxNumTokensParsed(int i)
+        {
+            this.maxNumTokensParsed = i;
+        }
+		
+		
+		
+		
+        /// <summary> Builds a query matching docs similar to an already-indexed lucene document.
+        /// When no field names are configured (null), the names of all indexed fields are read
+        /// from the IndexReader and stored on this instance before the query is built.
+        /// </summary>
+        /// <param name="docNum">the documentID of the lucene doc to generate the 'More Like This' query for.
+        /// </param>
+        /// <returns> a query that will return docs like the passed lucene document ID.
+        /// </returns>
+        public Query Like(int docNum)
+        {
+            if (fieldNames == null)
+            {
+                // No explicit fields configured: fall back to every indexed field the reader reports.
+                System.Collections.ICollection indexedFields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+                fieldNames = new System.String[indexedFields.Count];
+                int slot = 0;
+                foreach (System.Object fieldName in indexedFields)
+                {
+                    fieldNames[slot] = (System.String) fieldName;
+                    slot++;
+                }
+            }
+			
+            PriorityQueue terms = RetrieveTerms(docNum);
+            return CreateQuery(terms);
+        }
+		
+        /// <summary> Return a query that will return docs like the passed file.
+        /// The file is read using the system default encoding, and the reader opened on it is
+        /// closed before this method returns (previously the file handle was left open).
+        /// When no field names are configured (null), the names of all indexed fields are read
+        /// from the IndexReader and stored on this instance first.
+        /// </summary>
+        /// <param name="f">the file whose text the generated query should resemble.
+        /// </param>
+        /// <returns> a query that will return docs like the passed file.
+        /// </returns>
+        public Query Like(System.IO.FileInfo f)
+        {
+            if (fieldNames == null)
+            {
+                // gather list of valid fields from lucene
+                System.Collections.ICollection fields = ir.GetFieldNames(IndexReader.FieldOption.INDEXED);
+                System.Collections.IEnumerator e = fields.GetEnumerator();
+                fieldNames = new System.String[fields.Count];
+                int index = 0;
+                while (e.MoveNext())
+                {
+                    fieldNames[index++] = (System.String) e.Current;
+                }
+            }
+			
+            // This method opens the reader, so it also owns closing it; the query returned by
+            // Like(StreamReader) is built from extracted terms and does not need the reader afterwards.
+            using (System.IO.StreamReader reader = new System.IO.StreamReader(f.FullName, System.Text.Encoding.Default))
+            {
+                return Like(reader);
+            }
+        }
+		
+        /// <summary> Return a query that will return docs like the passed URL.
+        /// The URL is fetched, its response body is read using the system default encoding, and
+        /// both the response and the reader are closed before this method returns (previously
+        /// they were left open).
+        /// </summary>
+        /// <param name="u">the URL whose content the generated query should resemble. The request
+        /// created for it is cast to HttpWebRequest, as in the original code.
+        /// </param>
+        /// <returns> a query that will return docs like the passed URL.
+        /// </returns>
+        public Query Like(System.Uri u)
+        {
+            System.Net.HttpWebRequest request = (System.Net.HttpWebRequest) System.Net.WebRequest.Create(u);
+			
+            // The response and reader are created here, so they are released here; the query
+            // returned by Like(StreamReader) does not need them afterwards.
+            using (System.Net.WebResponse response = request.GetResponse())
+            using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
+            {
+                return Like(reader);
+            }
+        }
+		
+        /// <summary> Return a query that will return docs like the passed stream.
+        /// The stream is decoded with System.Text.Encoding.Default; this method does not close it.
+        /// </summary>
+        /// <returns> a query that will return docs like the passed stream.</returns>
+        public Query Like(System.IO.Stream is_Renamed)
+        {
+            System.IO.StreamReader decoded = new System.IO.StreamReader(is_Renamed, System.Text.Encoding.Default);
+            return Like(decoded);
+        }
+		
+        /// <summary> Return a query that will return docs like the text supplied by the passed Reader.
+        /// The text is scored by RetrieveTerms and the scored terms are turned into a query by CreateQuery.
+        /// </summary>
+        /// <returns> a query that will return docs like the passed Reader.</returns>
+        public Query Like(System.IO.StreamReader r)
+        {
+            PriorityQueue scoredTerms = RetrieveTerms(r);
+            return CreateQuery(scoredTerms);
+        }
+		
+        /// <summary> Build the query from queued entries; each entry is an array of [0] word, [1] field, [2] boxed float score.</summary>
+        private Query CreateQuery(PriorityQueue q)
+        {
+            BooleanQuery result = new BooleanQuery();
+            int added = 0;
+            float topScore = 0;
+			
+            for (System.Object popped = q.Pop(); popped != null; popped = q.Pop())
+            {
+                System.Object[] entry = (System.Object[]) popped;
+                TermQuery clause = new TermQuery(new Term((System.String) entry[1], (System.String) entry[0]));
+				
+                if (boost)
+                {
+                    float score = (System.Single) entry[2];
+                    if (added == 0)
+                    {
+                        topScore = score;
+                    }
+                    // boosts are expressed relative to the score of the first entry popped
+                    clause.SetBoost(score / topScore);
+                }
+				
+                try
+                {
+                    result.Add(clause, BooleanClause.Occur.SHOULD);
+                }
+                catch (BooleanQuery.TooManyClauses)
+                {
+                    // clause limit reached: stop and keep what has been added so far
+                    break;
+                }
+				
+                added++;
+                if (maxQueryTerms > 0 && added >= maxQueryTerms)
+                {
+                    break;
+                }
+            }
+			
+            return result;
+        }
+		
+        /// <summary> Score every collected word (tf * idf) and place it in a queue ordered by score.
+        /// Words below minTermFreq or minDocFreq, or with no document frequency at all, are left out.
+        /// </summary>
+        /// <param name="words">a map of words keyed on the word(String) with Int objects as the values.
+        /// </param>
+        private PriorityQueue CreateQueue(System.Collections.IDictionary words)
+        {
+            // all words of the source and their frequencies have been collected by the caller
+            int totalDocs = ir.NumDocs();
+            FreqQ queue = new FreqQ(words.Count); // sized for the case where nothing is filtered out
+			
+            foreach (System.Collections.DictionaryEntry pair in words)
+            {
+                System.String word = (System.String) pair.Key;
+                int tf = ((Int) pair.Value).x; // term freq in the source doc
+				
+                // filter out words that don't occur enough times in the source
+                if (minTermFreq > 0 && tf < minTermFreq)
+                {
+                    continue;
+                }
+				
+                // pick the field in which the word has the largest document frequency
+                System.String bestField = fieldNames[0];
+                int bestDocFreq = 0;
+                foreach (System.String candidate in fieldNames)
+                {
+                    int freq = ir.DocFreq(new Term(candidate, word));
+                    if (freq > bestDocFreq)
+                    {
+                        bestField = candidate;
+                        bestDocFreq = freq;
+                    }
+                }
+				
+                // filter out words that occur in too few docs, or in none (index update problem?)
+                if ((minDocFreq > 0 && bestDocFreq < minDocFreq) || bestDocFreq == 0)
+                {
+                    continue;
+                }
+				
+                float idf = similarity.Idf(bestDocFreq, totalDocs);
+                float score = tf * idf;
+				
+                // only really need 1st 3 entries, other ones are for troubleshooting
+                System.Object[] entry = new System.Object[]{word, bestField, score, idf, bestDocFreq, tf};
+                queue.Insert(entry);
+            }
+			
+            return queue;
+        }
+		
+        /// <summary> Describe the parameters that control how the "more like this" query is formed.
+        /// Field names are listed between quotes; the list is empty while fieldNames is null.
+        /// </summary>
+        public System.String DescribeParams()
+        {
+            System.Text.StringBuilder sb = new System.Text.StringBuilder();
+            sb.Append("\t" + "maxQueryTerms  : " + maxQueryTerms + "\n");
+            sb.Append("\t" + "minWordLen     : " + minWordLen + "\n");
+            sb.Append("\t" + "maxWordLen     : " + maxWordLen + "\n");
+            sb.Append("\t" + "fieldNames     : \"");
+            // fieldNames may be null (Like() treats null as "not chosen yet"), so guard before joining
+            if (fieldNames != null)
+            {
+                sb.Append(System.String.Join(", ", fieldNames));
+            }
+            sb.Append("\"\n"); // close the quote opened above
+            sb.Append("\t" + "boost          : " + boost + "\n");
+            sb.Append("\t" + "minTermFreq    : " + minTermFreq + "\n");
+            sb.Append("\t" + "minDocFreq     : " + minDocFreq + "\n");
+            return sb.ToString();
+        }
+		
+        /// <summary> Test driver.
+        /// Pass in "-i INDEX" and then either "-f FILE" (also accepted as "-fn FILE") or "-url URL".
+        /// </summary>
+        [STAThread]
+        public static void  Main(System.String[] a)
+        {
+            System.String indexName = "localhost_index";
+            System.String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en";
+            System.Uri url = null;
+            for (int i = 0; i < a.Length - 1; i++) // every option needs a value after it, so stop one short
+            {
+                if (a[i].Equals("-i"))
+                {
+                    indexName = a[++i];
+                }
+                else if (a[i].Equals("-f") || a[i].Equals("-fn"))
+                {
+                    fn = a[++i];
+                }
+                else if (a[i].Equals("-url"))
+                {
+                    url = new System.Uri(a[++i]);
+                }
+            }
+			
+            System.IO.StreamWriter o = new System.IO.StreamWriter(System.Console.OpenStandardOutput(), System.Console.Out.Encoding);
+            o.AutoFlush = true;
+            IndexReader r = IndexReader.Open(indexName);
+            o.WriteLine("Open index " + indexName + " which has " + r.NumDocs() + " docs");
+			
+            MoreLikeThis mlt = new MoreLikeThis(r);
+			
+            o.WriteLine("Query generation parameters:");
+            o.WriteLine(mlt.DescribeParams());
+            o.WriteLine();
+			
+            Query query = null;
+            if (url != null)
+            {
+                o.WriteLine("Parsing URL: " + url);
+                query = mlt.Like(url);
+            }
+            else if (fn != null)
+            {
+                o.WriteLine("Parsing file: " + fn);
+                query = mlt.Like(new System.IO.FileInfo(fn));
+            }
+			
+            o.WriteLine("q: " + query);
+            o.WriteLine();
+            IndexSearcher searcher = new IndexSearcher(indexName);
+            Hits hits = searcher.Search(query);
+            int len = hits.Length();
+            o.WriteLine("found: " + len + " documents matching");
+            o.WriteLine();
+            for (int i = 0; i < System.Math.Min(25, len); i++)
+            {
+                Document d = hits.Doc(i);
+                System.String summary = d.Get("summary");
+                o.WriteLine("score  : " + hits.Score(i));
+                o.WriteLine("url    : " + d.Get("url"));
+                o.WriteLine("\ttitle  : " + d.Get("title"));
+                if (summary != null)
+                    o.WriteLine("\tsummary: " + summary);
+                o.WriteLine();
+            }
+            // release the index before exiting
+            searcher.Close();
+            r.Close();
+        }
+		
+        /// <summary> Find words for a more-like-this query former.
+        /// Fields that have a term vector contribute its terms directly; for the other fields the
+        /// stored values of the document are run through the analyzer again.
+        /// </summary>
+        /// <param name="docNum">the id of the lucene document from which to find terms
+        /// </param>
+        private PriorityQueue RetrieveTerms(int docNum)
+        {
+            System.Collections.IDictionary termFreqMap = new System.Collections.Hashtable();
+            System.Text.Encoding utf8 = new System.Text.UTF8Encoding(false);
+            for (int i = 0; i < fieldNames.Length; i++)
+            {
+                System.String fieldName = fieldNames[i];
+                TermFreqVector vector = ir.GetTermFreqVector(docNum, fieldName);
+                if (vector != null)
+                {
+                    AddTermFrequencies(termFreqMap, vector);
+                    continue;
+                }
+				
+                // field does not store term vector info
+                Document d = ir.Document(docNum);
+                System.String[] text = d.GetValues(fieldName);
+                for (int j = 0; text != null && j < text.Length; j++)
+                {
+                    // StreamReader(String) would open a FILE with that name, so the stored text
+                    // itself is fed to the analyzer through an in-memory stream instead.
+                    System.IO.Stream bytes = new System.IO.MemoryStream(utf8.GetBytes(text[j]));
+                    AddTermFrequencies(new System.IO.StreamReader(bytes, utf8, false), termFreqMap, fieldName);
+                }
+            }
+			
+            return CreateQueue(termFreqMap);
+        }
+		
+        /// <summary> Merge the terms and frequencies of one term vector into termFreqMap,
+        /// skipping noise words and summing the counts of terms that are already present.</summary>
+        /// <param name="termFreqMap">a Map of terms and their frequencies
+        /// </param>
+        /// <param name="vector">List of terms and their frequencies for a doc/field
+        /// </param>
+        private void  AddTermFrequencies(System.Collections.IDictionary termFreqMap, TermFreqVector vector)
+        {
+            System.String[] vectorTerms = vector.GetTerms();
+            int[] vectorFreqs = vector.GetTermFrequencies();
+            for (int pos = 0; pos < vectorTerms.Length; pos++)
+            {
+                System.String term = vectorTerms[pos];
+                if (!IsNoiseWord(term))
+                {
+                    Int tally = (Int) termFreqMap[term];
+                    if (tally != null)
+                    {
+                        // already in the map: add this vector's count to the running total
+                        tally.x += vectorFreqs[pos];
+                    }
+                    else
+                    {
+                        // first sighting: register the term, then overwrite the initial count
+                        tally = new Int();
+                        termFreqMap[term] = tally;
+                        tally.x = vectorFreqs[pos];
+                    }
+                }
+            }
+        }
+        /// <summary> Tokenize the text from the reader with the configured analyzer and count each
+        /// non-noise token in termFreqMap. Scanning stops once more than maxNumTokensParsed tokens were read.</summary>
+        /// <param name="r">a source of text to be tokenized
+        /// </param>
+        /// <param name="termFreqMap">a Map of terms and their frequencies
+        /// </param>
+        /// <param name="fieldName">Used by analyzer for any special per-field analysis
+        /// </param>
+        private void  AddTermFrequencies(System.IO.StreamReader r, System.Collections.IDictionary termFreqMap, System.String fieldName)
+        {
+            TokenStream stream = analyzer.TokenStream(fieldName, r);
+            int seen = 0;
+            for (Lucene.Net.Analysis.Token tok = stream.Next(); tok != null; tok = stream.Next())
+            {
+                System.String word = tok.TermText();
+                seen++;
+                // the token that crosses the limit is discarded and ends the scan
+                if (seen > maxNumTokensParsed)
+                {
+                    break;
+                }
+                if (IsNoiseWord(word))
+                {
+                    continue;
+                }
+				
+                Int tally = (Int) termFreqMap[word];
+                if (tally != null)
+                {
+                    tally.x++;
+                }
+                else
+                {
+                    // a new Int starts at 1, which already accounts for this occurrence
+                    termFreqMap[word] = new Int();
+                }
+            }
+        }
+		
+		
+        /// <summary> Decide whether a term should be left out of "more like" comparisons because it is
+        /// shorter than minWordLen, longer than maxWordLen, or contained in stopWords.
+        /// A length limit of zero or less switches the corresponding check off.</summary>
+        /// <param name="term">The word being considered
+        /// </param>
+        /// <returns> true if should be ignored, false if should be used in further analysis
+        /// </returns>
+        private bool IsNoiseWord(System.String term)
+        {
+            int length = term.Length;
+            bool tooShort = minWordLen > 0 && length < minWordLen;
+            if (tooShort)
+            {
+                return true;
+            }
+			
+            bool tooLong = maxWordLen > 0 && length > maxWordLen;
+            if (tooLong)
+            {
+                return true;
+            }
+			
+            return stopWords != null && stopWords.Contains(term);
+        }
+		
+		
+        /// <summary> Find words for a more-like-this query former.
+        /// The result is a priority queue of arrays with one entry for each word that passes the frequency filters.
+        /// Each array has 6 elements.
+        /// The elements are:
+        /// <ol>
+        /// <li> The word (String) </li>
+        /// <li> The top field that this word comes from (String) </li>
+        /// <li> The score for this word (Float) </li>
+        /// <li> The IDF value (Float) </li>
+        /// <li> The frequency of this word in the index (Integer) </li>
+        /// <li> The frequency of this word in the source document (Integer) </li>
+        /// </ol>
+        /// This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest.
+        /// This method is exposed so that you can identify the "interesting words" in a document.
+        /// For an easier method to call see <see cref="RetrieveInterestingTerms"/>.
+        /// 
+        /// </summary>
+        /// <param name="r">the reader that has the content of the document
+        /// </param>
+        /// <returns> the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
+        /// 
+        /// </returns>
+        /// <seealso cref="RetrieveInterestingTerms">
+        /// </seealso>
+        public PriorityQueue RetrieveTerms(System.IO.StreamReader r)
+        {
+            System.Collections.IDictionary counts = new System.Collections.Hashtable();
+            foreach (System.String fieldName in fieldNames)
+            {
+                // NOTE(review): the same reader is handed to every field; presumably only the first field still finds text to read - confirm
+                AddTermFrequencies(r, counts, fieldName);
+            }
+            return CreateQueue(counts);
+        }
+		
+        /// <summary> Convenience routine to make it easy to return the most interesting words in a document.
+        /// At most maxQueryTerms words are returned; as in CreateQuery, a value of zero or less means no limit.
+        /// More advanced users will call RetrieveTerms(System.IO.StreamReader) directly.
+        /// </summary>
+        /// <param name="r">the source document
+        /// </param>
+        /// <returns> the most interesting words in the document, best first
+        /// </returns>
+        /// <seealso cref="RetrieveTerms(System.IO.StreamReader)">
+        /// </seealso>
+        public System.String[] RetrieveInterestingTerms(System.IO.StreamReader r)
+        {
+            // RetrieveTerms returns every scored word, which is usually more than the caller wants
+            PriorityQueue pq = RetrieveTerms(r);
+            bool limited = maxQueryTerms > 0;
+            System.Collections.ArrayList al = new System.Collections.ArrayList();
+            while (!limited || al.Count < maxQueryTerms)
+            {
+                System.Object[] ar = (System.Object[]) pq.Pop();
+                if (ar == null)
+                {
+                    break; // queue exhausted
+                }
+                al.Add(ar[0]); // the 1st entry is the interesting word
+            }
+			
+            return (System.String[]) al.ToArray(typeof(System.String));
+        }
+		
+        /// <summary> PriorityQueue that orders words by score.
+        /// The comparison is inverted: the entry with the higher score is reported as "less".</summary>
+        private class FreqQ : PriorityQueue
+        {
+            internal FreqQ(int s)
+            {
+                Initialize(s);
+            }
+			
+            public override bool LessThan(System.Object a, System.Object b)
+            {
+                System.Object[] left = (System.Object[]) a;
+                System.Object[] right = (System.Object[]) b;
+                // slot 2 of each entry holds the boxed float score
+                return (System.Single) left[2] > (System.Single) right[2];
+            }
+        }
+		
+        /// <summary> Mutable counter used for frequencies, so that a count can be
+        /// changed in place instead of storing a new boxed integer each time.</summary>
+        private class Int
+        {
+            internal int x = 1; // a fresh counter starts at one
+			
+            internal Int()
+            {
+            }
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similar/Package.html?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/Package.html Sat Mar 10 11:26:32 2007
@@ -0,0 +1,5 @@
+<html>
+<body>
+Document similarity query generators.
+</body>
+</html>
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similar/SimilarityQueries.cs Sat Mar 10 11:26:32 2007
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using Analyzer = Lucene.Net.Analysis.Analyzer;
+using TokenStream = Lucene.Net.Analysis.TokenStream;
+using Term = Lucene.Net.Index.Term;
+using BooleanQuery = Lucene.Net.Search.BooleanQuery;
+using IndexSearcher = Lucene.Net.Search.IndexSearcher;
+using Query = Lucene.Net.Search.Query;
+using TermQuery = Lucene.Net.Search.TermQuery;
+using BooleanClause = Lucene.Net.Search.BooleanClause;
+
+namespace Similarity.Net
+{
+	
+    /// <summary> Simple similarity measures.
+    /// This class only holds static helpers and cannot be instantiated.
+    /// 
+    /// </summary>
+    /// <seealso cref="MoreLikeThis">
+    /// </seealso>
+    public sealed class SimilarityQueries
+    {
+        /// <summary> Not instantiable; all members are static.</summary>
+        private SimilarityQueries()
+        {
+        }
+		
+        /// <summary> Simple similarity query generators.
+        /// Takes every unique word and forms a boolean query where all words are optional.
+        /// After you get this you'll use it to query your IndexSearcher for similar docs.
+        /// The only caveat is the first hit returned <b>should be</b> your source document - you'll
+        /// need to then ignore that.
+        /// 
+        /// <p>
+        /// So, if you have a code fragment like this:
+        /// <br/>
+        /// <code>
+        /// Query q = FormSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
+        /// </code>
+        /// </p>
+        /// 
+        /// <p>
+        /// The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
+        /// </p>
+        /// 
+        /// <p>
+        /// The philosophy behind this method is "two documents are similar if they share lots of words".
+        /// Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
+        /// </p>
+        /// 
+        /// <p>
+        /// This method is fail-safe in that if a long 'body' is passed in and
+        /// BooleanQuery.Add() (used internally)
+        /// throws
+        /// BooleanQuery.TooManyClauses, the
+        /// query as it is will be returned.
+        /// </p>
+        /// 
+        /// </summary>
+        /// <param name="body">the body of the document you want to find similar documents to
+        /// </param>
+        /// <param name="a">the analyzer to use to parse the body
+        /// </param>
+        /// <param name="field">the field you want to search on, probably something like "contents" or "body"
+        /// </param>
+        /// <param name="stop">optional set of stop words to ignore
+        /// </param>
+        /// <returns> a query with all unique words in 'body'
+        /// </returns>
+        /// <remarks> Duplicates are detected on the analyzed token text, so the analyzer decides what counts as the same word.
+        /// </remarks>
+        public static Query FormSimilarQuery(System.String body, Analyzer a, System.String field, System.Collections.Hashtable stop)
+        {
+            TokenStream ts = a.TokenStream(field, new System.IO.StringReader(body));
+            BooleanQuery tmp = new BooleanQuery();
+            System.Collections.Hashtable already = new System.Collections.Hashtable(); // words added so far
+            Lucene.Net.Analysis.Token t;
+            while ((t = ts.Next()) != null)
+            {
+                System.String word = t.TermText();
+                // ignore optional stop words
+                if (stop != null && stop.Contains(word))
+                {
+                    continue;
+                }
+                // ignore dups
+                if (already.Contains(word))
+                {
+                    continue;
+                }
+                already.Add(word, word);
+                try
+                {
+                    tmp.Add(new TermQuery(new Term(field, word)), BooleanClause.Occur.SHOULD);
+                }
+                catch (BooleanQuery.TooManyClauses)
+                {
+                    // fail-safe, just return what we have, not the end of the world
+                    break;
+                }
+            }
+            return tmp;
+        }
+    }
+}
\ No newline at end of file

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/Similarity.Net-2.0.0.csproj Sat Mar 10 11:26:32 2007
@@ -0,0 +1,139 @@
+<VisualStudioProject>
+    <CSHARP
+        ProjectType = "Local"
+        ProductVersion = "7.10.3077"
+        SchemaVersion = "2.0"
+        ProjectGuid = "{B9C12C92-199A-426E-B892-3531E2600217}"
+    >
+        <Build>
+            <Settings
+                ApplicationIcon = ""
+                AssemblyKeyContainerName = ""
+                AssemblyName = "SimilarityNet"
+                AssemblyOriginatorKeyFile = ""
+                DefaultClientScript = "JScript"
+                DefaultHTMLPageLayout = "Grid"
+                DefaultTargetSchema = "IE50"
+                DelaySign = "false"
+                OutputType = "Library"
+                PreBuildEvent = ""
+                PostBuildEvent = ""
+                RootNamespace = "SimilarityNet"
+                RunPostBuildEvent = "OnBuildSuccess"
+                StartupObject = ""
+            >
+                <Config
+                    Name = "Debug"
+                    AllowUnsafeBlocks = "false"
+                    BaseAddress = "285212672"
+                    CheckForOverflowUnderflow = "false"
+                    ConfigurationOverrideFile = ""
+                    DefineConstants = ""
+                    DocumentationFile = ""
+                    DebugSymbols = "true"
+                    FileAlignment = "4096"
+                    IncrementalBuild = "true"
+                    NoStdLib = "false"
+                    NoWarn = ""
+                    Optimize = "false"
+                    OutputPath = ".\"
+                    RegisterForComInterop = "false"
+                    RemoveIntegerChecks = "false"
+                    TreatWarningsAsErrors = "false"
+                    WarningLevel = "4"
+                />
+                <Config
+                    Name = "Release"
+                    AllowUnsafeBlocks = "false"
+                    BaseAddress = "285212672"
+                    CheckForOverflowUnderflow = "false"
+                    ConfigurationOverrideFile = ""
+                    DefineConstants = ""
+                    DocumentationFile = ""
+                    DebugSymbols = "true"
+                    FileAlignment = "4096"
+                    IncrementalBuild = "true"
+                    NoStdLib = "false"
+                    NoWarn = ""
+                    Optimize = "false"
+                    OutputPath = ".\"
+                    RegisterForComInterop = "false"
+                    RemoveIntegerChecks = "false"
+                    TreatWarningsAsErrors = "false"
+                    WarningLevel = "4"
+                />
+            </Settings>
+            <References>
+                <Reference
+                    Name = "System"
+                    AssemblyName = "System"
+                />
+                <Reference
+                    Name = "System.Data"
+                    AssemblyName = "System.Data"
+                />
+                <Reference
+                    Name = "System.Drawing"
+                    AssemblyName = "System.Drawing"
+                />
+                <Reference
+                    Name = "System.Management"
+                    AssemblyName = "System.Management"
+                />
+                <Reference
+                    Name = "System.Windows.Forms"
+                    AssemblyName = "System.Windows.Forms"
+                />
+                <Reference
+                    Name = "System.Design"
+                    AssemblyName = "System.Design"
+                />
+                <Reference
+                    Name = "System.Xml"
+                    AssemblyName = "System.Xml"
+                />
+                <Reference
+                    Name = "mscorlib"
+                    AssemblyName = "mscorlib"
+                />
+                <Reference
+                    Name = "Lucene.Net"
+                    AssemblyName = "Lucene.Net"
+                    HintPath = "..\..\Lucene.Net-2.0.1-001.src\Lucene.Net\bin\Debug\Lucene.Net.dll"
+                />
+            </References>
+        </Build>
+        <Files>
+            <Include>
+                <File
+                    RelPath = "AssemblyInfo.cs"
+                    SubType = "Code"
+                    BuildAction = "Compile"
+                />
+                <File
+                    RelPath = "Build.xml"
+                    BuildAction = "Content"
+                />
+                <File
+                    RelPath = "README.txt"
+                    BuildAction = "Content"
+                />
+                <File
+                    RelPath = "Similar\MoreLikeThis.cs"
+                    SubType = "Code"
+                    BuildAction = "Compile"
+                />
+                <File
+                    RelPath = "Similar\Package.html"
+                    BuildAction = "Content"
+                />
+                <File
+                    RelPath = "Similar\SimilarityQueries.cs"
+                    SubType = "Code"
+                    BuildAction = "Compile"
+                />
+            </Include>
+        </Files>
+    </CSHARP>
+</VisualStudioProject>
+

Added: incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln?view=auto&rev=516769
==============================================================================
--- incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln (added)
+++ incubator/lucene.net/trunk/C#/contrib/Similarity.Net/Similarity.Net/SimilarityNet.sln Sat Mar 10 11:26:32 2007
@@ -0,0 +1,24 @@
+Microsoft Visual Studio Solution File, Format Version 8.00
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Similarity.Net-2.0.0", "Similarity.Net-2.0.0.csproj", "{B9C12C92-199A-426E-B892-3531E2600217}"
+	ProjectSection(ProjectDependencies) = postProject
+	EndProjectSection
+EndProject
+Global
+	GlobalSection(DPCodeReviewSolutionGUID) = preSolution
+		DPCodeReviewSolutionGUID = {00000000-0000-0000-0000-000000000000}
+	EndGlobalSection
+	GlobalSection(SolutionConfiguration) = preSolution
+		Debug = Debug
+		Release = Release
+	EndGlobalSection
+	GlobalSection(ProjectConfiguration) = postSolution
+		{B9C12C92-199A-426E-B892-3531E2600217}.Debug.ActiveCfg = Debug|.NET
+		{B9C12C92-199A-426E-B892-3531E2600217}.Debug.Build.0 = Debug|.NET
+		{B9C12C92-199A-426E-B892-3531E2600217}.Release.ActiveCfg = Release|.NET
+		{B9C12C92-199A-426E-B892-3531E2600217}.Release.Build.0 = Release|.NET
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+	EndGlobalSection
+	GlobalSection(ExtensibilityAddIns) = postSolution
+	EndGlobalSection
+EndGlobal



Mime
View raw message