lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "denis.zhdanov" <denzhda...@gmail.com>
Subject SynonymFilter benefit over explicit field composition
Date Mon, 09 Sep 2013 06:05:50 GMT
Hello, I recently started using Lucene and have been checking its built-in synonym
processing facilities. So, the main question so far is: what is the benefit
of using /SynonymFilter/ over explicitly adding synonyms as document
fields? The former has an obvious drawback: it doesn't support transitive
relations. Consider the simple example below - registering the pairs (/"first"/,
/"second"/) and (/"first"/, /"third"/) as synonyms; indexing /"second"/;
searching for /"third"/; no match:

        
package com.my.social.search.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.io.Reader;

/**
 * Demonstrates that two synonym pairs sharing the same input term are not
 * treated as one transitive group.
 *
 * <p>The pairs ("first", "second") and ("first", "third") are registered, a
 * single document with the "tag" value "second" is indexed, and the index is
 * then queried for "third". The accompanying message reports that this query
 * finds no match.
 *
 * @author Denis Zhdanov
 * @since 9/5/13 12:10 AM
 */
public class LuceneTest {

    /**
     * Builds the synonym map, indexes one document into an in-memory
     * directory, runs the query and prints the stored "tag" value of every
     * hit (at most ten).
     *
     * @param args ignored
     * @throws IOException    if building the synonym map, indexing or searching fails
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        RAMDirectory directory = new RAMDirectory();

        // Two pairs that share the input term "first".
        SynonymMap.Builder synonymBuilder = new SynonymMap.Builder(true);
        synonymBuilder.add(new CharsRef("first"), new CharsRef("second"), true);
        synonymBuilder.add(new CharsRef("first"), new CharsRef("third"), true);
        MyAnalyzer analyzer = new MyAnalyzer(synonymBuilder.build());

        // The same analyzer instance is used for indexing and for query parsing.
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
        try (IndexWriter indexWriter = new IndexWriter(directory, writerConfig)) {
            indexTag(indexWriter, "second");
        }

        DirectoryReader indexReader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        Query query = new QueryParser(Version.LUCENE_44, "tag", analyzer).parse("third");

        ScoreDoc[] matches = searcher.search(query, null, 10).scoreDocs;
        for (int i = 0; i < matches.length; i++) {
            Document hit = searcher.doc(matches[i].doc);
            System.out.println(hit.get("tag"));
            // For debugging, searcher.explain(query, matches[i].doc) can be printed here.
        }
    }

    /**
     * Adds one document whose only field is a stored "tag" text field.
     *
     * @param indexWriter writer the document is added to
     * @param tagValue    text of the "tag" field
     * @throws IOException if the writer fails to add the document
     */
    private static void indexTag(IndexWriter indexWriter, String tagValue) throws IOException {
        Document entry = new Document();
        entry.add(new TextField("tag", tagValue, Field.Store.YES));
        indexWriter.addDocument(entry);
    }

    /**
     * Analyzer whose chain is, in order: standard tokenizer, standard filter,
     * English possessive filter, lower-case filter, synonym filter, Porter
     * stem filter.
     */
    private static class MyAnalyzer extends StopwordAnalyzerBase {

        private final SynonymMap synonyms;

        /**
         * @param synonyms synonym map handed to the {@link SynonymFilter} stage
         */
        MyAnalyzer(@NotNull SynonymMap synonyms) {
            super(Version.LUCENE_44);
            this.synonyms = synonyms;
        }

        /**
         * Assembles the tokenizer and the filter chain for a field.
         *
         * @param fieldName name of the field being analyzed (not used here)
         * @param reader    source of the text to tokenize
         * @return components made of the tokenizer and the last filter of the chain
         */
        @Override
        protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
            final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            final TokenStream standard = new StandardFilter(matchVersion, tokenizer);
            final TokenStream withoutPossessives = new EnglishPossessiveFilter(matchVersion, standard);
            final TokenStream lowerCased = new LowerCaseFilter(matchVersion, withoutPossessives);
            // Synonyms are applied before stemming, exactly as in the posted chain.
            final TokenStream withSynonyms = new SynonymFilter(lowerCased, synonyms, true);
            final TokenStream stemmed = new PorterStemFilter(withSynonyms);
            return new Analyzer.TokenStreamComponents(tokenizer, stemmed);
        }
    }
}
    

That means that I need to explicitly register all possible pairs from a set
of synonyms to get the SynonymFilter-based approach to work (I have a large set of
English synonyms (built from the Gutenberg dictionary) where every synonym
group contains more than two words). The only possible benefit of using
/SynonymFilter/ that I see so far is phrase search, where synonym position matters:

        
package com.my.social.search.lucene;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.io.Reader;

/**
 * Phrase-search variant of the synonym experiment.
 *
 * <p>A single pair ("first", "second") is registered, the documents
 * "second point or number" and "first number dummy" are indexed, and the
 * index is queried with the phrase "second number". The accompanying message
 * presents this as the case where synonym position matters.
 *
 * @author Denis Zhdanov
 * @since 9/5/13 12:10 AM
 */
public class LuceneTest {

    /**
     * Builds the synonym map, indexes two documents into an in-memory
     * directory, runs the phrase query and prints the stored "tag" value of
     * every hit (at most ten).
     *
     * @param args ignored
     * @throws IOException    if building the synonym map, indexing or searching fails
     * @throws ParseException if the query string cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        RAMDirectory directory = new RAMDirectory();

        SynonymMap.Builder synonymBuilder = new SynonymMap.Builder(true);
        synonymBuilder.add(new CharsRef("first"), new CharsRef("second"), true);
        MyAnalyzer analyzer = new MyAnalyzer(synonymBuilder.build());

        // The same analyzer instance is used for indexing and for query parsing.
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_44, analyzer);
        try (IndexWriter indexWriter = new IndexWriter(directory, writerConfig)) {
            indexTag(indexWriter, "second point or number");
            indexTag(indexWriter, "first number dummy");
        }

        DirectoryReader indexReader = DirectoryReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        // The embedded quotes make the query parser treat the text as a phrase.
        Query query = new QueryParser(Version.LUCENE_44, "tag", analyzer).parse("\"second number\"");

        ScoreDoc[] matches = searcher.search(query, null, 10).scoreDocs;
        for (int i = 0; i < matches.length; i++) {
            Document hit = searcher.doc(matches[i].doc);
            System.out.println(hit.get("tag"));
            // For debugging, searcher.explain(query, matches[i].doc) can be printed here.
        }
    }

    /**
     * Adds one document whose only field is a stored "tag" text field.
     *
     * @param indexWriter writer the document is added to
     * @param tagValue    text of the "tag" field
     * @throws IOException if the writer fails to add the document
     */
    private static void indexTag(IndexWriter indexWriter, String tagValue) throws IOException {
        Document entry = new Document();
        entry.add(new TextField("tag", tagValue, Field.Store.YES));
        indexWriter.addDocument(entry);
    }

    /**
     * Analyzer whose chain is, in order: standard tokenizer, standard filter,
     * English possessive filter, lower-case filter, synonym filter, Porter
     * stem filter.
     */
    private static class MyAnalyzer extends StopwordAnalyzerBase {

        private final SynonymMap synonyms;

        /**
         * @param synonyms synonym map handed to the {@link SynonymFilter} stage
         */
        MyAnalyzer(@NotNull SynonymMap synonyms) {
            super(Version.LUCENE_44);
            this.synonyms = synonyms;
        }

        /**
         * Assembles the tokenizer and the filter chain for a field.
         *
         * @param fieldName name of the field being analyzed (not used here)
         * @param reader    source of the text to tokenize
         * @return components made of the tokenizer and the last filter of the chain
         */
        @Override
        protected Analyzer.TokenStreamComponents createComponents(String fieldName, Reader reader) {
            final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
            final TokenStream standard = new StandardFilter(matchVersion, tokenizer);
            final TokenStream withoutPossessives = new EnglishPossessiveFilter(matchVersion, standard);
            final TokenStream lowerCased = new LowerCaseFilter(matchVersion, withoutPossessives);
            // Synonyms are applied before stemming, exactly as in the posted chain.
            final TokenStream withSynonyms = new SynonymFilter(lowerCased, synonyms, true);
            final TokenStream stemmed = new PorterStemFilter(withSynonyms);
            return new Analyzer.TokenStreamComponents(tokenizer, stemmed);
        }
    }
}
    

I googled to find out whether my understanding is correct, but
unfortunately that didn't turn up any results. That's why I decided to ask the
community before digging into the Lucene sources.



--
View this message in context: http://lucene.472066.n3.nabble.com/SynonymFilter-benefit-over-explicit-field-composition-tp4088819.html
Sent from the Lucene - Java Users mailing list archive at Nabble.com.
Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message