lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Michael Sokolov <msoko...@safaribooksonline.com>
Subject Re: Payload and Similarity Function: Always same value
Date Thu, 30 Oct 2014 19:53:56 GMT
That's a lot of code to eyeball.  Have you tried printing out the input 
data as you are indexing it (just at doc.add)?  I am guessing there is 
some simple variable aliasing issue that I don't see at a glance ...

-Mike

On 10/30/14 2:03 PM, Ralf Bierig wrote:
> I want to implement a Lucene Indexer/Searcher that uses the new 
> Payload feature to add meta information to tokens. I specifically add 
> weights (i.e. 0-100) to conceptual tags in order to use them to 
> overwrite the standard Lucene TF-IDF weighting. I am puzzled by the 
> behaviour of this and I believe there is something wrong with the 
> Similarity class that I overrode, but I cannot figure it out.
>
> I attach the complete code below for this example. When I run a query 
> with it (e.g. "concept:red") I discover that each payload is always 
> the first number that was passed through MyPayloadSimilarity (in the 
> code example, this is 1.0) and not 1.0, 50.0 and 100.0. As a result, 
> all documents get the same payload and the same score. However, the 
> data should feature picture #1, with a payload of 100.0, followed by 
> picture #2, followed by picture #3 and very diverse scores. I can't 
> get my head around this...
>
> Here are the results of the run:
>
> Query: concept:red
> ===>  docid: 0 payload: 1.0
> ===>  docid: 1 payload: 1.0
> ===>  docid: 2 payload: 1.0
> Number of results:3
> -> docid: 3.jpg score: 0.2518424
> -> docid: 2.jpg score: 0.2518424
> -> docid: 1.jpg score: 0.2518424
>
> What is wrong? Did I misunderstand something about Payloads?
>
> ---Start Code---
>
> public class PayloadShowcase {
>
>  public static void main(String s[]) {
>      PayloadShowcase p = new PayloadShowcase();
>      p.run();
>  }
>
> public void run () {
>     // Step 1: indexing
>     MyPayloadIndexer indexer = new MyPayloadIndexer();
>     indexer.index();
>     // Step 2: searching
>     MyPayloadSearcher searcher = new MyPayloadSearcher();
>     searcher.search("red");
> }
>
> public class MyPayloadAnalyzer extends Analyzer {
>
>     private PayloadEncoder encoder;
>     MyPayloadAnalyzer(PayloadEncoder encoder) {
>         this.encoder = encoder;
>     }
>
>     @Override
>     protected TokenStreamComponents createComponents(String fieldName, 
> Reader reader) {
>         Tokenizer source = new WhitespaceTokenizer(reader);
>         TokenStream filter = new LowerCaseFilter(source);
>         filter = new DelimitedPayloadTokenFilter(filter, '|', encoder);
>         return new TokenStreamComponents(source, filter);
>     }
> }
>
> public class MyPayloadIndexer {
>
>     public MyPayloadIndexer() {}
>
>     public void index() {
>         try {
>             Directory dir = FSDirectory.open(new 
> File("D:/data/indices/sandbox"));
>             Analyzer analyzer = new MyPayloadAnalyzer(new 
> FloatEncoder());
>             IndexWriterConfig iwconfig = new 
> IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
>             iwconfig.setSimilarity(new MyPayloadSimilarity());
> iwconfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
>
>             // load mappings and classifiers
>             HashMap<String, String> mappings = this.loadDataMappings();
>             HashMap<String, HashMap> cMaps = this.loadData();
>
>             IndexWriter writer = new IndexWriter(dir, iwconfig);
>             indexDocuments(writer, mappings, cMaps);
>             writer.close();
>
>         } catch (IOException e) {
>             System.out.println("Exception while indexing: " + 
> e.getMessage());
>         }
>     }
>
>     private void indexDocuments(IndexWriter writer, HashMap<String, 
> String> fileMappings, HashMap<String, HashMap> concepts) throws 
> IOException {
>
>         Set fileSet = fileMappings.keySet();
>         Iterator<String> iterator = fileSet.iterator();
>         while (iterator.hasNext()){
>             // unique file information
>             String fileID = iterator.next();
>             String filePath = fileMappings.get(fileID);
>             // create a new, empty document
>             Document doc = new Document();
>             // path of the indexed file
>             Field pathField = new StringField("path", filePath, 
> Field.Store.YES);
>             doc.add(pathField);
>             // lookup all concept probabilities for this fileID
>             Iterator<String> conceptIterator = 
> concepts.keySet().iterator();
>             while (conceptIterator.hasNext()){
>                 String conceptName = conceptIterator.next();
>                 HashMap conceptMap = concepts.get(conceptName);
>                 doc.add(new TextField("concept", ("" + conceptName + 
> "|").trim() + (conceptMap.get(fileID) + "").trim(), Field.Store.YES));
>             }
>             writer.addDocument(doc);
>         }
>     }
>
>     public HashMap<String, String> loadDataMappings(){
>         HashMap<String, String> h = new HashMap<>();
>         h.put("1", "1.jpg");
>         h.put("2", "2.jpg");
>         h.put("3", "3.jpg");
>         return h;
>     }
>
>     public HashMap<String, HashMap> loadData(){
>         HashMap<String, HashMap> h = new HashMap<>();
>         HashMap<String, String> green = new HashMap<>();
>         green.put("1", "50.0");
>         green.put("2", "1.0");
>         green.put("3", "100.0");
>         HashMap<String, String> red = new HashMap<>();
>         red.put("1", "100.0");
>         red.put("2", "50.0");
>         red.put("3", "1.0");
>         HashMap<String, String> blue = new HashMap<>();
>         blue.put("1", "1.0");
>         blue.put("2", "50.0");
>         blue.put("3", "100.0");
>         h.put("green", green);
>         h.put("red", red);
>         h.put("blue", blue);
>         return h;
>     }
> }
>
> class MyPayloadSimilarity extends DefaultSimilarity {
>
>     @Override
>     public float scorePayload(int docID, int start, int end, BytesRef 
> payload) {
>         float pload = 1.0f;
>         if (payload != null) {
>             pload = PayloadHelper.decodeFloat(payload.bytes, 
> payload.offset);
>         }
>         System.out.println("===>  docid: " + docID + " payload: " + 
> pload);
>         return pload;
>     }
> }
>
> public class MyPayloadSearcher {
>
>     public MyPayloadSearcher() {}
>
>     public void search(String queryString) {
>         try {
>             IndexReader reader = 
> DirectoryReader.open(FSDirectory.open(new 
> File("D:/data/indices/sandbox")));
>             IndexSearcher searcher = new IndexSearcher(reader);
>             searcher.setSimilarity(new PayloadSimilarity());
>             PayloadTermQuery query = new PayloadTermQuery(new 
> Term("concept", queryString),
>                     new AveragePayloadFunction());
>             System.out.println("Query: " + query.toString());
>             TopDocs topDocs = searcher.search(query, 999);
>             ScoreDoc[] hits = topDocs.scoreDocs;
>             System.out.println("Number of results:" + hits.length);
>
>             // output
>             for (int i = 0; i < hits.length; i++) {
>                 Document doc = searcher.doc(hits[i].doc);
>                 System.out.println("-> docid: " + doc.get("path") + " 
> score: " + hits[i].score);
>             }
>             reader.close();
>
>         } catch (Exception e) {
>             System.out.println("Exception while searching: " + 
> e.getMessage());
>         }
>     }
> }
>
> ---End Code---
>
> Any ideas? I am very grateful for any help...
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
> For additional commands, e-mail: java-user-help@lucene.apache.org
>


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message