lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Grant Ingersoll <gsing...@apache.org>
Subject Fwd: Search Test file
Date Sun, 04 Jan 2009 01:19:14 GMT
Hi Amin,

I see a couple of issues with your program below, one of which is the
cause of your search not finding "amin" as a query term.

When you construct your IndexWriter, you are doing:
> IndexWriter indexWriter = new  
> IndexWriter(getDirectory(),getAnalyzer(),new  
> IndexWriter.MaxFieldLength(2));

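The MaxFieldLength parameter specifies the maximum number of tokens
that will be indexed for a Field.  Everything after that is dropped.
With a value of 2, only the first two tokens of the body are indexed,
so "amin" can never match unless it happens to be one of them.  Use
IndexWriter.MaxFieldLength.UNLIMITED (or IndexWriter.MaxFieldLength.LIMITED)
instead.  See http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.html#IndexWriter(org.apache.lucene.store.Directory,%20org.apache.lucene.analysis.Analyzer,%20org.apache.lucene.index.IndexWriter.MaxFieldLength)
and http://lucene.apache.org/java/2_4_0/api/core/org/apache/lucene/index/IndexWriter.MaxFieldLength.html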

Also,
  TopDocs topDocs = multiSearcher.search(query,  
BooleanQuery.getMaxClauseCount());

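strikes me as really odd.  Why are you passing in the max clause count
as the number of results you want returned?  The second argument to
search(Query, int) is the number of top hits to return; it has nothing
to do with BooleanQuery's clause limit.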

Cheers,
Grant



Begin forwarded message:

> From: "aminmc@gmail.com" <aminmc@gmail.com>
> Date: January 3, 2009 3:24:52 PM EST
> To: gsingers@apache.org
> Subject: Search Test file
>
> I've shared a document with you called "Search Test file":
> http://docs.google.com/Doc?id=d77xf5q_0n6hb38fx&invite=cjq79zj
>
> It's not an attachment -- it's stored online at Google Docs. To open  
> this document, just click the link above.
> ---
>
> Hi
>
> I have uploaded the test file at google docs. It is currently a txt  
> file but if you change the extension to .java it should work.
>
> package com.amin.app.lucene.search.impl;
>
> import static org.junit.Assert.assertEquals;
> import static org.junit.Assert.assertNotNull;
> import static org.junit.Assert.assertNotSame;
> import static org.junit.Assert.assertTrue;
>
> import java.io.File;
> import java.io.FileInputStream;
> import java.io.FileOutputStream;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.OutputStream;
>
> import javax.swing.text.BadLocationException;
> import javax.swing.text.DefaultStyledDocument;
> import javax.swing.text.rtf.RTFEditorKit;
>
> import org.apache.commons.lang.StringUtils;
> import org.apache.lucene.analysis.Analyzer;
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.ant.DocumentHandler;
> import org.apache.lucene.ant.DocumentHandlerException;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field;
> import org.apache.lucene.index.CorruptIndexException;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.queryParser.MultiFieldQueryParser;
> import org.apache.lucene.queryParser.QueryParser;
> import org.apache.lucene.search.BooleanQuery;
> import org.apache.lucene.search.IndexSearcher;
> import org.apache.lucene.search.MultiSearcher;
> import org.apache.lucene.search.Query;
> import org.apache.lucene.search.ScoreDoc;
> import org.apache.lucene.search.Searchable;
> import org.apache.lucene.search.TopDocs;
> import org.apache.lucene.store.Directory;
> import org.apache.lucene.store.FSDirectory;
> import org.junit.After;
> import org.junit.Before;
> import org.junit.Test;
>
> import com.amin.app.lucene.util.WorkItem.IndexerType;
>
> public class SearchTest {
>
> private File rtfFile = null;
> private static final String RTF_FILE_NAME = "rtfDocumentToIndex.rtf";
>
> @Before
> public void setUp() throws Exception {
> InputStream inputStream =  
> this.getClass().getClassLoader().getResourceAsStream(RTF_FILE_NAME);
> rtfFile = new File(RTF_FILE_NAME);
> convertInputStreamToFile(inputStream, rtfFile);
> }
>
>
>
> @Test
> public void testCanCreateLuceneDocumentForRTFDocument() throws  
> Exception {
> JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
> Document document = builtInRTFHandler.getDocument(rtfFile);
> assertNotNull(document);
> String value = document.get(FieldNameEnum.BODY.getDescription());
> assertNotNull(value);
> assertNotSame("", value);
> assertTrue(value.contains("Amin Mohammed-Coleman"));
> assertTrue(value.contains("This is a test rtf document that will be  
> indexed."));
> String path = document.get(FieldNameEnum.PATH.getDescription());
> assertNotNull(path);
> assertTrue(path.contains(".rtf"));
> String fileName = document.get(FieldNameEnum.NAME.getDescription());
> assertNotNull(fileName);
> assertEquals(RTF_FILE_NAME, fileName);
> assertEquals(WorkItem.IndexerType.RTF_INDEXER.name(),  
> document.get(FieldNameEnum.TYPE.getDescription()));
>
> }
>
>
>
> @Test
> public void testCanSearchRtfDocument() throws Exception {
> JavaBuiltInRTFHandler builtInRTFHandler = new JavaBuiltInRTFHandler();
> Document document = builtInRTFHandler.getDocument(rtfFile);
> IndexWriter indexWriter = new  
> IndexWriter(getDirectory(),getAnalyzer(),new  
> IndexWriter.MaxFieldLength(2));
> try {
> indexWriter.addDocument(document);
> commitAndCloseWriter(indexWriter);
> } catch (CorruptIndexException e) {
> throw new IllegalStateException(e);
> } catch (IOException e) {
> throw new IllegalStateException(e);
> }
>
> //I plan to use other searchers later
> IndexSearcher indexSearcher = new IndexSearcher(getDirectory());
> MultiSearcher multiSearcher = new MultiSearcher(new Searchable[]  
> {indexSearcher});
> QueryParser queryParser = new MultiFieldQueryParser(new String[]  
> {FieldNameEnum.BODY.getDescription()}, new StandardAnalyzer());
> Query query = queryParser.parse("amin");
> TopDocs topDocs = multiSearcher.search(query,  
> BooleanQuery.getMaxClauseCount());
> assertNotNull(topDocs);
> assertEquals(1, topDocs.totalHits);
> ScoreDoc[] scoreDocs = topDocs.scoreDocs;
> for (ScoreDoc scoreDoc : scoreDocs) {
> Document documentFromSearch = indexSearcher.doc(scoreDoc.doc);
> assertNotNull(documentFromSearch);
> String bodyText =  
> documentFromSearch.get(FieldNameEnum.BODY.getDescription());
> assertNotNull(bodyText);
> assertNotSame("", bodyText);
> assertTrue(bodyText.contains("Amin Mohammed-Coleman"));
> assertTrue(bodyText.contains("This is a test rtf document that will  
> be indexed."));
>
> }
> multiSearcher.close();
>
> }
>
> @After
> public void tearDown() throws Exception {
> rtfFile.delete();
> if (getDirectory().list() != null && getDirectory().list().length >  
> 0) {
> IndexReader reader = IndexReader.open(getDirectory());
> for(int i = 0; i < reader.maxDoc();i++) {
> reader.deleteDocument(i);
> }
> reader.close();
> }
> }
>
> private void commitAndCloseWriter(IndexWriter indexWriter) throws  
> CorruptIndexException,IOException {
> indexWriter.commit();
> indexWriter.close();
> }
>
>
> public Directory getDirectory() throws IOException {
> return FSDirectory.getDirectory("/tmp/lucene/rtf");
> }
>
> public Analyzer getAnalyzer() {
> return new StandardAnalyzer();
> }
> private static void convertInputStreamToFile(InputStream  
> inputStream, File file) {
> try
>     {
>     OutputStream out=new FileOutputStream(file);
>     byte buf[]=new byte[1024];
>     int len;
>     while((len=inputStream.read(buf))>0)
>     out.write(buf,0,len);
>     out.close();
>     inputStream.close();
>
>     }catch (IOException e){
>     throw new IllegalStateException(e);
>     }
> }
> private static class JavaBuiltInRTFHandler implements DocumentHandler{
>
> public Document getDocument(File file) throws  
> DocumentHandlerException {
> String bodyText = null;
> DefaultStyledDocument styledDoc = new DefaultStyledDocument();
> try {
> InputStream inputStream = new FileInputStream(file);
> new RTFEditorKit().read(inputStream, styledDoc, 0);
> bodyText = styledDoc.getText(0, styledDoc.getLength());
> } catch (IOException ioex) {
> throw new IllegalStateException(ioex);
> } catch (BadLocationException e) {
> throw new IllegalArgumentException(e);
> }
> //create Document object using body
> if (bodyText != null) {
> Document document = new Document();
> String trimmedBodyText = StringUtils.trimToEmpty(bodyText);
> trimmedBodyText = trimmedBodyText.replaceAll("\n", "");
> Field field = new  
> Field(FieldNameEnum.BODY.getDescription(),trimmedBodyText,  
> Field.Store.YES, Field.Index.ANALYZED);
> document.add(field);
>
> String pathToFile = file.getPath();
> Field pathToFileField = new  
> Field(FieldNameEnum.PATH.getDescription(),pathToFile,  
> Field.Store.YES, Field.Index.NOT_ANALYZED);
> document.add(pathToFileField);
>
> String fileName = file.getName();
> Field fileNameField = new  
> Field(FieldNameEnum.NAME.getDescription(),fileName, Field.Store.YES,  
> Field.Index.NOT_ANALYZED);
> document.add(fileNameField);
>
> Field typeField = new  
> Field 
> (FieldNameEnum.TYPE.getDescription(),IndexerType.RTF_INDEXER.name(),  
> Field.Store.YES, Field.Index.NOT_ANALYZED);
> document.add(typeField);
>
> String summary = bodyText.substring(0, 10);
>
> Field summaryField = new  
> Field(FieldNameEnum.SUMMARY.getDescription(),summary,  
> Field.Store.YES, Field.Index.NOT_ANALYZED);
> document.add(summaryField);
>
> return document;
> }
> return null;
> }
> }
>
> private static class WorkItem {
>
> public enum WorkItemEvent {
> ADD,
> UPDATE,
> DELETE;
> }
>
> public enum IndexerType {
> RTF_INDEXER,
> PDF_INDEXER,
> XML_INDEXER,
> PLAIN_TEXT_INDEXER,
> MS_WORD_INDEXER,
> MS_EXCEL_INDEXER,
> MS_POWERPOINT_INDEXER;
> }
>
>
> private final Document workLoad;
>
> private final WorkItemEvent workItemEvent;
>
> private final IndexerType indexerType;
>
>
> public WorkItem(final Document workLoad, final WorkItemEvent  
> workItemEvent) {
> this.workLoad = workLoad;
> this.workItemEvent = workItemEvent;
> String type = this.workLoad.get("type");
> this.indexerType = IndexerType.valueOf(type);
> }
>
> public IndexerType getIndexerType() {
> return indexerType;
> }
>
> public Document getWorkLoad() {
> return workLoad;
> }
>
> public WorkItemEvent getWorkItemEvent() {
> return workItemEvent;
> }
> }
>
> private enum FieldNameEnum {
>
> AUTHOR("author"),
> BODY("body"),
> TITLE("title"),
> SUBJECT("subject"),
> KEYWORDS("keywords"),
> PATH("path"), NAME ("name"),
> TYPE("type"),
> ID ("id"),
> SUMMARY ("summary");
>
> private final String description;
>
> private FieldNameEnum(final String description) {
> this.description = description;
> }
>
> public String getDescription() {
> return this.description;
> }
> }
> }

--------------------------
Grant Ingersoll

Lucene Helpful Hints:
http://wiki.apache.org/lucene-java/BasicsOfPerformance
http://wiki.apache.org/lucene-java/LuceneFAQ











Mime
View raw message