lucene-java-user mailing list archives

From Liaqat Ali <liaqatalim...@gmail.com>
Subject Re: StopWords problem
Date Thu, 27 Dec 2007 09:49:52 GMT
Doron Cohen wrote:
> This is not a self contained program - it is incomplete, and it depends
> on files on *your* disk...
>
> Still, can you show why you're saying it indexes stop words?
> Can you print here a few samples of IndexReader.terms().term()?
>
> BR, Doron
>
> On Dec 27, 2007 10:22 AM, Liaqat Ali <liaqatalimian@gmail.com> wrote:
>
>   
>> The whole program is given below. But it does not eliminate stop words
>> from the index.
>>
>>                Document document  = new Document();
>>            document.add(new Field("contents",sb.toString(),
>> Field.Store.NO, Field.Index.TOKENIZED));
>>
>>
Hi Doron,
I see your point. The program given does not produce any errors during 
compilation and it runs fine, but it does not create any index. When 
StandardAnalyzer() is called without a stop-word list it works well, but 
when the list of stop words is passed as an argument, it does not.
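For reference, to print a few terms from the index as Doron suggested, 
something along these lines could be used (a minimal sketch against the 
Lucene 2.x API, reading the same indexDir as the program below):

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;

// Minimal sketch (Lucene 2.x): dump the first terms of the index so we can
// see whether the Urdu stop words actually made it into the index.
public class DumpTerms {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open("D:\\UIR\\in");  // same indexDir as below
        TermEnum terms = reader.terms();
        int shown = 0;
        while (terms.next() && shown < 50) {
            Term t = terms.term();
            System.out.println(t.field() + " : " + t.text());
            shown++;
        }
        terms.close();
        reader.close();
    }
}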



import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.*;


public class urduIndexer1 {

    Reader file;
    BufferedReader buff;
    String line;
    IndexWriter writer;
    String indexDir;
    Directory dir;

    // Urdu stop words passed to StandardAnalyzer
    public static final String[] URDU_STOP_WORDS = { "کی", "کا", "کو", "ہے",
        "کے", "نے", "پر", "اور", "سے", "میں", "بھی",
        "ان", "ایک", "تھا", "تھی", "کیا", "ہیں", "کر", "وہ", "جس", "نہں",
        "تک" };

    public void index() throws IOException, UnsupportedEncodingException {

        indexDir = "D:\\UIR\\in";
        Analyzer analyzer = new StandardAnalyzer(URDU_STOP_WORDS);
        boolean createFlag = true;

        dir = FSDirectory.getDirectory(indexDir);
        writer = new IndexWriter(dir, analyzer, createFlag);

        for (int i = 1; i <= 201; i++) {

            // read each UTF-8 corpus file into a single string
            file = new InputStreamReader(
                new FileInputStream("corpus\\doc" + i + ".txt"), "UTF-8");
            buff = new BufferedReader(file);

            StringBuffer sb = new StringBuffer();
            while ((line = buff.readLine()) != null) {
                sb.append(line);
            }

            Document document = new Document();
            document.add(new Field("contents", sb.toString(),
                Field.Store.NO, Field.Index.TOKENIZED));

            // document.add(new Field("Document", "Doc" + i,
            //     Field.Store.YES, Field.Index.TOKENIZED));

            writer.addDocument(document);

            buff.close();
        }

        writer.optimize();
        writer.close();
    }

    public static void main(String arg[]) throws Exception {
        // the class is urduIndexer1, so instantiate urduIndexer1 here
        urduIndexer1 indx = new urduIndexer1();
        indx.index();
    }
}
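One way to narrow this down might be to run the analyzer on a short Urdu 
sample outside the indexing loop and print the tokens it produces; if the 
stop words still come through, the problem is in the analyzer (or in the 
encoding of the stop-word strings) rather than in the indexing code. A 
rough sketch against the Lucene 2.x TokenStream API (the sample string is 
a placeholder to be filled with a sentence from the corpus):

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// Rough sketch (Lucene 2.x): tokenize a small Urdu string directly and print
// each token, to check whether the URDU_STOP_WORDS entries are being dropped.
public class AnalyzerCheck {
    public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer(urduIndexer1.URDU_STOP_WORDS);
        String sample = "...";  // paste a sentence from one of the corpus files here
        TokenStream ts = analyzer.tokenStream("contents", new StringReader(sample));
        for (Token t = ts.next(); t != null; t = ts.next()) {
            System.out.println(t.termText());
        }
        ts.close();
    }
}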


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org

