lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From PeterKerk <vettepa...@hotmail.com>
Subject RE: Indexing fieldvalues with dashes and spaces
Date Thu, 05 Aug 2010 08:31:33 GMT

@Michael, @Erick,

You both mention interesting things that triggered me.

@Erick:
Your referenced page is very useful. It seems the whitespace tokenizer under
the text_ws is causing issues.

You do mention another interesting thing:
"And do be aware that fields you get back from a request (i.e. a search) are
the stored fields, NOT what's indexed."

On the page you provided I see this under the Analyzers section: "Analyzers
are components that pre-process input text at index time and/or at search
time."

So I don't completely understand how that sentence is in line with your
comment.


@Michael:
You say: "use the tokenized field to return results, but have a duplicate
field of fieldtype="string" to show the untokenized results. E.g. facet on
that field."
I think your comment applies to my requirement: "a city field is something
that I want users to search on via text input, so let's say "New Yo" would
give the results for "New York".
But also a facet "Cities" is available in which "New York" is just one of
the cities that is clickable.
The other facet is "theme", which in my example holds values like
"Gemeentehuis" and "Strand & Zee"; that is not something users can search on
via manual input, but it IS clickable."

Could you please indicate (just for the above fields) what needs to be
changed in my schema.xml and, if so, how that affects the way my request is
built up?


Thanks so much in advance for getting me started!


This is my schema.xml


<?xml version="1.0" encoding="UTF-8" ?>

<schema name="db" version="1.1">

  <!-- Field type declarations referenced by the type="..." attribute of the
       field and dynamicField entries in this schema. -->
  <types>
    <!-- Non-analyzed primitive types (no analyzer chain is configured). -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="integer" class="solr.IntField" omitNorms="true"/>
    <fieldType name="long" class="solr.LongField" omitNorms="true"/>
    <fieldType name="float" class="solr.FloatField" omitNorms="true"/>
    <fieldType name="double" class="solr.DoubleField" omitNorms="true"/>

    <!-- Sortable numeric variants. -->
    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>

    <fieldType name="date" class="solr.DateField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="random" class="solr.RandomSortField" indexed="true"/>

    <!-- text_ws: the only analysis step is the whitespace tokenizer; no
         filters are applied afterwards. -->
    <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
      </analyzer>
    </fieldType>

    <!-- text: separate index-time and query-time analyzer chains. They differ
         in two places: only the query chain applies synonyms, and the word
         delimiter filter catenates words/numbers at index time only. -->
    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory"
                synonyms="synonyms.txt"
                ignoreCase="true"
                expand="true"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="1"
                generateNumberParts="1"
                catenateWords="0"
                catenateNumbers="0"
                catenateAll="0"
                splitOnCaseChange="1"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- textTight: a single analyzer chain shared by indexing and querying.
         Unlike "text", synonyms are not expanded (expand="false") and the
         word delimiter filter only catenates; it generates no word or number
         parts. -->
    <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory"
                synonyms="synonyms.txt"
                ignoreCase="true"
                expand="false"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
        <filter class="solr.WordDelimiterFilterFactory"
                generateWordParts="0"
                generateNumberParts="0"
                catenateWords="1"
                catenateNumbers="1"
                catenateAll="0"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPorterFilterFactory" protected="protwords.txt"/>
        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- alphaOnlySort: keyword tokenizer, then lowercase, trim, and finally
         delete every character matching [^a-z] (replace="all"). -->
    <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer>
        <tokenizer class="solr.KeywordTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.TrimFilterFactory"/>
        <filter class="solr.PatternReplaceFilterFactory"
                pattern="([^a-z])"
                replacement=""
                replace="all"/>
      </analyzer>
    </fieldType>

    <!-- ignored: neither stored nor indexed. The element name is spelled
         "fieldType" like every other declaration in this section. -->
    <fieldType name="ignored" stored="false" indexed="false" class="solr.StrField"/>
  </types>

 <!-- Concrete and dynamic field declarations. -->
 <fields>
   <!-- Document key; referenced by the uniqueKey element of this schema. -->
   <field name="id" type="string" indexed="true" stored="true" required="true"/>

   <!-- Descriptive fields. All of them use the whitespace-tokenized text_ws
        type and are both indexed and stored. -->
   <field name="title" type="text_ws" indexed="true" stored="true"/>
   <field name="city" type="text_ws" indexed="true" stored="true"/>
   <field name="official" type="integer" indexed="true" stored="true"/>
   <field name="theme"
          type="text_ws"
          indexed="true"
          stored="true"
          multiValued="true"
          omitNorms="true"
          termVectors="true"/>
   <field name="features" type="text_ws" indexed="true" stored="true" multiValued="true"/>
   <field name="services" type="text_ws" indexed="true" stored="true" multiValued="true"/>
   <field name="province" type="text_ws" indexed="true" stored="true"/>
   <field name="word" type="string" indexed="true" stored="true"/>

   <!-- Indexed-only field (stored="false"); it is the destination of the
        copyField rules and the default search field of this schema. -->
   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>

   <!-- Defaults to NOW when a document supplies no value. -->
   <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>

   <!-- Dynamic fields, matched by name pattern. -->
   <dynamicField name="*_i" type="sint" indexed="true" stored="true"/>
   <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
   <dynamicField name="*_l" type="slong" indexed="true" stored="true"/>
   <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
   <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
   <dynamicField name="*_f" type="sfloat" indexed="true" stored="true"/>
   <dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>
   <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
   <dynamicField name="random*" type="random"/>
 </fields>

 <!-- Field that uniquely identifies a document. -->
 <uniqueKey>id</uniqueKey>

 <!-- Field searched when a query does not name one explicitly. -->
 <defaultSearchField>text</defaultSearchField>

 <!-- Query terms are combined with OR unless the query says otherwise. -->
 <solrQueryParser defaultOperator="OR"/>

 <!-- Copy each source field into the catch-all "text" field. -->
 <copyField source="theme" dest="text"/>
 <copyField source="title" dest="text"/>
 <copyField source="city" dest="text"/>
 <copyField source="official" dest="text"/>
 <copyField source="features" dest="text"/>
 <copyField source="services" dest="text"/>
</schema>

-- 
View this message in context: http://lucene.472066.n3.nabble.com/Indexing-fieldvalues-with-dashes-and-spaces-tp1023699p1025463.html
Sent from the Solr - User mailing list archive at Nabble.com.

Mime
View raw message