lucene-solr-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Wendy <w...@rcsb.rutgers.edu>
Subject Re: help with DIH transformer to add a suffix to column names
Date Tue, 23 Aug 2016 16:52:23 GMT
<http://lucene.472066.n3.nabble.com/file/n4292972/Screenshot-19.png> 

Hi Alex,

It worked out nicely. I had to specify the table column names. Using a custom
transformer allowed me to change each column name to ????_stem. This
simplifies field ranking in the solrconfig.xml file and field
specification in the managed-schema file. I have listed the steps below.


Steps:
-------------------------------------------------------
1. sample of db-data-config.xml file

<!--
  Sample DIH configuration (db-data-config.xml).

  Every entity runs my.solr.transformer.FieldTransformer on its rows.
  "?????" values are placeholders to be replaced with real settings.

  Delta-import attributes understood by DIH are deltaQuery (find changed
  primary keys), deltaImportQuery (fetch one changed row, keyed by
  ${dih.delta.<pk>}) and parentDeltaQuery. The misspelled "deltaImputQuery"
  has been corrected to deltaImportQuery, and the "parentDeltaImputQuery"
  attribute has been dropped because DIH defines no attribute of that name.
-->
<dataConfig>

  <dataSource name="data_source_?????"
              type="JdbcDataSource"
              driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://machineName:3306/databaseName?zeroDateTimeBehavior=convertToNull"
              user="?????"
              password="?????"/>

  <document name="db-fulltext-index">

    <!-- Root entity: one Solr document per released pdb_entry row. -->
    <!-- NOTE(review): parentDeltaQuery on this root entity refers to
         ${database_PDB_rev.Structure_ID}, but no entity named
         database_PDB_rev is declared here; confirm whether it is needed. -->
    <entity name="pdb_entry"
            pk="pdb_id"
            transformer="my.solr.transformer.FieldTransformer"
            query="select pdb_id, author_list, method, header,
                   deposit_site, process_site, initial_deposition_date, date_of_RCSB_release
                   from pdb_entry where status_code = 'REL'"
            deltaQuery="select pdb_id, author_list, method, header,
                        deposit_site, process_site, initial_deposition_date, date_of_RCSB_release
                        from pdb_entry a, database_PDB_rev b where b.Structure_ID = a.pdb_id and
                        a.status_code = 'REL' and b.date &gt; '${dih.last_index_time}'"
            deltaImportQuery="select pdb_id, author_list, method, header,
                              deposit_site, process_site, initial_deposition_date, date_of_RCSB_release
                              from pdb_entry where status_code = 'REL' and
                              pdb_id = '${dih.delta.pdb_id}'"
            parentDeltaQuery="select pdb_id from pdb_entry where
                              pdb_id=${database_PDB_rev.Structure_ID}">

      <!-- Primary citation title for the current pdb_entry row. -->
      <entity name="citation"
              onError="continue"
              transformer="my.solr.transformer.FieldTransformer"
              query="select title from citation where
                     Structure_ID='${pdb_entry.pdb_id}' and id = 'primary' and title is not null"
              parentDeltaQuery="select pdb_id from pdb_entry where
                                pdb_id=${citation.Structure_ID}"/>

      <!-- Author names of the primary citation for the current pdb_entry row. -->
      <entity name="citation_author"
              onError="continue"
              transformer="my.solr.transformer.FieldTransformer"
              query="select name from citation_author where
                     Structure_ID='${pdb_entry.pdb_id}' and citation_id = 'primary'"
              parentDeltaQuery="select pdb_id from pdb_entry where
                                pdb_id=${citation_author.Structure_ID}"/>

    </entity>
  </document>
</dataConfig>

2. Modification of solrconfig.xml file: 
Add the following lines:


 <!-- Jars loaded from the Solr dist/ directory: the Data Import Handler,
      the MySQL JDBC driver, and the jar holding the custom transformer. -->
 <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-dataimporthandler-\d.*\.jar"/>
 <lib dir="${solr.install.dir:../../../..}/dist/" regex="mysql-connector-java-5.0.7-bin.jar"/>
 <lib dir="${solr.install.dir:../../../..}/dist/" regex="solr-rcsb-plugin.jar"/>


<!-- Data Import Handler endpoint; reads its entity definitions from db-data-config.xml. -->
<requestHandler name="/dataimport"
                class="org.apache.solr.handler.dataimport.DataImportHandler">
  <lst name="defaults">
    <str name="config">db-data-config.xml</str>
  </lst>
</requestHandler>


 <!-- edismax search endpoint with per-field boosts.
      The catch-all boost targets rest_fields_stem, the name used by the
      field and copyField declarations in managed-schema; the previous
      spelling "rest_field_stem" did not match any declared field. -->
 <requestHandler name="/search" class="solr.SearchHandler">
   <lst name="defaults">
     <str name="indent">true</str>
     <str name="echoParams">explicit</str>
     <str name="defType">edismax</str>
     <str name="qf">pdb_id^20.0</str>
     <str name="qf">author_list_stem^20.0</str>
     <str name="qf">header^10.0</str>
     <str name="qf">reflns.resolution^5.0</str>
     <str name="qf">keywords_stem^10.0</str>
     <str name="qf">rest_fields_stem^0.3</str>
     <str name="mm">7</str>
     <int name="rows">1000</int>
     <str name="df">text</str>
   </lst>
 </requestHandler>

3. Modification of managed-schema file:


    <!-- Unique document key. -->
    <field name="pdb_id" type="string" indexed="true" stored="true" required="true" multiValued="false"/>

    <!-- Catch-all field: every *_stem field is copied into rest_fields_stem. -->
    <field name="rest_fields_stem" type="pdb_text_stem" indexed="true" stored="true" multiValued="true"/>
    <copyField source="*_stem" dest="rest_fields_stem"/>

    <!-- Any field whose name ends in _stem is analyzed with pdb_text_stem. -->
    <dynamicField name="*_stem" type="pdb_text_stem" indexed="true" stored="true"/>

 
<!-- Stemmed text type used by the *_stem dynamic fields and rest_fields_stem.
     NOTE(review): the index and query analyzers apply the same filters in a
     different order (the synonym filter runs first at index time but after the
     word-delimiter filter at query time); confirm this is intentional. -->
<fieldtype name="pdb_text_stem" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true"/>
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

</fieldtype>


4. Java class of FieldTransformer:
package my.solr.transformer;

import java.util.List;
import java.util.Map;

import org.apache.solr.handler.dataimport.Context;
import org.apache.solr.handler.dataimport.DataImporter;
import org.apache.solr.handler.dataimport.Transformer;

public class FieldTransformer extends Transformer  {
    public Map<String, Object> transformRow(Map<String, Object> row, Context
context) {
        List<Map&lt;String, String>> fields = ((Context)
context).getAllEntityFields();
        
        for (Map<String, String> field : fields) {
            String columnName = field.get(DataImporter.COLUMN);
            // Get this field's value from the current row
            Object value = row.get(columnName);
            if (value != null && !value.toString().trim().equals("")) {
               row.put(columnName + "_stem", value.toString().trim());
            }
        }
        return row;
    }
    
    
}

5. NOTES:

1. When writing a custom transformer, you need to copy the following files:

# Copy the Data Import Handler jars into the Solr webapp's lib directory.
cp /opt/solr-6.1.0/dist/solr-dataimporthandler-extras-6.1.0.jar \
   /opt/solr-6.1.0/server/solr-webapp/webapp/WEB-INF/lib/

cp /opt/solr-6.1.0/dist/solr-dataimporthandler-6.1.0.jar \
   /opt/solr-6.1.0/server/solr-webapp/webapp/WEB-INF/lib/

2. Put the custom transformer jar file in the following location and
reference it in the solrconfig.xml file (see step 2 above):

/opt/solr-6.1.0/dist/solr-rcsb-plugin.jar 





--
View this message in context: http://lucene.472066.n3.nabble.com/help-with-DIH-transformer-to-add-a-suffix-to-column-names-tp4292448p4292972.html
Sent from the Solr - User mailing list archive at Nabble.com.

Mime
View raw message