ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1881994 [2/3] - in /ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased: ./ ae/ annotation/ dictionary/ encoder/ lookup/ table/ table/column/ util/ util/bsv/ util/jdbc/ util/textspan/ util/tokenize/ ...
Date Fri, 25 Sep 2020 00:59:37 GMT
Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,86 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+
+import org.apache.ctakes.dictionary.cased.util.bsv.BsvFileParser;
+import org.apache.ctakes.dictionary.cased.util.bsv.StringArrayCreator;
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+final public class BsvEncoder implements TermEncoder {
+
+   static public final String ENCODER_TYPE = "BSV";
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvEncoder" );
+
+
+   private final InMemoryEncoder _delegate;
+
+   public BsvEncoder( final String name, final UimaContext uimaContext ) {
+      this( name, EnvironmentVariable.getEnv( name + "_file", uimaContext ) );
+   }
+
+   public BsvEncoder( final String name, final String bsvPath ) {
+      final Map<Long, Collection<TermEncoding>> encodingMap = parseBsvFile( name, bsvPath );
+      _delegate = new InMemoryEncoder( name, encodingMap );
+   }
+
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _delegate.getName();
+   }
+
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<TermEncoding> getEncodings( final long cuiCode ) {
+      return _delegate.getEncodings( cuiCode );
+   }
+
+
+   /**
+    * Create a map of {@link TermEncoding} Objects
+    * by parsing a bsv file.  The file should have a columnar format:
+    * <p>
+    * CUI|Code
+    * </p>
+    *
+    * @param bsvFilePath path to file containing term rows and bsv columns
+    * @return map of all cuis and codes read from the bsv file
+    */
+   static private Map<Long, Collection<TermEncoding>> parseBsvFile( final String name, final String bsvFilePath ) {
+      final Collection<String[]> columnCollection = new HashSet<>();
+      try {
+         columnCollection.addAll( BsvFileParser.parseBsvFile( bsvFilePath, new StringArrayCreator( 2 ) ) );
+      } catch ( IOException ioE ) {
+         LOGGER.error( ioE.getMessage() );
+      }
+      if ( columnCollection.isEmpty() ) {
+         return Collections.emptyMap();
+      }
+      final Map<Long, Collection<TermEncoding>> encodingMap = new HashMap<>();
+      for ( String[] columns : columnCollection ) {
+         final long cuiCode = CuiCodeUtil.getInstance().getCuiCode( columns[ 0 ] );
+         final TermEncoding termEncoding = new TermEncoding( name, columns[ 1 ].trim() );
+         encodingMap.computeIfAbsent( cuiCode, l -> new HashSet<>() ).add( termEncoding );
+      }
+      return encodingMap;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,85 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.util.*;
+
+/**
+ * {@link TermEncoder} whose encodings are given directly as text: a list of {@code CUI:Code} entries
+ * separated by {@code |}.  The list is parsed once at construction and served from an {@link InMemoryEncoder}.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+final public class BsvListEncoder implements TermEncoder {
+
+   static public final String ENCODER_TYPE = "BSV_LIST";
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvListEncoder" );
+
+
+   private final InMemoryEncoder _delegate;
+
+   /**
+    * @param name        name of the encoder; the list is resolved from the variable {@code name + "_list"}
+    * @param uimaContext context passed to {@code EnvironmentVariable.getEnv} when resolving that variable
+    */
+   public BsvListEncoder( final String name, final UimaContext uimaContext ) {
+      this( name, EnvironmentVariable.getEnv( name + "_list", uimaContext ) );
+   }
+
+   /**
+    * @param name    name of the encoder, also used as the schema of each {@link TermEncoding} created
+    * @param bsvList bar-separated list of {@code CUI:Code} entries
+    */
+   public BsvListEncoder( final String name, final String bsvList ) {
+      final Map<Long, Collection<TermEncoding>> encodingMap = parseList( name, bsvList );
+      LOGGER.info( "Parsed " + encodingMap.size() + " encodings for " + name );
+      _delegate = new InMemoryEncoder( name, encodingMap );
+   }
+
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _delegate.getName();
+   }
+
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<TermEncoding> getEncodings( final long cuiCode ) {
+      return _delegate.getEncodings( cuiCode );
+   }
+
+
+   /**
+    * Create a map of {@link TermEncoding} Objects
+    * by parsing a list held in a single String.  The list should have the format:
+    * <p>
+    * CUI:Code|CUI:Code|...
+    * </p>
+    * Entries that do not split into exactly one CUI and one Code are logged and skipped.
+    *
+    * @param name    schema name given to every {@link TermEncoding} created
+    * @param bsvList bar-separated list of CUI:Code entries; null is treated like an empty list
+    * @return map of cui codes to the codes listed for them; empty if the list is null or empty
+    */
+   static private Map<Long, Collection<TermEncoding>> parseList( final String name, final String bsvList ) {
+      // A missing list may arrive as null; report it the same way as an empty one instead of throwing.
+      if ( bsvList == null || bsvList.isEmpty() ) {
+         LOGGER.error( "List of term encodings is empty for " + name );
+         return Collections.emptyMap();
+      }
+      final Map<Long, Collection<TermEncoding>> encodingMap = new HashMap<>();
+      for ( String encoding : StringUtil.fastSplit( bsvList, '|' ) ) {
+         final String[] keyValue = StringUtil.fastSplit( encoding, ':' );
+         if ( keyValue.length != 2 ) {
+            LOGGER.warn( "Improper Key : Value pair for Term Encoding " + encoding );
+            continue;
+         }
+         // Trim the key exactly as the value is trimmed, so "CUI : Code" entries with padding are accepted.
+         final long cuiCode = CuiCodeUtil.getInstance().getCuiCode( keyValue[ 0 ].trim() );
+         final TermEncoding termEncoding = new TermEncoding( name, keyValue[ 1 ].trim() );
+         encodingMap.computeIfAbsent( cuiCode, l -> new HashSet<>() ).add( termEncoding );
+      }
+      return encodingMap;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,59 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+
+/**
+ * Known kinds of code that a {@link TermEncoding} can carry.  Each constant is identified by one or more
+ * upper-case schema names; names are matched without regard to case.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/25/2020
+ */
+public enum CodeSchema {
+   // NOTE(review): TUI declares format "int" but class String — confirm which one consumers rely on.
+   TUI( "int", String.class, "TUI" ),
+   PREFERRED_TEXT( "text", String.class, "PREFTEXT", "PREF_TEXT", "PREFERRED_TEXT" ),
+   UNKNOWN( "text", String.class, "UNKNOWN" );
+
+
+   private final String _codeFormat;
+   private final Class<?> _codeClass;
+   private final Collection<String> _names;
+
+
+   /**
+    * @param codeFormat format label for codes of this schema
+    * @param codeClass  class declared for codes of this schema
+    * @param names      upper-case schema names that identify this schema
+    */
+   CodeSchema( final String codeFormat, final Class<?> codeClass, final String... names ) {
+      _codeFormat = codeFormat;
+      _codeClass = codeClass;
+      // Enum constants are shared by the whole JVM, so the name set is made read-only.
+      _names = java.util.Collections.unmodifiableSet( new HashSet<>( Arrays.asList( names ) ) );
+   }
+
+
+   /**
+    * @return format label for codes of this schema
+    */
+   public String getCodeFormat() {
+      return _codeFormat;
+   }
+
+   /**
+    * @return class declared for codes of this schema
+    */
+   public Class<?> getCodeClass() {
+      return _codeClass;
+   }
+
+   /**
+    * @return unmodifiable collection of the upper-case names that identify this schema
+    */
+   public Collection<String> getNames() {
+      return _names;
+   }
+
+   /**
+    * @param encoding some term encoding, may be null
+    * @return true if the schema name of the encoding is one of the names of this schema
+    */
+   public boolean isSchema( final TermEncoding encoding ) {
+      return encoding != null && isSchema( encoding.getSchema() );
+   }
+
+   /**
+    * @param name schema name in any letter case, may be null
+    * @return true if the name, upper-cased, is one of the names of this schema
+    */
+   public boolean isSchema( final String name ) {
+      // Locale.ROOT keeps the match independent of the default locale ("tui" would become "TUİ" in Turkish).
+      return name != null && _names.contains( name.toUpperCase( java.util.Locale.ROOT ) );
+   }
+
+
+   /**
+    * @param name schema name in any letter case, may be null
+    * @return the first schema identified by the given name, or {@link #UNKNOWN} if there is none
+    */
+   static public CodeSchema getSchema( final String name ) {
+      return Arrays.stream( CodeSchema.values() )
+                   .filter( c -> c.isSchema( name ) )
+                   .findFirst()
+                   .orElse( UNKNOWN );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,42 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+/**
+ * Singleton registry of {@link TermEncoder}s.  At most one encoder is kept per encoder name.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/17/2020
+ */
+public enum EncoderStore {
+   INSTANCE;
+
+   /**
+    * @return the single store instance
+    */
+   static public EncoderStore getInstance() {
+      return INSTANCE;
+   }
+
+
+   // Guarded by its own monitor for both reads and writes.
+   private final Collection<TermEncoder> _encoders = new ArrayList<>();
+
+   /**
+    * Register an encoder unless one with the same name is already registered.
+    *
+    * @param encoder encoder to register
+    * @return true if the encoder was added, false if an encoder with its name already exists
+    */
+   public boolean addEncoder( final TermEncoder encoder ) {
+      final String name = encoder.getName();
+      synchronized ( _encoders ) {
+         final boolean present = _encoders.stream()
+                                          .map( TermEncoder::getName )
+                                          .anyMatch( name::equals );
+         if ( present ) {
+            // Encoder with given name already exists.
+            return false;
+         }
+         _encoders.add( encoder );
+         return true;
+      }
+   }
+
+
+   /**
+    * @return a snapshot of the registered encoders; changes to the returned collection do not affect the store
+    */
+   public Collection<TermEncoder> getEncoders() {
+      // Copy under the same lock used by addEncoder, so a caller never iterates the list while it is modified.
+      synchronized ( _encoders ) {
+         return new ArrayList<>( _encoders );
+      }
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,46 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * {@link TermEncoder} backed by a map that is handed over fully populated at construction.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+final public class InMemoryEncoder implements TermEncoder {
+
+   // Name reported by getName().
+   private final String _name;
+
+   // Term encodings keyed by cui code.
+   private final Map<Long, Collection<TermEncoding>> _encodingMap;
+
+   /**
+    * @param name        unique name for the encoder
+    * @param encodingMap Map with a cui code as key, and TermEncoding Collection as value
+    */
+   public InMemoryEncoder( final String name, final Map<Long, Collection<TermEncoding>> encodingMap ) {
+      _encodingMap = encodingMap;
+      _name = name;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return the collection mapped to the cui code, or an empty list when the cui code has no mapping
+    */
+   @Override
+   public Collection<TermEncoding> getEncodings( final long cuiCode ) {
+      final Collection<TermEncoding> known = _encodingMap.get( cuiCode );
+      if ( known != null || _encodingMap.containsKey( cuiCode ) ) {
+         return known;
+      }
+      return Collections.emptyList();
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,185 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+
+import org.apache.ctakes.dictionary.cased.table.column.CodeType;
+import org.apache.ctakes.dictionary.cased.table.column.SchemaCode;
+import org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import static org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil.*;
+
+
+/**
+ * {@link TermEncoder} that looks up the codes for a cui in a database table through a single
+ * {@link PreparedStatement} created at construction.  The configured {@link CodeType} decides how the
+ * code column is read and which schema name the resulting {@link TermEncoding}s carry.
+ * <p>
+ * NOTE(review): every lookup reuses the same prepared statement; confirm that callers do not invoke
+ * {@link #getEncodings(long)} from several threads at once.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+final public class JdbcEncoder implements TermEncoder {
+
+   static public final String ENCODER_TYPE = "JDBC";
+
+   static private final Logger LOGGER = Logger.getLogger( "JdbcEncoder" );
+
+   /**
+    * Reads the code value from one column of the current row of a result set.
+    */
+   private interface ColumnReader {
+      Object read( ResultSet resultSet, int column ) throws SQLException;
+   }
+
+   private final String _name;
+   private final PreparedStatement _selectCodeStatement;
+   private final CodeType _codeType;
+
+
+   /**
+    * Create an encoder from the parameters "driver", "url", "table", "user", "pass" and "class"
+    * resolved for the given name.  The table defaults to the upper-cased name and the class to TEXT.
+    *
+    * @param name        unique name for the encoder
+    * @param uimaContext context used to resolve the parameter values
+    * @throws SQLException if the select statement cannot be created
+    */
+   public JdbcEncoder( final String name, final UimaContext uimaContext ) throws SQLException {
+      this( name,
+            getParameterValue( name, "driver", uimaContext, HSQL_DRIVER ),
+            getParameterValue( name, "url", uimaContext, "" ),
+            // Locale.ROOT keeps the default table name independent of the default locale.
+            getParameterValue( name, "table", uimaContext, name.toUpperCase( java.util.Locale.ROOT ) ),
+            getParameterValue( name, "user", uimaContext, DEFAULT_USER ),
+            getParameterValue( name, "pass", uimaContext, DEFAULT_PASS ),
+            getParameterValue( name, "class", uimaContext, CodeType.TEXT.name() ) );
+   }
+
+   /**
+    * @param name       unique name for the encoder
+    * @param jdbcDriver jdbc driver used to create the select statement
+    * @param jdbcUrl    jdbc url used to create the select statement
+    * @param tableName  table queried by the select statement
+    * @param jdbcUser   user used to create the select statement
+    * @param jdbcPass   password used to create the select statement
+    * @param codeType   name of the {@link CodeType} that decides how codes are read
+    * @throws SQLException if the select statement cannot be created
+    */
+   public JdbcEncoder( final String name,
+                       final String jdbcDriver,
+                       final String jdbcUrl,
+                       final String tableName,
+                       final String jdbcUser,
+                       final String jdbcPass,
+                       final String codeType ) throws SQLException {
+      _name = name;
+      _selectCodeStatement = JdbcUtil.createPreparedStatement( name,
+            jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, tableName, SchemaCode.CUI.name() );
+      LOGGER.info( "Connected to " + name + " table " + tableName );
+      _codeType = CodeType.getCodeType( codeType );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+
+   /**
+    * {@inheritDoc}
+    * <p>
+    * TEXT, LONG and INT codes carry the name of this encoder as schema; TUI and PREF_TEXT codes carry the
+    * name of the matching {@link CodeSchema}.  Any other code type is read like TEXT.
+    * </p>
+    */
+   @Override
+   public Collection<TermEncoding> getEncodings( final long cuiCode ) {
+      final int column = SchemaCode.SCHEMA_CODE.getColumn();
+      switch ( _codeType ) {
+         case LONG:
+            return selectEncodings( cuiCode, getName(), column, ResultSet::getLong );
+         case INT:
+            return selectEncodings( cuiCode, getName(), column, ResultSet::getInt );
+         case TUI:
+            return selectEncodings( cuiCode, CodeSchema.TUI.name(), column, ResultSet::getInt );
+         case PREF_TEXT:
+            return selectEncodings( cuiCode, CodeSchema.PREFERRED_TEXT.name(), column, ResultSet::getString );
+         case TEXT:
+         default:
+            return selectEncodings( cuiCode, getName(), column, ResultSet::getString );
+      }
+   }
+
+
+   /**
+    * Run the select statement for a cui and turn each returned row into a {@link TermEncoding}.
+    * An {@link SQLException} is logged and whatever was read up to that point is returned.
+    *
+    * @param cuiCode cui code to select
+    * @param schema  schema name given to every encoding created
+    * @param column  index of the result column holding the code
+    * @param reader  reads the code from the column with the proper type
+    * @return encodings read for the cui, possibly empty
+    */
+   private Collection<TermEncoding> selectEncodings( final long cuiCode,
+                                                     final String schema,
+                                                     final int column,
+                                                     final ColumnReader reader ) {
+      final List<TermEncoding> encodings = new ArrayList<>();
+      try {
+         JdbcUtil.fillSelectCall( _selectCodeStatement, cuiCode );
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ...  historically some drivers have not done so.
+         // try-with-resources also closes the ResultSet when reading a row fails.
+         try ( ResultSet resultSet = _selectCodeStatement.executeQuery() ) {
+            while ( resultSet.next() ) {
+               encodings.add( new TermEncoding( schema, reader.read( resultSet, column ) ) );
+            }
+         }
+      } catch ( SQLException e ) {
+         // Hand the exception itself to the logger so the stack trace is kept, not only the message.
+         LOGGER.error( e.getMessage(), e );
+      }
+      return encodings;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,31 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+
+import java.util.Collection;
+
+/**
+ * Source of {@link TermEncoding}s for terms identified by a cui code.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/17/2020
+ */
+public interface TermEncoder {
+
+   /**
+    * The Type identifier and Name are used to maintain a collection of term encoders,
+    * so the combination of Type and Name should be unique for each encoder if possible.
+    *
+    * @return simple name for the encoder
+    */
+   String getName();
+
+
+   /**
+    * @param cuiCode cui code of some term
+    * @return encodings that this encoder holds for the cui code
+    */
+   Collection<TermEncoding> getEncodings( final long cuiCode );
+
+
+   /**
+    * Convenience form of {@link #getEncodings(long)} that takes the cui code from a discovered term.
+    *
+    * @param discoveredTerm term whose cui code is looked up
+    * @return encodings that this encoder holds for the cui code of the term
+    */
+   default Collection<TermEncoding> getEncodings( final DiscoveredTerm discoveredTerm ) {
+      final long cuiCode = discoveredTerm.getCuiCode();
+      return getEncodings( cuiCode );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,42 @@
+package org.apache.ctakes.dictionary.cased.encoder;
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+
+
+/**
+ * Pairing of a schema name with one code in that schema.  Two encodings are equal when both their
+ * schema names and their codes are equal.
+ * <p>
+ * NOTE(review): the Immutable annotation used here is imported from a JDK-internal Nashorn package;
+ * confirm that the target JDK still provides it.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+@Immutable
+final public class TermEncoding {
+
+   private final String _schema;
+   private final Object _schemaCode;
+
+
+   /**
+    * @param schema     name of the schema that the code belongs to
+    * @param schemaCode code within the schema
+    */
+   public TermEncoding( final String schema,
+                        final Object schemaCode ) {
+      _schema = schema;
+      _schemaCode = schemaCode;
+   }
+
+   /**
+    * @return name of the schema that the code belongs to
+    */
+   public String getSchema() {
+      return _schema;
+   }
+
+   /**
+    * @return code within the schema
+    */
+   public Object getSchemaCode() {
+      return _schemaCode;
+   }
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return true if the other object is a TermEncoding with equal schema and equal code; null-safe
+    */
+   @Override
+   public boolean equals( final Object object ) {
+      if ( this == object ) {
+         return true;
+      }
+      if ( !(object instanceof TermEncoding) ) {
+         return false;
+      }
+      final TermEncoding other = (TermEncoding)object;
+      // Objects.equals tolerates a null schema or code, which previously caused a NullPointerException.
+      return java.util.Objects.equals( other._schema, _schema )
+             && java.util.Objects.equals( other._schemaCode, _schemaCode );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int hashCode() {
+      return (_schema + '_' + _schemaCode).hashCode();
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,227 @@
+package org.apache.ctakes.dictionary.cased.lookup;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTerm;
+
+import java.util.Arrays;
+
+
+/**
+ * A dictionary term held as the tokens before its rare word, the rare word itself and the tokens after it,
+ * together with its cui code, letter-case flags, rank and instance count.
+ * Two candidate terms are equal when they have the same cui code and the same tokens.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+@Immutable
+final public class CandidateTerm {
+
+
+   private final long _cuiCode;
+
+   private final String[] _prefixes;
+   private final String _rareWord;
+   private final String[] _suffixes;
+   final private boolean _allUpperCase;
+   final private boolean _allLowerCase;
+   final private boolean _matchesLookupCase;
+   private final int _rank;
+   private final int _instances;
+
+   // Hash of cui code and space-joined tokens, computed once at construction.
+   final private int _hashCode;
+
+
+   /**
+    * Create a term from an already tokenized term.  Case flags are taken from the tokenized term,
+    * the term is marked as matching the lookup case, and rank and instances are both 1.
+    *
+    * @param tokenizedTerm source of cui, tokens and case flags
+    * @param rareWordIndex index of the rare word within the tokens
+    */
+   public CandidateTerm( final TokenizedTerm tokenizedTerm, final int rareWordIndex ) {
+      _cuiCode = tokenizedTerm.getCui();
+      final String[] tokens = tokenizedTerm.getTokens();
+      _prefixes = rareWordIndex == 0
+                  ? new String[ 0 ]
+                  : Arrays.copyOf( tokens, rareWordIndex );
+      _rareWord = tokens[ rareWordIndex ];
+      _suffixes = Arrays.copyOfRange( tokens, rareWordIndex + 1, tokens.length );
+      _allUpperCase = tokenizedTerm.isAllUpperCase();
+      _allLowerCase = tokenizedTerm.isAllLowerCase();
+      _matchesLookupCase = true;
+      _hashCode = (_cuiCode + "_" + String.join( " ", tokens )).hashCode();
+      _rank = 1;
+      _instances = 1;
+   }
+
+
+   /**
+    * @param cuiCode        cui code of the term
+    * @param tokens         all tokens of the term
+    * @param rareWordIndex  index of the rare word within the tokens
+    * @param lookupAllUpper true if the lookup text is all upper case
+    * @param lookupAllLower true if the lookup text is all lower case
+    * @param rank           rank of the term
+    * @param instances      instance count of the term
+    */
+   public CandidateTerm( final long cuiCode,
+                         final String[] tokens,
+                         final int rareWordIndex,
+                         final boolean lookupAllUpper,
+                         final boolean lookupAllLower,
+                         final int rank,
+                         final int instances ) {
+      _cuiCode = cuiCode;
+      _prefixes = rareWordIndex == 0
+                  ? new String[ 0 ]
+                  : Arrays.copyOf( tokens, rareWordIndex );
+      _rareWord = tokens[ rareWordIndex ];
+      _suffixes = Arrays.copyOfRange( tokens, rareWordIndex + 1, tokens.length );
+      final String letters = String.join( "", tokens );
+      final boolean anyCaps = hasUpperCase( letters );
+      final boolean anyLower = hasLowerCase( letters );
+      _allUpperCase = anyCaps && !anyLower;
+      _allLowerCase = anyLower && !anyCaps;
+
+      _hashCode = (cuiCode + "_" + String.join( " ", tokens )).hashCode();
+      _matchesLookupCase = _allUpperCase == lookupAllUpper && _allLowerCase == lookupAllLower;
+      _rank = rank;
+      _instances = instances;
+   }
+
+
+   /**
+    * @param cuiCode        cui code of the term
+    * @param prefix         space-separated tokens before the rare word, empty if there are none
+    * @param rareWord       the rare word of the term
+    * @param suffix         space-separated tokens after the rare word, empty if there are none
+    * @param lookupAllUpper true if the lookup text is all upper case
+    * @param lookupAllLower true if the lookup text is all lower case
+    * @param rank           rank of the term
+    * @param instances      instance count of the term
+    */
+   public CandidateTerm( final long cuiCode,
+                         final String prefix,
+                         final String rareWord,
+                         final String suffix,
+                         final boolean lookupAllUpper,
+                         final boolean lookupAllLower,
+                         final int rank,
+                         final int instances ) {
+      _cuiCode = cuiCode;
+      _prefixes = prefix.isEmpty()
+                  ? new String[ 0 ]
+                  : StringUtil.fastSplit( prefix, ' ' );
+      _rareWord = rareWord;
+      _suffixes = suffix.isEmpty()
+                  ? new String[ 0 ]
+                  : StringUtil.fastSplit( suffix, ' ' );
+      final String letters = prefix + rareWord + suffix;
+      final boolean anyCaps = hasUpperCase( letters );
+      final boolean anyLower = hasLowerCase( letters );
+      _allUpperCase = anyCaps && !anyLower;
+      _allLowerCase = anyLower && !anyCaps;
+      _hashCode = (cuiCode + "_"
+                   + (prefix.isEmpty() ? "" : prefix + " ")
+                   + rareWord
+                   + (suffix.isEmpty() ? "" : " " + suffix))
+            .hashCode();
+      _matchesLookupCase = _allUpperCase == lookupAllUpper && _allLowerCase == lookupAllLower;
+      _rank = rank;
+      _instances = instances;
+   }
+
+
+   /**
+    * @param text any text
+    * @return true if at least one character of the text is upper case
+    */
+   static private boolean hasUpperCase( final String text ) {
+      for ( int i = 0; i < text.length(); i++ ) {
+         if ( Character.isUpperCase( text.charAt( i ) ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+   /**
+    * @param text any text
+    * @return true if at least one character of the text is lower case
+    */
+   static private boolean hasLowerCase( final String text ) {
+      for ( int i = 0; i < text.length(); i++ ) {
+         if ( Character.isLowerCase( text.charAt( i ) ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+
+   /**
+    * @return umls cui for the term
+    */
+   public Long getCuiCode() {
+      return _cuiCode;
+   }
+
+
+   /**
+    * @return each token in the term as a separate String, in a newly created array
+    */
+   public String[] getTokens() {
+      final String[] tokens = new String[ _prefixes.length + 1 + _suffixes.length ];
+      System.arraycopy( _prefixes, 0, tokens, 0, _prefixes.length );
+      tokens[ _prefixes.length ] = _rareWord;
+      System.arraycopy( _suffixes, 0, tokens, _prefixes.length + 1, _suffixes.length );
+      return tokens;
+   }
+
+
+   /**
+    * @return tokens before the rare word.  This is the internal array, callers must not modify it
+    */
+   public String[] getPrefixes() {
+      return _prefixes;
+   }
+
+   /**
+    * @return tokens before the rare word in lower case.  For an all lower case term this is the internal array
+    */
+   public String[] getLowerPrefixes() {
+      if ( isAllLowerCase() ) {
+         return _prefixes;
+      }
+      return Arrays.stream( _prefixes ).map( String::toLowerCase ).toArray( String[]::new );
+   }
+
+
+   /**
+    * @return tokens after the rare word.  This is the internal array, callers must not modify it
+    */
+   public String[] getSuffixes() {
+      return _suffixes;
+   }
+
+   /**
+    * @return tokens after the rare word in lower case.  For an all lower case term this is the internal array
+    */
+   public String[] getLowerSuffixes() {
+      if ( isAllLowerCase() ) {
+         return _suffixes;
+      }
+      return Arrays.stream( _suffixes ).map( String::toLowerCase ).toArray( String[]::new );
+   }
+
+   /**
+    * @return the index of the rare word used for indexing in the token array
+    */
+   public int getRareWordIndex() {
+      return _prefixes.length;
+   }
+
+
+   /**
+    * @return number of tokens in the term, counting prefixes, rare word and suffixes
+    */
+   public int getTokenCount() {
+      return _prefixes.length + 1 + _suffixes.length;
+   }
+
+   /**
+    * @return true if the term has upper case characters and no lower case characters
+    */
+   public boolean isAllUpperCase() {
+      return _allUpperCase;
+   }
+
+   /**
+    * @return true if the term has lower case characters and no upper case characters
+    */
+   public boolean isAllLowerCase() {
+      return _allLowerCase;
+   }
+
+   /**
+    * @return true if the all-upper and all-lower flags of the term both equal those given for the lookup
+    */
+   public boolean matchesLookupCase() {
+      return _matchesLookupCase;
+   }
+
+   /**
+    * @return rank of the term
+    */
+   public int getRank() {
+      return _rank;
+   }
+
+   /**
+    * @return instance count of the term
+    */
+   public int getInstances() {
+      return _instances;
+   }
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return true if the other object is a CandidateTerm with the same cui code and the same tokens
+    */
+   @Override
+   public boolean equals( final Object value ) {
+      if ( this == value ) {
+         return true;
+      }
+      if ( !(value instanceof CandidateTerm) ) {
+         return false;
+      }
+      final CandidateTerm other = (CandidateTerm)value;
+      // The hash is only a cheap first filter.  Unrelated terms can share a hash code,
+      // so the cui code and the tokens themselves decide equality.
+      return _hashCode == other._hashCode
+             && _cuiCode == other._cuiCode
+             && Arrays.equals( getTokens(), other.getTokens() );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int hashCode() {
+      return _hashCode;
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,236 @@
+package org.apache.ctakes.dictionary.cased.lookup;
+
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.dictionary.cased.dictionary.CasedDictionary;
+import org.apache.log4j.Logger;
+
+import java.util.*;
+
+/**
+ * Lookup engine that discovers dictionary terms whose tokens appear contiguously in a window of lookup tokens.
+ * <p>
+ * Every valid index token in the window is used to fetch candidate terms from a {@link CasedDictionary}.
+ * A candidate is accepted when its prefix tokens match the tokens directly before the index token
+ * and its suffix tokens match the tokens directly after it.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+final public class ContiguousLookupEngine {
+
+   static private final Logger LOGGER = Logger.getLogger( "ContiguousLookupEngine" );
+
+   // Result of a successful match: zero consecutive skips and zero total skips.
+   static private final Pair<Integer> HIT = new Pair<>( 0, 0 );
+   // Result of a failed match.
+   static private final Pair<Integer> MISS = new Pair<>( -1, -1 );
+
+
+   /**
+    * Given a dictionary and the lookup tokens of a window, collect all terms discovered in that window.
+    *
+    * @param dictionary         source of candidate terms for each valid index token
+    * @param lookupTokens       all tokens in the window, in text order
+    * @param consecutiveSkipMax not used by this implementation; matching is strictly contiguous
+    * @param totalSkipMax       not used by this implementation; matching is strictly contiguous
+    * @return map of text spans to terms discovered at those text spans.
+    */
+   public final Map<Pair<Integer>, Collection<DiscoveredTerm>> findTerms( final CasedDictionary dictionary,
+                                                                          final List<LookupToken> lookupTokens,
+                                                                          final int consecutiveSkipMax,
+                                                                          final int totalSkipMax ) {
+      final Map<Pair<Integer>, Collection<DiscoveredTerm>> discoveredTermMap = new HashMap<>();
+      final int windowSize = lookupTokens.size();
+      int lookupTokenIndex = -1;
+      for ( LookupToken lookupToken : lookupTokens ) {
+         lookupTokenIndex++;
+         if ( !lookupToken.isValidIndexToken() ) {
+            continue;
+         }
+         final Collection<CandidateTerm> candidateTerms = dictionary.getCandidateTerms( lookupToken );
+         if ( candidateTerms == null || candidateTerms.isEmpty() ) {
+            continue;
+         }
+         for ( CandidateTerm candidateTerm : candidateTerms ) {
+            if ( candidateTerm.getTokenCount() == 1 ) {
+               // Single word term, add and move on
+               discoveredTermMap.computeIfAbsent( lookupToken.getTextSpan(), s -> new HashSet<>() )
+                                .add( new DiscoveredTerm( candidateTerm ) );
+               continue;
+            }
+            // Indices of the first and last window tokens that the whole term would cover.
+            final int firstIndex = lookupTokenIndex - candidateTerm.getPrefixes().length;
+            final int lastIndex = lookupTokenIndex + candidateTerm.getSuffixes().length;
+            if ( firstIndex < 0 || lastIndex >= windowSize ) {
+               // term will extend beyond window.  A term that starts exactly at the first token still fits.
+               continue;
+            }
+            if ( isMismatch( getPrefixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) {
+               continue;
+            }
+            if ( isMismatch( getSuffixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) {
+               continue;
+            }
+            final int spanBegin = lookupTokens.get( firstIndex ).getBegin();
+            final int spanEnd = lookupTokens.get( lastIndex ).getEnd();
+            discoveredTermMap.computeIfAbsent( new Pair<>( spanBegin, spanEnd ), s -> new HashSet<>() )
+                             .add( new DiscoveredTerm( candidateTerm ) );
+         }
+      }
+      return discoveredTermMap;
+   }
+
+
+   /**
+    * @param skips result of a prefix or suffix match
+    * @return true if the result is the "no fit" value -1,-1
+    */
+   static private boolean isMismatch( final Pair<Integer> skips ) {
+      return MISS.equals( skips );
+   }
+
+
+   /**
+    * Checks the prefix tokens of a term against the window tokens that directly precede the index token,
+    * honoring the case rules of the term.
+    *
+    * @param candidateTerm    rare word term to check for match
+    * @param allTokens        all tokens in a window
+    * @param lookupTokenIndex index in allTokens of the token that matched the term's rare (index) word
+    * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens.  -1,-1 if no fit.
+    */
+   public static Pair<Integer> getPrefixMatch( final CandidateTerm candidateTerm,
+                                               final List<LookupToken> allTokens,
+                                               final int lookupTokenIndex ) {
+      final String[] prefixes = candidateTerm.getPrefixes();
+      if ( prefixes.length == 0 ) {
+         return HIT;
+      }
+      final String[] lowerPrefixes = candidateTerm.getLowerPrefixes();
+      // The prefixes occupy the window tokens that end directly before the index token.
+      final int firstTokenIndex = lookupTokenIndex - prefixes.length;
+      return isCasedMatch( candidateTerm, prefixes, lowerPrefixes, allTokens, firstTokenIndex ) ? HIT : MISS;
+   }
+
+   /**
+    * Checks the suffix tokens of a term against the window tokens that directly follow the index token,
+    * honoring the case rules of the term.
+    *
+    * @param candidateTerm    rare word term to check for match
+    * @param allTokens        all tokens in a window
+    * @param lookupTokenIndex index in allTokens of the token that matched the term's rare (index) word
+    * @return the consecutiveSkips and totalSkips required to make the suffix fit the tokens.  -1,-1 if no fit.
+    */
+   public static Pair<Integer> getSuffixMatch( final CandidateTerm candidateTerm,
+                                               final List<LookupToken> allTokens,
+                                               final int lookupTokenIndex ) {
+      final String[] suffixes = candidateTerm.getSuffixes();
+      if ( suffixes.length == 0 ) {
+         return HIT;
+      }
+      //  TODO - Do we really want lower-case candidates?
+      //   They should be stored in the dictionary as the desired case.
+      final String[] lowerSuffixes = candidateTerm.getLowerSuffixes();
+      // The suffixes occupy the window tokens that start directly after the index token.
+      final int firstTokenIndex = lookupTokenIndex + 1;
+      return isCasedMatch( candidateTerm, suffixes, lowerSuffixes, allTokens, firstTokenIndex ) ? HIT : MISS;
+   }
+
+
+   /**
+    * Checks the prefix tokens of a term against the window tokens that directly precede the index token,
+    * comparing the token text exactly as written.
+    *
+    * @param candidateTerm      rare word term to check for match
+    * @param allTokens          all tokens in a window
+    * @param lookupTokenIndex   index in allTokens of the token that matched the term's rare (index) word
+    * @param consecutiveSkipMax not used yet; no tokens are skipped
+    * @param totalSkipMax       not used yet; no tokens are skipped
+    * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens.  -1,-1 if no fit.
+    */
+   public static Pair<Integer> getPrefixMatch( final CandidateTerm candidateTerm,
+                                               final List<LookupToken> allTokens,
+                                               final int lookupTokenIndex,
+                                               final int consecutiveSkipMax,
+                                               final int totalSkipMax ) {
+      final String[] prefixes = candidateTerm.getPrefixes();
+      if ( prefixes.length == 0 ) {
+         return HIT;
+      }
+      // TODO Add overlap logic ...
+      return isExactMatch( prefixes, allTokens, lookupTokenIndex - prefixes.length ) ? HIT : MISS;
+   }
+
+
+   /**
+    * Checks the suffix tokens of a term against the window tokens that directly follow the index token,
+    * comparing the token text exactly as written.
+    *
+    * @param candidateTerm      rare word term to check for match
+    * @param allTokens          all tokens in a window
+    * @param lookupTokenIndex   index in allTokens of the token that matched the term's rare (index) word
+    * @param consecutiveSkipMax not used yet; no tokens are skipped
+    * @param totalSkipMax       not used yet; no tokens are skipped
+    * @return the consecutiveSkips and totalSkips required to make the suffix fit the tokens.  -1,-1 if no fit.
+    */
+   public static Pair<Integer> getSuffixMatch( final CandidateTerm candidateTerm,
+                                               final List<LookupToken> allTokens,
+                                               final int lookupTokenIndex,
+                                               final int consecutiveSkipMax,
+                                               final int totalSkipMax ) {
+      final String[] suffixes = candidateTerm.getSuffixes();
+      if ( suffixes.length == 0 ) {
+         return HIT;
+      }
+      // TODO Add overlap logic ...
+      return isExactMatch( suffixes, allTokens, lookupTokenIndex + 1 ) ? HIT : MISS;
+   }
+
+
+   /**
+    * Compares consecutive term tokens with consecutive window tokens using the case rules of the term:
+    * an all-upper-case term needs all-upper-case tokens with identical text, a mixed-case term needs
+    * identical text, and an all-lower-case term only needs the lower-cased text to be equal.
+    *
+    * @param candidateTerm   term that owns the tokens; supplies the case flags
+    * @param termTokens      prefix or suffix tokens of the term, as stored
+    * @param lowerTermTokens lower-cased form of termTokens, index for index
+    * @param allTokens       all tokens in a window
+    * @param firstTokenIndex index in allTokens that must match termTokens[ 0 ]
+    * @return true if every term token matches; false on any mismatch or if the tokens do not fit in the window
+    */
+   static private boolean isCasedMatch( final CandidateTerm candidateTerm,
+                                        final String[] termTokens,
+                                        final String[] lowerTermTokens,
+                                        final List<LookupToken> allTokens,
+                                        final int firstTokenIndex ) {
+      if ( firstTokenIndex < 0 || firstTokenIndex + termTokens.length > allTokens.size() ) {
+         return false;
+      }
+      final boolean allUpper = candidateTerm.isAllUpperCase();
+      final boolean exactCase = allUpper || !candidateTerm.isAllLowerCase();
+      for ( int i = 0; i < termTokens.length; i++ ) {
+         final LookupToken lookupToken = allTokens.get( firstTokenIndex + i );
+         if ( allUpper && !lookupToken.isAllUpperCase() ) {
+            return false;
+         }
+         if ( exactCase && !termTokens[ i ].equals( lookupToken.getText() ) ) {
+            return false;
+         }
+         if ( !lowerTermTokens[ i ].equals( lookupToken.getLowerText() ) ) {
+            // the token normal didn't match
+            return false;
+         }
+      }
+      // the token normal matched
+      return true;
+   }
+
+
+   /**
+    * Compares consecutive term tokens with the exact text of consecutive window tokens.
+    *
+    * @param termTokens      prefix or suffix tokens of the term, as stored
+    * @param allTokens       all tokens in a window
+    * @param firstTokenIndex index in allTokens that must match termTokens[ 0 ]
+    * @return true if every term token matches; false on any mismatch or if the tokens do not fit in the window
+    */
+   static private boolean isExactMatch( final String[] termTokens,
+                                        final List<LookupToken> allTokens,
+                                        final int firstTokenIndex ) {
+      if ( firstTokenIndex < 0 || firstTokenIndex + termTokens.length > allTokens.size() ) {
+         return false;
+      }
+      for ( int i = 0; i < termTokens.length; i++ ) {
+         if ( !termTokens[ i ].equals( allTokens.get( firstTokenIndex + i ).getText() ) ) {
+            // the token normal didn't match
+            return false;
+         }
+      }
+      // the token normal matched
+      return true;
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,66 @@
+package org.apache.ctakes.dictionary.cased.lookup;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+
+/**
+ * Value object describing one dictionary term found in the text.
+ * It copies the cui code, case flag, rank and instance count of the {@code CandidateTerm} that matched
+ * and records how many tokens had to be skipped to make the match.  All fields are final.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/17/2020
+ */
+// NOTE(review): this file imports Immutable from jdk.nashorn.internal.ir.annotations, a JDK-internal package.
+//  Nashorn was removed from the JDK in version 15, so the import presumably stops compiling there - confirm.
+@Immutable
+final public class DiscoveredTerm {
+
+   private final long _cuiCode;
+   private final int _consecutiveSkips;
+   private final int _totalSkips;
+   private final boolean _matchesLookupCase;
+   private final int _rank;
+   private final int _instances;
+
+   /**
+    * Creates a discovered term for a match that needed no skipped tokens.
+    *
+    * @param candidateTerm the dictionary term that matched
+    */
+   public DiscoveredTerm( final CandidateTerm candidateTerm ) {
+      this( candidateTerm, 0, 0 );
+   }
+
+   /**
+    * Creates a discovered term, copying the descriptive values from the matched candidate.
+    *
+    * @param candidateTerm    the dictionary term that matched
+    * @param consecutiveSkips consecutive skip count recorded for the match
+    * @param totalSkips       total skip count recorded for the match
+    */
+   public DiscoveredTerm( final CandidateTerm candidateTerm,
+                          final int consecutiveSkips,
+                          final int totalSkips ) {
+      _cuiCode = candidateTerm.getCuiCode();
+      _consecutiveSkips = consecutiveSkips;
+      _totalSkips = totalSkips;
+      _matchesLookupCase = candidateTerm.matchesLookupCase();
+      _rank = candidateTerm.getRank();
+      _instances = candidateTerm.getInstances();
+   }
+
+   /**
+    * @return the cui code copied from the matched candidate term.
+    */
+   public long getCuiCode() {
+      return _cuiCode;
+   }
+
+   /**
+    * @return the "matches lookup case" flag copied from the matched candidate term.
+    */
+   public boolean matchesLookupCase() {
+      return _matchesLookupCase;
+   }
+
+   /**
+    * @return rank, where 1 is the "best".
+    */
+   public int getRank() {
+      return _rank;
+   }
+
+   /**
+    * @return number of source vocabularies that have this synonym for this cui.
+    */
+   public int getInstances() {
+      return _instances;
+   }
+
+   /**
+    * @return the total skip count given at construction; 0 when the single-argument constructor was used.
+    */
+   public int getTotalSkips() {
+      return _totalSkips;
+   }
+
+   /**
+    * @return the consecutive skip count given at construction; 0 when the single-argument constructor was used.
+    */
+   public int getConsecutiveSkips() {
+      return _consecutiveSkips;
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,236 @@
+package org.apache.ctakes.dictionary.cased.lookup;
+
+
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.dictionary.cased.dictionary.CasedDictionary;
+import org.apache.log4j.Logger;
+
+import java.util.*;
+
+/**
+ * Lookup engine that discovers dictionary terms whose tokens appear contiguously in a window of lookup tokens.
+ * <p>
+ * Every valid index token in the window is used to fetch candidate terms from a {@link CasedDictionary}.
+ * A candidate is accepted when its prefix tokens match the tokens directly before the index token
+ * and its suffix tokens match the tokens directly after it.
+ * Skip-aware matching is not implemented here; {@code ContiguousLookupEngine} carries the
+ * five-argument prefix / suffix overloads that are reserved for it.
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/17/2020
+ */
+public class LookupEngine {
+
+   static private final Logger LOGGER = Logger.getLogger( "LookupEngine" );
+
+   // Result of a successful match: zero consecutive skips and zero total skips.
+   static private final Pair<Integer> HIT = new Pair<>( 0, 0 );
+   // Result of a failed match.
+   static private final Pair<Integer> MISS = new Pair<>( -1, -1 );
+
+
+   /**
+    * Given a dictionary and the lookup tokens of a window, collect all terms discovered in that window.
+    *
+    * @param dictionary         source of candidate terms for each valid index token
+    * @param lookupTokens       all tokens in the window, in text order
+    * @param consecutiveSkipMax not used by this implementation; matching is strictly contiguous
+    * @param totalSkipMax       not used by this implementation; matching is strictly contiguous
+    * @return map of text spans to terms discovered at those text spans.
+    */
+   public final Map<Pair<Integer>, Collection<DiscoveredTerm>> findTerms( final CasedDictionary dictionary,
+                                                                          final List<LookupToken> lookupTokens,
+                                                                          final int consecutiveSkipMax,
+                                                                          final int totalSkipMax ) {
+      final Map<Pair<Integer>, Collection<DiscoveredTerm>> discoveredTermMap = new HashMap<>();
+      final int windowSize = lookupTokens.size();
+      int lookupTokenIndex = -1;
+      for ( LookupToken lookupToken : lookupTokens ) {
+         lookupTokenIndex++;
+         if ( !lookupToken.isValidIndexToken() ) {
+            continue;
+         }
+         final Collection<CandidateTerm> candidateTerms = dictionary.getCandidateTerms( lookupToken );
+         if ( candidateTerms == null || candidateTerms.isEmpty() ) {
+            continue;
+         }
+         for ( CandidateTerm candidateTerm : candidateTerms ) {
+            if ( candidateTerm.getTokenCount() == 1 ) {
+               // Single word term, add and move on
+               discoveredTermMap.computeIfAbsent( lookupToken.getTextSpan(), s -> new HashSet<>() )
+                                .add( new DiscoveredTerm( candidateTerm ) );
+               continue;
+            }
+            // Indices of the first and last window tokens that the whole term would cover.
+            final int firstIndex = lookupTokenIndex - candidateTerm.getPrefixes().length;
+            final int lastIndex = lookupTokenIndex + candidateTerm.getSuffixes().length;
+            if ( firstIndex < 0 || lastIndex >= windowSize ) {
+               // term will extend beyond window.  A term that starts exactly at the first token still fits.
+               continue;
+            }
+            if ( isMismatch( getPrefixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) {
+               continue;
+            }
+            if ( isMismatch( getSuffixMatch( candidateTerm, lookupTokens, lookupTokenIndex ) ) ) {
+               continue;
+            }
+            final int spanBegin = lookupTokens.get( firstIndex ).getBegin();
+            final int spanEnd = lookupTokens.get( lastIndex ).getEnd();
+            discoveredTermMap.computeIfAbsent( new Pair<>( spanBegin, spanEnd ), s -> new HashSet<>() )
+                             .add( new DiscoveredTerm( candidateTerm ) );
+         }
+      }
+      return discoveredTermMap;
+   }
+
+
+   /**
+    * @param skips result of a prefix or suffix match
+    * @return true if the result is the "no fit" value -1,-1
+    */
+   static private boolean isMismatch( final Pair<Integer> skips ) {
+      return MISS.equals( skips );
+   }
+
+
+   /**
+    * Checks the prefix tokens of a term against the window tokens that directly precede the index token,
+    * honoring the case rules of the term.
+    *
+    * @param candidateTerm    rare word term to check for match
+    * @param allTokens        all tokens in a window
+    * @param lookupTokenIndex index in allTokens of the token that matched the term's rare (index) word
+    * @return the consecutiveSkips and totalSkips required to make the prefix fit the tokens.  -1,-1 if no fit.
+    */
+   public static Pair<Integer> getPrefixMatch( final CandidateTerm candidateTerm,
+                                               final List<LookupToken> allTokens,
+                                               final int lookupTokenIndex ) {
+      final String[] prefixes = candidateTerm.getPrefixes();
+      if ( prefixes.length == 0 ) {
+         return HIT;
+      }
+      final String[] lowerPrefixes = candidateTerm.getLowerPrefixes();
+      // The prefixes occupy the window tokens that end directly before the index token.
+      final int firstTokenIndex = lookupTokenIndex - prefixes.length;
+      return isCasedMatch( candidateTerm, prefixes, lowerPrefixes, allTokens, firstTokenIndex ) ? HIT : MISS;
+   }
+
+   /**
+    * Checks the suffix tokens of a term against the window tokens that directly follow the index token,
+    * honoring the case rules of the term.
+    *
+    * @param candidateTerm    rare word term to check for match
+    * @param allTokens        all tokens in a window
+    * @param lookupTokenIndex index in allTokens of the token that matched the term's rare (index) word
+    * @return the consecutiveSkips and totalSkips required to make the suffix fit the tokens.  -1,-1 if no fit.
+    */
+   public static Pair<Integer> getSuffixMatch( final CandidateTerm candidateTerm,
+                                               final List<LookupToken> allTokens,
+                                               final int lookupTokenIndex ) {
+      final String[] suffixes = candidateTerm.getSuffixes();
+      if ( suffixes.length == 0 ) {
+         return HIT;
+      }
+      final String[] lowerSuffixes = candidateTerm.getLowerSuffixes();
+      // The suffixes occupy the window tokens that start directly after the index token.
+      final int firstTokenIndex = lookupTokenIndex + 1;
+      return isCasedMatch( candidateTerm, suffixes, lowerSuffixes, allTokens, firstTokenIndex ) ? HIT : MISS;
+   }
+
+
+   /**
+    * Compares consecutive term tokens with consecutive window tokens using the case rules of the term:
+    * an all-upper-case term needs all-upper-case tokens with identical text, a mixed-case term needs
+    * identical text, and an all-lower-case term only needs the lower-cased text to be equal.
+    *
+    * @param candidateTerm   term that owns the tokens; supplies the case flags
+    * @param termTokens      prefix or suffix tokens of the term, as stored
+    * @param lowerTermTokens lower-cased form of termTokens, index for index
+    * @param allTokens       all tokens in a window
+    * @param firstTokenIndex index in allTokens that must match termTokens[ 0 ]
+    * @return true if every term token matches; false on any mismatch or if the tokens do not fit in the window
+    */
+   static private boolean isCasedMatch( final CandidateTerm candidateTerm,
+                                        final String[] termTokens,
+                                        final String[] lowerTermTokens,
+                                        final List<LookupToken> allTokens,
+                                        final int firstTokenIndex ) {
+      if ( firstTokenIndex < 0 || firstTokenIndex + termTokens.length > allTokens.size() ) {
+         return false;
+      }
+      final boolean allUpper = candidateTerm.isAllUpperCase();
+      final boolean exactCase = allUpper || !candidateTerm.isAllLowerCase();
+      for ( int i = 0; i < termTokens.length; i++ ) {
+         final LookupToken lookupToken = allTokens.get( firstTokenIndex + i );
+         if ( allUpper && !lookupToken.isAllUpperCase() ) {
+            return false;
+         }
+         if ( exactCase && !termTokens[ i ].equals( lookupToken.getText() ) ) {
+            return false;
+         }
+         if ( !lowerTermTokens[ i ].equals( lookupToken.getLowerText() ) ) {
+            // the token normal didn't match
+            return false;
+         }
+      }
+      // the token normal matched
+      return true;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,124 @@
+package org.apache.ctakes.dictionary.cased.lookup;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+
+/**
+ * Immutable snapshot of one document token as used by the cased dictionary lookup.
+ * Holds the token's text span, its text exactly as written, a lower-cased copy of that text,
+ * flags describing its case, and whether the token may be used as an index into the dictionary.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+// NOTE(review): this file imports Immutable from jdk.nashorn.internal.ir.annotations, a JDK-internal package.
+//  Nashorn was removed from the JDK in version 15, so the import presumably stops compiling there - confirm.
+@Immutable
+final public class LookupToken {
+
+   final private Pair<Integer> _textSpan;
+   final private String _text;
+   // Lower-cased text, computed once because the lookup engines ask for it on every comparison.
+   final private String _lowerText;
+   final private boolean _allCaps;
+   final private boolean _allLower;
+   final private boolean _isValidIndexToken;
+
+   /**
+    * @param baseToken         token in the document; supplies the begin and end offsets and the covered text
+    * @param isValidIndexToken true if this token may be used to fetch candidate terms from a dictionary
+    */
+   public LookupToken( final BaseToken baseToken, final boolean isValidIndexToken ) {
+      _textSpan = new Pair<>( baseToken.getBegin(), baseToken.getEnd() );
+      // All case-sensitivity is handled here.  This is the text in the note.
+      _text = baseToken.getCoveredText();
+      boolean anyCaps = false;
+      boolean anyLower = false;
+      for ( char c : _text.toCharArray() ) {
+         if ( Character.isUpperCase( c ) ) {
+            anyCaps = true;
+         } else if ( Character.isLowerCase( c ) ) {
+            anyLower = true;
+         }
+         if ( anyCaps && anyLower ) {
+            // Mixed case is already certain, no need to look at the remaining characters.
+            break;
+         }
+      }
+      // Text without any cased character (digits, punctuation) is neither all caps nor all lower.
+      _allCaps = anyCaps && !anyLower;
+      _allLower = anyLower && !anyCaps;
+      _lowerText = _allLower ? _text : _text.toLowerCase();
+      _isValidIndexToken = isValidIndexToken;
+   }
+
+   /**
+    * @return a span with the start and end indices used for this lookup token
+    */
+   public Pair<Integer> getTextSpan() {
+      return _textSpan;
+   }
+
+   /**
+    * @return the start index used for this lookup token
+    */
+   public int getBegin() {
+      return _textSpan.getValue1();
+   }
+
+   /**
+    * @return the end index used for this lookup token
+    */
+   public int getEnd() {
+      return _textSpan.getValue2();
+   }
+
+   /**
+    * @return the length of the token text in characters
+    */
+   public int getLength() {
+      return _text.length();
+   }
+
+   /**
+    * @return the actual text in the document for the lookup token, with its case unchanged.
+    */
+   public String getText() {
+      return _text;
+   }
+
+   /**
+    * @return the text in the document for the lookup token, converted to lower case.
+    */
+   public String getLowerText() {
+      return _lowerText;
+   }
+
+   /**
+    * @return true if the text has at least one upper case character and no lower case character.
+    */
+   public boolean isAllUpperCase() {
+      return _allCaps;
+   }
+
+   /**
+    * @return true if the text has at least one lower case character and no upper case character.
+    */
+   public boolean isAllLowerCase() {
+      return _allLower;
+   }
+
+   /**
+    * @return the flag given at construction: true if this token may be used to fetch candidate terms.
+    */
+   public boolean isValidIndexToken() {
+      return _isValidIndexToken;
+   }
+
+   /**
+    * Two lookup tokens are equal iff the spans are equal.
+    *
+    * @param value -
+    * @return true if {@code value} is a {@code LookupToken} and has a span equal to this token's span
+    */
+   @Override
+   public boolean equals( final Object value ) {
+      return value instanceof LookupToken
+             && _textSpan.equals( ((LookupToken)value).getTextSpan() );
+   }
+
+   /**
+    * @return hashCode created from the Span
+    */
+   @Override
+   public int hashCode() {
+      return _textSpan.hashCode();
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,35 @@
+package org.apache.ctakes.dictionary.cased.table.column;
+
+/**
+ * Kinds of code values known to the cased dictionary tables.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+public enum CodeType {
+   INT,
+   LONG,
+   TEXT,
+   TUI,
+   PREF_TEXT;
+//     VARCHAR(48)  ,  BIGINT  ,  FLOAT  ,  INTEGER   ??
+
+
+   /**
+    * Lenient, case-insensitive replacement for {@link #valueOf(String)}.
+    * {@code valueOf} throws an IllegalArgumentException for an unknown name; this method returns
+    * {@link #TEXT} instead, also when the name is {@code null}.
+    *
+    * @param name name of a code type, in any case
+    * @return the code type with the given name, or {@link #TEXT} if there is none
+    */
+   static public CodeType getCodeType( final String name ) {
+      if ( name == null ) {
+         return TEXT;
+      }
+      // Locale.ROOT keeps the mapping stable; a Turkish default locale upper-cases "i" to a dotted capital,
+      // which would make "int" and "tui" miss their constants.
+      final String upper = name.toUpperCase( java.util.Locale.ROOT );
+      for ( CodeType codeType : CodeType.values() ) {
+         if ( codeType.name().equals( upper ) ) {
+            return codeType;
+         }
+      }
+      // Return TEXT as a default.
+      return TEXT;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/SchemaCode.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/SchemaCode.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/SchemaCode.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/SchemaCode.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,22 @@
+package org.apache.ctakes.dictionary.cased.table.column;
+
+/**
+ * Columns of a schema code row, each paired with a fixed column number.
+ * NOTE(review): the numbers start at 1, presumably JDBC-style column positions - confirm against the table reader.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+public enum SchemaCode {
+   CUI( 1 ),
+   SCHEMA_CODE( 2 );
+
+   private final int _columnNumber;
+
+   /**
+    * @param columnNumber column number assigned to this constant
+    */
+   SchemaCode( final int columnNumber ) {
+      this._columnNumber = columnNumber;
+   }
+
+   /**
+    * @return the column number assigned to this constant
+    */
+   public int getColumn() {
+      return this._columnNumber;
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/Synonym.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/Synonym.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/Synonym.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/Synonym.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,32 @@
+package org.apache.ctakes.dictionary.cased.table.column;
+
+/**
+ * Columns of a synonym row, each paired with a fixed column number and the Java type of its value.
+ * NOTE(review): the numbers start at 1, presumably JDBC-style column positions - confirm against the table reader.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+public enum Synonym {
+   CUI( 1, Long.class ),
+   PREFIX( 2, String.class ),
+   INDEX_WORD( 3, String.class ),
+   SUFFIX( 4, String.class ),
+   RANK( 5, Integer.class ),
+   INSTANCES( 6, Integer.class );
+
+   private final int _columnNumber;
+   private final Class<?> _valueType;
+
+   /**
+    * @param columnNumber column number assigned to this constant
+    * @param valueType    Java type of the values held in that column
+    */
+   Synonym( final int columnNumber, final Class<?> valueType ) {
+      this._columnNumber = columnNumber;
+      this._valueType = valueType;
+   }
+
+   /**
+    * @return the column number assigned to this constant
+    */
+   public int getColumn() {
+      return this._columnNumber;
+   }
+
+   /**
+    * @return the Java type of the values held in this column
+    */
+   public Class<?> getClassType() {
+      return this._valueType;
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,51 @@
+package org.apache.ctakes.dictionary.cased.util.bsv;
+
+
+import org.apache.ctakes.core.resource.FileLocator;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collection;
+
+/**
+ * Static helpers that read a bar-separated-value (BSV) file line by line and turn each line into an object.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+final public class BsvFileParser {
+
+   // Utility class, not meant to be instantiated.
+   private BsvFileParser() {
+   }
+
+
+   /**
+    * Parses a BSV file into string arrays, passing {@code Integer.MAX_VALUE} as the column count.
+    *
+    * @param bsvFilePath path to the file, resolved by {@code FileLocator}
+    * @return one string array per line that the creator accepted
+    * @throws IOException if the file cannot be found or read
+    */
+   static public Collection<String[]> parseBsvFile( final String bsvFilePath ) throws IOException {
+      return parseBsvFile( bsvFilePath, Integer.MAX_VALUE );
+   }
+
+
+   /**
+    * Parses a BSV file into string arrays using a {@code StringArrayCreator} built with the given column count.
+    *
+    * @param bsvFilePath path to the file, resolved by {@code FileLocator}
+    * @param columnCount column count handed to the {@code StringArrayCreator}
+    * @return one string array per line that the creator accepted
+    * @throws IOException if the file cannot be found or read
+    */
+   static public Collection<String[]> parseBsvFile( final String bsvFilePath,
+                                                    final int columnCount ) throws IOException {
+      return parseBsvFile( bsvFilePath, new StringArrayCreator( columnCount ) );
+   }
+
+
+   /**
+    * Parses a BSV file, handing every line to the given creator and collecting the non-null results
+    * in file order.  The reader is closed when parsing ends, whether normally or with an exception.
+    *
+    * @param bsvFilePath   path to the file, resolved by {@code FileLocator}
+    * @param objectCreator turns one line of the file into an object, or null to drop the line
+    * @param <T>           type of object created for each line
+    * @return the objects created from the lines of the file
+    * @throws IOException if the file cannot be found or read
+    */
+   static public <T> Collection<T> parseBsvFile( final String bsvFilePath,
+                                                 final BsvObjectCreator<T> objectCreator ) throws IOException {
+      final Collection<T> bsvObjects = new ArrayList<>();
+      // NOTE(review): the reader uses the platform default charset - confirm that dictionary files match it.
+      try ( BufferedReader reader
+                  = new BufferedReader( new InputStreamReader( FileLocator.getAsStream( bsvFilePath ) ) ) ) {
+         String line = reader.readLine();
+         while ( line != null ) {
+            final T bsvObject = objectCreator.createBsvObject( line );
+            if ( bsvObject != null ) {
+               bsvObjects.add( bsvObject );
+            }
+            line = reader.readLine();
+         }
+      }
+      return bsvObjects;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,35 @@
+package org.apache.ctakes.dictionary.cased.util.bsv;
+
+import org.apache.ctakes.core.util.StringUtil;
+
+import java.util.Arrays;
+
+/**
+ * Creates an object of type T from one line, or the split columns, of a bar-separated value file.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+public interface BsvObjectCreator<T> {
+
+   /**
+    * @param columns the values of a single line, already split on the bar character
+    * @return the object built from the columns
+    */
+   T createBsvObject( final String[] columns );
+
+   /**
+    * Splits the line on '|' and builds an object from the resulting columns.
+    *
+    * @param line a single line of text
+    * @return the created object, or null if the line is a comment line or has any empty column
+    */
+   default T createBsvObject( final String line ) {
+      if ( !isCommentLine( line ) ) {
+         final String[] splits = StringUtil.fastSplit( line, '|' );
+         if ( !isAnyColumnEmpty( splits ) ) {
+            return createBsvObject( splits );
+         }
+      }
+      return null;
+   }
+
+   /**
+    * @param line a single line of text
+    * @return true if the line is empty or starts with "//" or "#"
+    */
+   default boolean isCommentLine( final String line ) {
+      if ( line.isEmpty() ) {
+         return true;
+      }
+      return line.startsWith( "//" ) || line.startsWith( "#" );
+   }
+
+   /**
+    * @param columns the split values of a line
+    * @return true if at least one column is empty once trimmed
+    */
+   default boolean isAnyColumnEmpty( final String[] columns ) {
+      for ( String column : columns ) {
+         if ( column.trim().isEmpty() ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,29 @@
+package org.apache.ctakes.dictionary.cased.util.bsv;
+
+
+/**
+ * {@link BsvObjectCreator} that hands back the columns of a line unchanged,
+ * optionally rejecting lines that do not have an exact number of columns.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+public class StringArrayCreator implements BsvObjectCreator<String[]> {
+
+   // Integer.MAX_VALUE means that any number of columns is accepted.
+   private final int _columnCount;
+
+   /**
+    * Creator that accepts any number of columns.
+    */
+   public StringArrayCreator() {
+      this( Integer.MAX_VALUE );
+   }
+
+   /**
+    * @param columnCount exact number of columns required, or Integer.MAX_VALUE to accept any number
+    */
+   public StringArrayCreator( final int columnCount ) {
+      _columnCount = columnCount;
+   }
+
+   /**
+    * @param columns the split values of a line
+    * @return the given columns, or null if a column count is required and the length differs
+    */
+   @Override
+   public String[] createBsvObject( final String[] columns ) {
+      final boolean anyCountAllowed = _columnCount == Integer.MAX_VALUE;
+      return (anyCountAllowed || columns.length == _columnCount) ? columns : null;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,100 @@
+package org.apache.ctakes.dictionary.cased.util.jdbc;
+
+
+import org.apache.ctakes.dictionary.lookup2.util.JdbcConnectionFactory;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.uima.UimaContext;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+/**
+ * Static helpers for reading jdbc parameter values and for creating and filling
+ * the single-parameter select statement used for lookup.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/18/2020
+ */
+final public class JdbcUtil {
+
+   static public final String HSQL_DRIVER = "org.hsqldb.jdbcDriver";
+   static public final String UPPER_TABLE = "UPPER";
+   static public final String MIXED_TABLE = "MIXED";
+   static public final String LOWER_TABLE = "LOWER";
+   static public final String DEFAULT_USER = "sa";
+   static public final String DEFAULT_PASS = "";
+
+   private JdbcUtil() {
+   }
+
+
+   /**
+    * Looks up the value named {@code rootName + '_' + parameterName}.
+    *
+    * @param rootName      first part of the parameter name
+    * @param parameterName second part of the parameter name
+    * @param uimaContext   context passed to {@code EnvironmentVariable.getEnv}
+    * @param defaultValue  value to use when nothing is found
+    * @return the found value, or defaultValue if the lookup gave null or {@code EnvironmentVariable.NOT_PRESENT}
+    */
+   static public String getParameterValue( final String rootName,
+                                           final String parameterName,
+                                           final UimaContext uimaContext,
+                                           final String defaultValue ) {
+      final String found = EnvironmentVariable.getEnv( rootName + '_' + parameterName, uimaContext );
+      final boolean present = found != null && !found.equals( EnvironmentVariable.NOT_PRESENT );
+      return present ? found : defaultValue;
+   }
+
+
+   /**
+    * Connects to a database and prepares a select statement on the given table and index column.
+    *
+    * @param name       name used in exception messages
+    * @param jdbcDriver jdbc driver class name, required
+    * @param jdbcUrl    jdbc url, required
+    * @param jdbcUser   user for the connection
+    * @param jdbcPass   password for the connection
+    * @param tableName  table to select from, required
+    * @param indexName  column compared against the single statement parameter
+    * @return a statement selecting all columns of rows whose index column equals the parameter
+    * @throws SQLException if a required value is null or empty, if no connection could be obtained,
+    *                      or if the {@code PreparedStatement} could not be created
+    */
+   static public PreparedStatement createPreparedStatement( final String name,
+                                                            final String jdbcDriver,
+                                                            final String jdbcUrl,
+                                                            final String jdbcUser,
+                                                            final String jdbcPass,
+                                                            final String tableName,
+                                                            final String indexName ) throws SQLException {
+      if ( isMissing( jdbcDriver ) ) {
+         throw new SQLException( "No JDBC Driver specified for " + name );
+      }
+      if ( isMissing( jdbcUrl ) ) {
+         throw new SQLException( "No URL specified for " + name );
+      }
+      if ( isMissing( tableName ) ) {
+         throw new SQLException( "No Table specified for " + name );
+      }
+      // The connection is deliberately NOT opened in a try with resources:
+      // that would close it on exit, and the returned statement is created from it.
+      final JdbcConnectionFactory factory = JdbcConnectionFactory.getInstance();
+      final Connection connection = factory.getConnection( jdbcDriver, jdbcUrl, jdbcUser, jdbcPass );
+      if ( connection != null ) {
+         return createSelectCall( connection, tableName, indexName );
+      }
+      throw new SQLException( "Could not connect to " + name );
+   }
+
+
+   /**
+    * @param value some text value
+    * @return true if the value is null or empty
+    */
+   static private boolean isMissing( final String value ) {
+      return value == null || value.isEmpty();
+   }
+
+
+   /**
+    * @param connection open connection on which to prepare the statement
+    * @param table      table to select from
+    * @param index      column compared against the single statement parameter
+    * @return an sql call to use for term lookup
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   static private PreparedStatement createSelectCall( final Connection connection,
+                                                      final String table,
+                                                      final String index ) throws SQLException {
+      return connection.prepareStatement( "SELECT * FROM " + table + " WHERE " + index + " = ?" );
+   }
+
+   /**
+    * Clears the statement parameters and sets the first one to the given text.
+    *
+    * @param statement an sql call to use for lookup
+    * @param text      of the text to use for lookup
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   static public void fillSelectCall( final PreparedStatement statement, final String text ) throws SQLException {
+      statement.clearParameters();
+      statement.setString( 1, text );
+   }
+
+   /**
+    * Clears the statement parameters and sets the first one to the given long.
+    *
+    * @param statement an sql call to use for lookup
+    * @param value     of the long to use for lookup
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   static public void fillSelectCall( final PreparedStatement statement, final long value ) throws SQLException {
+      statement.clearParameters();
+      statement.setLong( 1, value );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,53 @@
+package org.apache.ctakes.dictionary.cased.util.textspan;
+
+import org.apache.ctakes.core.util.Pair;
+
+
+/**
+ * A {@link MagicTextSpan} that covers a single begin to end range, stored as a pair of offsets.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/19/2020
+ */
+public final class ContiguousTextSpan implements MagicTextSpan {
+   private final Pair<Integer> _span;
+
+   /**
+    * @param begin begin offset of the span
+    * @param end   end offset of the span
+    */
+   public ContiguousTextSpan( final int begin, final int end ) {
+      this( new Pair<>( begin, end ) );
+   }
+
+   /**
+    * @param span pair holding the begin offset as value 1 and the end offset as value 2
+    */
+   public ContiguousTextSpan( final Pair<Integer> span ) {
+      _span = span;
+   }
+
+   /**
+    * @return the pair with which this span was created
+    */
+   @Override
+   public Pair<Integer> toIntPair() {
+      return _span;
+   }
+
+   /**
+    * @return value 1 of the span pair
+    */
+   @Override
+   public int getBegin() {
+      return _span.getValue1();
+   }
+
+   /**
+    * @return value 2 of the span pair
+    */
+   @Override
+   public int getEnd() {
+      return _span.getValue2();
+   }
+
+   /**
+    * @param other some object
+    * @return true if other is a ContiguousTextSpan with the same begin and end
+    */
+   @Override
+   public boolean equals( final Object other ) {
+      if ( !(other instanceof ContiguousTextSpan) ) {
+         return false;
+      }
+      final ContiguousTextSpan otherSpan = (ContiguousTextSpan)other;
+      return otherSpan.getBegin() == getBegin() && otherSpan.getEnd() == getEnd();
+   }
+
+   /**
+    * @return the hash code of the span pair
+    */
+   @Override
+   public int hashCode() {
+      return _span.hashCode();
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,115 @@
+package org.apache.ctakes.dictionary.cased.util.textspan;
+
+import org.apache.ctakes.core.util.Pair;
+
+import java.util.*;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/19/2020
+ */
+public final class DiscontiguousTextSpan implements MagicTextSpan {
+   private final Pair<Integer> _span;
+   private final Collection<MagicTextSpan> _presentTextSpans;
+
+   private DiscontiguousTextSpan( final Pair<Integer> span, final Collection<Pair<Integer>> missingSpans ) {
+      _span = span;
+      _presentTextSpans = createPresentSpans( span, missingSpans );
+   }
+
+   public Pair<Integer> toIntPair() {
+      return _span;
+   }
+
+   public int getBegin() {
+      return _span.getValue1();
+   }
+
+   public int getEnd() {
+      return _span.getValue2();
+   }
+
+
+   public Collection<MagicTextSpan> getPresentSpans() {
+      return _presentTextSpans;
+   }
+
+   static private Collection<MagicTextSpan> createPresentSpans( final Pair<Integer> span,
+                                                                final Collection<Pair<Integer>> missingSpans ) {
+      final List<Pair<Integer>> missingSpanList = new ArrayList<>( missingSpans );
+      missingSpanList.sort( Comparator.comparingInt( Pair::getValue1 ) );
+
+      final Collection<MagicTextSpan> presentSpans = new HashSet<>( missingSpans.size() + 1 );
+      int previousBegin = span.getValue1();
+      for ( Pair<Integer> missingSpan : missingSpanList ) {
+         presentSpans.add( new ContiguousTextSpan( previousBegin, missingSpan.getValue1() ) );
+         previousBegin = missingSpan.getValue2();
+      }
+      presentSpans.add( new ContiguousTextSpan( previousBegin, span.getValue2() ) );
+      return presentSpans;
+   }
+
+   public boolean containsAll( final MagicTextSpan textSpan ) {
+      if ( !contains( textSpan ) ) {
+         return false;
+      }
+      final Collection<MagicTextSpan> presentSpans = getPresentSpans();
+      if ( textSpan instanceof ContiguousTextSpan ) {
+         return presentSpans.stream().anyMatch( t -> t.contains( textSpan ) );
+      }
+      if ( textSpan instanceof DiscontiguousTextSpan ) {
+         final Collection<MagicTextSpan> otherPresentSpans = ((DiscontiguousTextSpan)textSpan).getPresentSpans();
+         for ( MagicTextSpan other : otherPresentSpans ) {
+            if ( presentSpans.stream().noneMatch( t -> t.contains( other ) ) ) {
+               return false;
+            }
+         }
+      }
+      return true;
+   }
+
+   public boolean fullyContainsAll( final MagicTextSpan textSpan ) {
+      if ( !fullyContains( textSpan ) ) {
+         return false;
+      }
+      final Collection<MagicTextSpan> presentSpans = getPresentSpans();
+      if ( textSpan instanceof ContiguousTextSpan ) {
+         return presentSpans.stream().anyMatch( t -> t.fullyContains( textSpan ) );
+      }
+      if ( textSpan instanceof DiscontiguousTextSpan ) {
+         boolean fullyContains = false;
+         final Collection<MagicTextSpan> otherPresentSpans = ((DiscontiguousTextSpan)textSpan).getPresentSpans();
+         for ( MagicTextSpan other : otherPresentSpans ) {
+            if ( presentSpans.stream().noneMatch( t -> t.contains( other ) ) ) {
+               return false;
+            }
+            fullyContains = fullyContains
+                            || presentSpans.stream().anyMatch( t -> t.fullyContains( other ) );
+         }
+         return fullyContains;
+      }
+      return true;
+   }
+
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public boolean equals( final Object other ) {
+      return other instanceof DiscontiguousTextSpan
+             && ((DiscontiguousTextSpan)other).getBegin() == getBegin()
+             && ((DiscontiguousTextSpan)other).getEnd() == getEnd()
+             && ((DiscontiguousTextSpan)other).getPresentSpans().equals( getPresentSpans() );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int hashCode() {
+      return _span.hashCode() + _presentTextSpans.hashCode();
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/MagicTextSpan.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/MagicTextSpan.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/MagicTextSpan.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/MagicTextSpan.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,65 @@
+package org.apache.ctakes.dictionary.cased.util.textspan;
+
+
+import org.apache.ctakes.core.util.Pair;
+
+/**
+ * A span of text defined by a begin and an end offset, with default comparisons against other spans.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/19/2020
+ */
+public interface MagicTextSpan {
+
+   /**
+    * @return begin offset of the span
+    */
+   int getBegin();
+
+   /**
+    * @return end offset of the span
+    */
+   int getEnd();
+
+   /**
+    * @return a new pair holding the begin and the end
+    */
+   default Pair<Integer> toIntPair() {
+      final int begin = getBegin();
+      final int end = getEnd();
+      return new Pair<>( begin, end );
+   }
+
+   /**
+    * @return end minus begin
+    */
+   default int getWidth() {
+      final int end = getEnd();
+      return end - getBegin();
+   }
+
+   /**
+    * NOTE: TextSpans are begin inclusive end exclusive.
+    * So, 1 is subtracted from the end when comparing to another begin
+    *
+    * @param textSpan another textspan
+    * @return true if there is overlap between the two text spans
+    */
+   default boolean overlaps( final MagicTextSpan textSpan ) {
+      final boolean endsBeforeThis = textSpan.getEnd() - 1 < getBegin();
+      if ( endsBeforeThis ) {
+         return false;
+      }
+      return textSpan.getBegin() <= getEnd() - 1;
+   }
+
+   /**
+    * @param textSpan another textspan
+    * @return true if the other span begins at or after this begin and ends at or before this end
+    */
+   default boolean contains( final MagicTextSpan textSpan ) {
+      if ( getBegin() > textSpan.getBegin() ) {
+         return false;
+      }
+      return textSpan.getEnd() <= getEnd();
+   }
+
+   /**
+    * @param textSpan another textspan
+    * @return true if the other span lies within this span and differs from it in begin or end
+    */
+   default boolean fullyContains( final MagicTextSpan textSpan ) {
+      final int begin = getBegin();
+      final int end = getEnd();
+      final int otherBegin = textSpan.getBegin();
+      final int otherEnd = textSpan.getEnd();
+      final boolean within = begin <= otherBegin && otherEnd <= end;
+      return within && (begin < otherBegin || otherEnd < end);
+   }
+
+   /**
+    * By default this is the same as {@link #contains(MagicTextSpan)}.
+    * Implementations made of several parts may override it to compare the parts.
+    *
+    * @param textSpan another textspan
+    * @return true if this span contains the other span
+    */
+   default boolean containsAll( MagicTextSpan textSpan ) {
+      return contains( textSpan );
+   }
+
+   /**
+    * By default this is the same as {@link #fullyContains(MagicTextSpan)}.
+    * Implementations made of several parts may override it to compare the parts.
+    *
+    * @param textSpan another textspan
+    * @return true if this span fully contains the other span
+    */
+   default boolean fullyContainsAll( MagicTextSpan textSpan ) {
+      return fullyContains( textSpan );
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,237 @@
+package org.apache.ctakes.dictionary.cased.util.tokenize;
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+/**
+ * A dictionary term: a cui code plus the tokens of the term text and two flags describing its case.
+ * NOTE(review): {@code @Immutable} is imported from {@code jdk.nashorn.internal}, a JDK-internal package;
+ * confirm that it is available on every JDK this module must build with.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/17/2020
+ */
+@Immutable
+final public class TokenizedTerm {
+
+   // Lower case prefixes, each ending with a dash.  A dash that follows one of these stays attached to the token.
+   static private final Collection<String> PREFIXES = new HashSet<>( Arrays.asList(
+         "e-",
+         "a-",
+         "u-",
+         "x-",
+         "agro-",
+         "ante-",
+         "anti-",
+         "arch-",
+         "be-",
+         "bi-",
+         "bio-",
+         "co-",
+         "counter-",
+         "cross-",
+         "cyber-",
+         "de-",
+         "eco-",
+         "ex-",
+         "extra-",
+         "inter-",
+         "intra-",
+         "macro-",
+         "mega-",
+         "micro-",
+         "mid-",
+         "mini-",
+         "multi-",
+         "neo-",
+         "non-",
+         "over-",
+         "pan-",
+         "para-",
+         "peri-",
+         "post-",
+         "pre-",
+         "pro-",
+         "pseudo-",
+         "quasi-",
+         "re-",
+         "semi-",
+         "sub-",
+         "super-",
+         "tri-",
+         "ultra-",
+         "un-",
+         "uni-",
+         "vice-",
+         // From email from Colin Warner <colinw@ldc.upenn.edu> on 7/25/2010
+         "electro-",
+         "gasto-",
+         "homo-",
+         "hetero-",
+         "ortho-",
+         "phospho-" ) );
+
+   // Lower case suffixes, each starting with a dash.  A dash that precedes one of these stays attached to the token.
+   static private final Collection<String> SUFFIXES = new HashSet<>( Arrays.asList(
+         "-esque",
+         "-ette",
+         "-fest",
+         "-fold",
+         "-gate",
+         "-itis",
+         "-less",
+         "-most",
+         "-o-torium",
+         "-rama",
+         "-wise" ) );
+
+   // Upper case copies, so that all-uppercase words can be matched without changing the case of the word.
+   static private final Collection<String> UPPER_PREFIXES = PREFIXES.stream()
+                                                                    .map( String::toUpperCase )
+                                                                    .collect( Collectors.toSet() );
+
+   static private final Collection<String> UPPER_SUFFIXES = SUFFIXES.stream()
+                                                                    .map( String::toUpperCase )
+                                                                    .collect( Collectors.toSet() );
+
+
+   final private String[] _tokens;
+   final private boolean _allUpperCase;
+   final private boolean _allLowerCase;
+   final private Long _cui;
+   final private int _hashcode;
+
+   /**
+    * @param cui  text form of the cui, converted to a code with {@code CuiCodeUtil.getCuiCode}
+    * @param text text of the term, split on whitespace and then tokenized word by word
+    */
+   public TokenizedTerm( final String cui, final String text ) {
+      _cui = CuiCodeUtil.getInstance().getCuiCode( cui );
+      _tokens = getTermTokens( text );
+      boolean anyCaps = false;
+      boolean anyLower = false;
+      for ( char c : text.toCharArray() ) {
+         if ( Character.isUpperCase( c ) ) {
+            anyCaps = true;
+         } else if ( Character.isLowerCase( c ) ) {
+            anyLower = true;
+         }
+         if ( anyCaps && anyLower ) {
+            // Mixed case: neither flag can become true any more.
+            break;
+         }
+      }
+      _allUpperCase = anyCaps && !anyLower;
+      _allLowerCase = anyLower && !anyCaps;
+      // Hash exactly the fields that equals compares.  Hashing the raw cui and text instead would give
+      // different hash codes to equal terms, e.g. when texts differ only in the amount of whitespace.
+      _hashcode = 31 * Objects.hashCode( _cui ) + Arrays.hashCode( _tokens );
+   }
+
+   /**
+    * @return the cui code of this term
+    */
+   public long getCui() {
+      return _cui;
+   }
+
+   /**
+    * @return the internal token array of this term, not a copy
+    */
+   public String[] getTokens() {
+      return _tokens;
+   }
+
+   /**
+    * @return true if the term text has at least one uppercase character and no lowercase character
+    */
+   public boolean isAllUpperCase() {
+      return _allUpperCase;
+   }
+
+   /**
+    * @return true if the term text has at least one lowercase character and no uppercase character
+    */
+   public boolean isAllLowerCase() {
+      return _allLowerCase;
+   }
+
+
+   /**
+    * @param text text of a term
+    * @return the tokens of every whitespace-separated word in the text, in order.  Empty for empty text
+    */
+   static private String[] getTermTokens( final String text ) {
+      if ( text.isEmpty() ) {
+         return new String[ 0 ];
+      }
+      return Arrays.stream( text.split( "\\s+" ) )
+                   .map( TokenizedTerm::getTokens )
+                   .flatMap( Collection::stream )
+                   .toArray( String[]::new );
+   }
+
+   // TODO should this be exactly the same as getTokens in TextTokenizer (dictionary gui code)  ? probably ...
+
+   /**
+    * Splits a single word into tokens.  Letters and digits accumulate into the current token.
+    * A dash stays in the current token if what precedes it is a known prefix or what follows it is a known suffix.
+    * An apostrophe before a final s and a period before a single final digit begin a new token.
+    * Every other character becomes a token of its own.
+    *
+    * @param word a word without whitespace
+    * @return the tokens of the word, in order
+    */
+   static private List<String> getTokens( final String word ) {
+      final List<String> tokens = new ArrayList<>();
+      final StringBuilder sb = new StringBuilder();
+      final int count = word.length();
+      for ( int i = 0; i < count; i++ ) {
+         final char c = word.charAt( i );
+         if ( Character.isLetterOrDigit( c ) ) {
+            sb.append( c );
+            continue;
+         }
+         if ( c == '-' && (isPrefix( sb.toString() ) || isSuffix( word, i + 1 )) ) {
+            // what precedes is a prefix or what follows is a suffix so append the dash to the current word and move on
+            sb.append( c );
+            continue;
+         }
+         if ( (c == '\'' && isOwnerApostrophe( word, i + 1 ))
+              || (c == '.' && isNumberDecimal( word, i + 1 )) ) {
+            // what follows is an 's or .# so add the preceding and move on
+            if ( sb.length() != 0 ) {
+               tokens.add( createToken( sb ) );
+               sb.setLength( 0 );
+            }
+            sb.append( c );
+            continue;
+         }
+         // Wasn't a special symbol for consideration, so add the previous and symbol separately
+         if ( sb.length() != 0 ) {
+            tokens.add( createToken( sb ) );
+            sb.setLength( 0 );
+         }
+         tokens.add( "" + c );
+      }
+      if ( sb.length() != 0 ) {
+         tokens.add( createToken( sb ) );
+      }
+      return tokens;
+   }
+
+   /**
+    * @param sb characters of the current token
+    * @return the characters as a String
+    */
+   static private String createToken( final StringBuilder sb ) {
+      return sb.toString();
+   }
+
+   /**
+    * @param word characters that precede a dash
+    * @return true if the word followed by a dash is one of the lower case or upper case prefixes
+    */
+   static private boolean isPrefix( final String word ) {
+      return PREFIXES.contains( word + "-" ) || UPPER_PREFIXES.contains( word + "-" );
+   }
+
+   /**
+    * @param word       the whole word
+    * @param startIndex index of the first character after a dash
+    * @return true if the letters and digits starting at startIndex, preceded by a dash,
+    * are one of the lower case or upper case suffixes
+    */
+   static private boolean isSuffix( final String word, final int startIndex ) {
+      if ( word.length() <= startIndex ) {
+         return false;
+      }
+      final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
+      if ( nextCharTerm.isEmpty() ) {
+         return false;
+      }
+      return SUFFIXES.contains( "-" + nextCharTerm ) || UPPER_SUFFIXES.contains( "-" + nextCharTerm );
+   }
+
+   /**
+    * @param word       the whole word
+    * @param startIndex index of the first character after an apostrophe
+    * @return true if exactly one character remains at startIndex and it is a lower case s
+    */
+   static private boolean isOwnerApostrophe( final CharSequence word, final int startIndex ) {
+      return word.length() == startIndex + 1 && word.charAt( startIndex ) == 's';
+   }
+
+   /**
+    * @param word       the whole word
+    * @param startIndex index of the first character after a period
+    * @return true if exactly one character remains at startIndex and it is a digit
+    */
+   static private boolean isNumberDecimal( final CharSequence word, final int startIndex ) {
+      // Bizarre scenario in which ctakes tokenizes ".2" as a fraction, but not ".22"
+      return word.length() == startIndex + 1 && Character.isDigit( word.charAt( startIndex ) );
+   }
+
+   /**
+    * @param word some text
+    * @return the leading characters of the word up to the first that is not a letter or digit
+    */
+   static private String getNextCharTerm( final String word ) {
+      final int count = word.length();
+      for ( int i = 0; i < count; i++ ) {
+         final char c = word.charAt( i );
+         if ( !Character.isLetterOrDigit( c ) ) {
+            return word.substring( 0, i );
+         }
+      }
+      return word;
+   }
+
+   /**
+    * @param value some object
+    * @return true if value is a TokenizedTerm with equal tokens and an equal cui code
+    */
+   @Override
+   public boolean equals( final Object value ) {
+      return value instanceof TokenizedTerm
+             && Arrays.equals( _tokens, ((TokenizedTerm)value)._tokens )
+             && _cui.equals( ((TokenizedTerm)value)._cui );
+   }
+
+   /**
+    * @return a hash code computed once at construction from the cui code and the tokens
+    */
+   @Override
+   public int hashCode() {
+      return _hashcode;
+   }
+
+
+}



Mime
View raw message