ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1881994 [1/3] - in /ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased: ./ ae/ annotation/ dictionary/ encoder/ lookup/ table/ table/column/ util/ util/bsv/ util/jdbc/ util/textspan/ util/tokenize/ ...
Date Fri, 25 Sep 2020 00:59:37 GMT
Author: seanfinan
Date: Fri Sep 25 00:59:37 2020
New Revision: 1881994

URL: http://svn.apache.org/viewvc?rev=1881994&view=rev
Log:
New Case Sensitive Dictionary Lookup

Added:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvEncoder.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/BsvListEncoder.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/CodeSchema.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/EncoderStore.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/InMemoryEncoder.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/JdbcEncoder.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoder.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/encoder/TermEncoding.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/CandidateTerm.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/ContiguousLookupEngine.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/DiscoveredTerm.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupEngine.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/lookup/LookupToken.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/CodeType.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/SchemaCode.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/table/column/Synonym.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvFileParser.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/BsvObjectCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/bsv/StringArrayCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/jdbc/JdbcUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/ContiguousTextSpan.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/DiscontiguousTextSpan.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/textspan/MagicTextSpan.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTerm.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/util/tokenize/TokenizedTermMapper.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/wsd/WsdUtil.java

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/ae/CasedAnnotationFinder.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,440 @@
+package org.apache.ctakes.dictionary.cased.ae;
+
+import org.apache.ctakes.core.pipeline.PipeBitInfo;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.annotation.AlikeSubsumingAnnotationCreator;
+import org.apache.ctakes.dictionary.cased.annotation.AnnotationCreator;
+import org.apache.ctakes.dictionary.cased.annotation.NonSubsumingAnnotationCreator;
+import org.apache.ctakes.dictionary.cased.annotation.SemanticSubsumingAnnotationCreator;
+import org.apache.ctakes.dictionary.cased.dictionary.*;
+import org.apache.ctakes.dictionary.cased.encoder.*;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupEngine;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.sql.SQLException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+import static org.apache.ctakes.core.pipeline.PipeBitInfo.TypeProduct.*;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/12/2020
+ */
+@PipeBitInfo(
+      name = "CasedAnnotationFinder",
+      description = "Finds all-uppercase or normal terms in text.",
+      role = PipeBitInfo.Role.ANNOTATOR,
+      dependencies = { BASE_TOKEN, SENTENCE },
+      products = IDENTIFIED_ANNOTATION
+)
+final public class CasedAnnotationFinder extends JCasAnnotator_ImplBase {
+
+   // log4j logger, registered under the simple class name.
+   static private final Logger LOGGER = Logger.getLogger( "CasedAnnotationFinder" );
+
+   // Suffixes appended to a dictionary / encoder name to build the parameter key that holds its type.
+   // Both suffixes are the same text, so a dictionary and an encoder with the same name read the same key.
+   static public final String DICTIONARY_TYPE = "_type";
+   static public final String ENCODER_TYPE = "_type";
+
+
+   // dictionaries accepts a comma-separated list
+   @ConfigurationParameter( name = "dictionaries", mandatory = true,
+         description = "Dictionaries to use for lookup." )
+   private String[] _dictionaries;
+
+   // NOTE(review): not referenced anywhere else in this class.
+   static private final String snomed_rxnorm_2020aa_type = "Jdbc";
+
+
+   // https://www.eecis.udel.edu/~vijay/cis889/ie/pos-set.pdf
+
+   // Part of speech tags added to the lookup set when lookupVerbs is "yes" or "true".
+   static private final String[] VERB_POS = { "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
+                                              "VV", "VVD", "VVG", "VVN", "VVP", "VVZ" };
+   @ConfigurationParameter( name = "lookupVerbs", mandatory = false,
+         description = "Use Verb parts of speech for lookup." )
+   private String _lookupVerbs = "yes";
+
+   // Part of speech tags added to the lookup set when lookupNouns is "yes" or "true".
+   static private final String[] NOUN_POS = { "NN", "NNS", "NP", "NPS", "NNP", "NNPS" };
+   @ConfigurationParameter( name = "lookupNouns", mandatory = false,
+         description = "Use Noun parts of speech for lookup." )
+   private String _lookupNouns = "yes";
+
+   // Part of speech tags added to the lookup set when lookupAdjectives is "yes" or "true".
+   static private final String[] ADJECTIVE_POS = { "JJ", "JJR", "JJS" };
+   @ConfigurationParameter( name = "lookupAdjectives", mandatory = false,
+         description = "Use Adjective parts of speech for lookup." )
+   private String _lookupAdjectives = "yes";
+
+   // Part of speech tags added to the lookup set when lookupAdverbs is "yes" or "true".
+   static private final String[] ADVERB_POS = { "RB", "RBR", "RBS" };
+   @ConfigurationParameter( name = "lookupAdverbs", mandatory = false,
+         description = "Use Adverb parts of speech for lookup." )
+   private String _lookupAdverbs = "yes";
+
+   // Extra part of speech tags, added to the lookup set as given.
+   @ConfigurationParameter( name = "otherLookups", mandatory = false,
+         description = "List of other parts of speech for lookup." )
+   private String[] _otherLookups = {};
+
+   // minimum span required to accept a term
+   // Compared against (end - begin) of each word token when deciding whether it is a valid lookup token.
+   @ConfigurationParameter( name = JCasTermAnnotator.PARAM_MIN_SPAN_KEY, mandatory = false,
+         description = "Minimum number of characters for a term." )
+   protected int _minLookupSpan = JCasTermAnnotator.DEFAULT_MINIMUM_SPAN;
+
+
+   // NOTE(review): this value is declared but never read within this class.
+   @ConfigurationParameter( name = "allowWordSkips", mandatory = false,
+         description = "Terms may include words that do not match.  So-called loose matching." )
+   protected String _allowSkips = "no";
+
+   // Passed to the lookup engine on every findTerms call.
+   static private final String CONS_SKIP_PRP_KEY = "consecutiveSkips";
+   @ConfigurationParameter( name = CONS_SKIP_PRP_KEY, mandatory = false,
+         description = "Number of consecutive non-comma tokens that can be skipped." )
+   private int _consecutiveSkipMax = 2;
+
+   // Passed to the lookup engine on every findTerms call.
+   static private final String TOTAL_SKIP_PRP_KEY = "totalSkips";
+   @ConfigurationParameter( name = TOTAL_SKIP_PRP_KEY, mandatory = false,
+         description = "Number of total tokens that can be skipped." )
+   private int _totalSkipMax = 4;
+
+
+   // Consulted in initialize() only when subsumeSemantics is not true.
+   @ConfigurationParameter( name = "subsume", mandatory = false,
+         description = "Subsume contained terms of the same semantic group.", defaultValue = "yes" )
+   private String _subsume = "yes";
+
+   // Checked first in initialize(); when true it takes precedence over subsume.
+   @ConfigurationParameter( name = "subsumeSemantics", mandatory = false,
+         description = "Subsume contained terms of the same and certain other semantic groups.", defaultValue = "yes" )
+   private String _subsumeSemantics = "yes";
+
+
+   // Each entry is split on ':' into a TUI and a Semantic Group; entries without exactly two parts are skipped.
+   @ConfigurationParameter( name = "reassignSemantics", mandatory = false,
+         description = "Reassign Semantic Types (TUIs) to non-default Semantic Groups." )
+   private String[] _reassignSemanticList = {};
+
+
+   // code lists accepts a comma-separated list
+   @ConfigurationParameter( name = "encoders", mandatory = true,
+         description = "Term Encoders with schemas and schema codes." )
+   private String[] _encoders;
+
+
+   // NOTE(review): never assigned or read within this class.
+   private boolean _allowSkipping;
+
+   // Chosen once in initialize() from the subsume / subsumeSemantics parameters.
+   private AnnotationCreator _annotationCreator;
+
+   // Part of speech tags that make a word token a valid lookup token.  Filled by setupPos().
+   final private Collection<String> _lookupPos = new HashSet<>();
+
+   // TUI to Semantic Group overrides.  Filled by setupReassignSemantics() and handed to the annotation creator.
+   final private Map<SemanticTui, SemanticGroup> _semanticReassignment = new HashMap<>();
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void initialize( final UimaContext context ) throws ResourceInitializationException {
+      LOGGER.info( "Initializing Dictionary Lookup ..." );
+      super.initialize( context );
+
+      if ( isParameterTrue( _subsumeSemantics ) ) {
+         _annotationCreator = new SemanticSubsumingAnnotationCreator();
+
+      } else if ( isParameterTrue( _subsume ) ) {
+         _annotationCreator = new AlikeSubsumingAnnotationCreator();
+      } else {
+         _annotationCreator = new NonSubsumingAnnotationCreator();
+      }
+
+      setupDictionaries( context );
+      setupEncoders( context );
+      setupPos();
+      setupReassignSemantics();
+   }
+
+
+   /**
+    * @param value parameter text.
+    * @return true if the text is "yes" or "true", ignoring case.
+    */
+   static private boolean isParameterTrue( final String value ) {
+      if ( value.equalsIgnoreCase( "yes" ) ) {
+         return true;
+      }
+      return value.equalsIgnoreCase( "true" );
+   }
+
+   /**
+    * Creates a dictionary for every configured name and adds it to the DictionaryStore.
+    *
+    * @param context uima context, handed on to the dictionary constructors.
+    * @throws ResourceInitializationException if no names are configured or any dictionary cannot be created.
+    */
+   private void setupDictionaries( final UimaContext context ) throws ResourceInitializationException {
+      final boolean noDictionaries = _dictionaries.length == 0;
+      if ( noDictionaries ) {
+         LOGGER.error( "Dictionary List is empty.  Consider using the default cTAKES Dictionary." +
+                       "  If you are using a piper file, add the line \"load sno_rx_16ab_settings\"" );
+         throw new ResourceInitializationException();
+      }
+      for ( final String dictionaryName : _dictionaries ) {
+         final CasedDictionary created = createDictionary( dictionaryName, context );
+         if ( created != null ) {
+            DictionaryStore.getInstance().addDictionary( created );
+            continue;
+         }
+         // createDictionary has already logged the specific reason.
+         LOGGER.error( "Could not create Dictionary for " + dictionaryName );
+         throw new ResourceInitializationException();
+      }
+   }
+
+
+   /**
+    * Creates the dictionary named by a configuration entry.
+    * The dictionary type is read from the parameter {@code name + DICTIONARY_TYPE}.
+    *
+    * @param name    name of the dictionary.
+    * @param context uima context holding the dictionary parameters.
+    * @return the dictionary, or null if the type is missing, unknown, or creation failed.
+    */
+   private CasedDictionary createDictionary( final String name, final UimaContext context ) {
+      final String type = EnvironmentVariable.getEnv( name + DICTIONARY_TYPE, context );
+      if ( type == null || type.equals( EnvironmentVariable.NOT_PRESENT ) ) {
+         LOGGER.error(
+               "No Dictionary Type specified for " + name + ".  Please set parameter " + name + DICTIONARY_TYPE );
+         return null;
+      }
+      try {
+         // Locale.ROOT keeps the case conversion independent of the default locale (e.g. Turkish dotted I).
+         switch ( type.toUpperCase( Locale.ROOT ) ) {
+            case JdbcDictionary
+                  .DICTIONARY_TYPE:
+               return new JdbcDictionary( name, context );
+            case BsvDictionary
+                  .DICTIONARY_TYPE:
+               return new BsvDictionary( name, context );
+            case BsvListDictionary
+                  .DICTIONARY_TYPE:
+               return new BsvListDictionary( name, context );
+            default:
+               LOGGER.error( "Unknown Dictionary type " + type + " specified for " + name );
+         }
+      } catch ( SQLException multE ) {
+         // Pass the exception itself so that the stack trace and cause are not lost.
+         LOGGER.error( multE.getMessage(), multE );
+      }
+      return null;
+   }
+
+
+   /**
+    * Creates a term encoder for every configured name and adds it to the EncoderStore.
+    *
+    * @param context uima context, handed on to the encoder constructors.
+    * @throws ResourceInitializationException if no names are configured or any encoder cannot be created.
+    */
+   private void setupEncoders( final UimaContext context ) throws ResourceInitializationException {
+      final boolean noEncoders = _encoders.length == 0;
+      if ( noEncoders ) {
+         LOGGER.error( "Term Encoder List is empty.  Consider using the default cTAKES Term Encoder." +
+                       "  If you are using a piper file, add the line \"load sno_rx_2020aa_settings\"" );
+         throw new ResourceInitializationException();
+      }
+      for ( final String encoderName : _encoders ) {
+         final TermEncoder created = createEncoder( encoderName, context );
+         if ( created != null ) {
+            EncoderStore.getInstance().addEncoder( created );
+            continue;
+         }
+         // createEncoder has already logged the specific reason.
+         LOGGER.error( "Could not create Term Encoder for " + encoderName );
+         throw new ResourceInitializationException();
+      }
+   }
+
+
+   /**
+    * Creates the term encoder named by a configuration entry.
+    * The encoder type is read from the parameter {@code name + ENCODER_TYPE}.
+    *
+    * @param name    name of the encoder.
+    * @param context uima context holding the encoder parameters.
+    * @return the encoder, or null if the type is missing, unknown, or creation failed.
+    */
+   private TermEncoder createEncoder( final String name, final UimaContext context ) {
+      final String type = EnvironmentVariable.getEnv( name + ENCODER_TYPE, context );
+      if ( type == null || type.equals( EnvironmentVariable.NOT_PRESENT ) ) {
+         LOGGER.error(
+               "No Term Encoder Type specified for " + name + ".  Please set parameter " + name + ENCODER_TYPE );
+         return null;
+      }
+      try {
+         // Locale.ROOT keeps the case conversion independent of the default locale (e.g. Turkish dotted I).
+         switch ( type.toUpperCase( Locale.ROOT ) ) {
+            case JdbcEncoder
+                  .ENCODER_TYPE:
+               return new JdbcEncoder( name, context );
+            case BsvEncoder
+                  .ENCODER_TYPE:
+               return new BsvEncoder( name, context );
+            case BsvListEncoder
+                  .ENCODER_TYPE:
+               return new BsvListEncoder( name, context );
+            default:
+               LOGGER.error( "Unknown Term Encoder type " + type + " specified for " + name );
+         }
+      } catch ( SQLException multE ) {
+         // Pass the exception itself so that the stack trace and cause are not lost.
+         LOGGER.error( multE.getMessage(), multE );
+      }
+      return null;
+   }
+
+
+   /**
+    * Collects the part of speech tags that are usable for lookup, according to the lookup parameters.
+    *
+    * @throws ResourceInitializationException if no part of speech ends up selected.
+    */
+   private void setupPos() throws ResourceInitializationException {
+      if ( isTrue( _lookupVerbs ) ) {
+         Collections.addAll( _lookupPos, VERB_POS );
+      }
+      if ( isTrue( _lookupNouns ) ) {
+         Collections.addAll( _lookupPos, NOUN_POS );
+      }
+      if ( isTrue( _lookupAdjectives ) ) {
+         Collections.addAll( _lookupPos, ADJECTIVE_POS );
+      }
+      if ( isTrue( _lookupAdverbs ) ) {
+         Collections.addAll( _lookupPos, ADVERB_POS );
+      }
+      if ( _otherLookups.length != 0 ) {
+         Collections.addAll( _lookupPos, _otherLookups );
+      }
+      if ( !_lookupPos.isEmpty() ) {
+         LOGGER.info( "Using Parts of Speech " + String.join( ", ", _lookupPos ) );
+         return;
+      }
+      LOGGER.error( "No Parts of Speech indicated for Lookup.  At least one Part of Speech must be used." );
+      throw new ResourceInitializationException();
+   }
+
+   /**
+    * Parses the "TUI : Semantic Group" reassignment entries into the reassignment map.
+    * Entries that do not split into exactly two parts on ':' are logged and skipped.
+    */
+   private void setupReassignSemantics() {
+      // The reassignment map is a final field initialized at declaration, so only the list needs checking.
+      if ( _reassignSemanticList.length == 0 ) {
+         return;
+      }
+      for ( final String reassignment : _reassignSemanticList ) {
+         final String[] tuiAndGroup = StringUtil.fastSplit( reassignment, ':' );
+         if ( tuiAndGroup.length == 2 ) {
+            final SemanticTui tui = SemanticTui.getTui( tuiAndGroup[ 0 ].trim() );
+            final SemanticGroup group = SemanticGroup.getGroup( tuiAndGroup[ 1 ].trim() );
+            _semanticReassignment.put( tui, group );
+         } else {
+            LOGGER.warn( "Improper Key : Value pair for Semantic Reassignment " + reassignment );
+         }
+      }
+      final StringJoiner reassigned = new StringJoiner( ", " );
+      for ( Map.Entry<SemanticTui, SemanticGroup> entry : _semanticReassignment.entrySet() ) {
+         reassigned.add( entry.getKey().getSemanticType() + " : " + entry.getValue().getLongName() );
+      }
+      LOGGER.info( "Reassigned Semantics: " + reassigned );
+   }
+
+
+   /**
+    * {@inheritDoc}
+    * Looks up terms sentence by sentence, fetches the encodings of every discovered term
+    * and hands everything to the configured annotation creator.
+    */
+   @Override
+   public void process( final JCas jCas ) throws AnalysisEngineProcessException {
+      LOGGER.info( "Finding Named Entities ..." );
+
+      // Get all BaseTokens, grouped by Sentence.
+      final Map<Sentence, Collection<BaseToken>> sentenceBaseTokens
+            = JCasUtil.indexCovered( jCas, Sentence.class, BaseToken.class );
+
+      // Discover Terms in text, grouped by text span.
+      final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap = new HashMap<>();
+      try {
+         // Using foreach loop because try/catch in a stream is terrible.
+         for ( Collection<BaseToken> baseTokens : sentenceBaseTokens.values() ) {
+            allDiscoveredTermsMap.putAll( getDiscoveredTerms( baseTokens ) );
+         }
+      } catch ( ArrayIndexOutOfBoundsException iobE ) {
+         // JCasHashMap will throw this every once in a while.  Assume the windows are done and move on.
+         LOGGER.warn( iobE.getMessage() );
+      }
+
+
+      // Get all encodings (schemas and codes) of the discovered terms.
+      // Collectors.toMap without a merge function throws IllegalStateException on equal keys,
+      // so terms that are equal but were discovered at more than one span are reduced to one key first.
+      final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap
+            = allDiscoveredTermsMap.values()
+                                   .stream()
+                                   .flatMap( Collection::stream )
+                                   .distinct()
+                                   .collect( Collectors.toMap( Function.identity(), this::getEncodings ) );
+
+
+      createAnnotations( jCas, allDiscoveredTermsMap, termEncodingMap );
+   }
+
+
+   /**
+    * Delegates annotation creation to the creator chosen in initialize(),
+    * adding the semantic reassignment map to the arguments.
+    *
+    * @param jCas                  ye olde ...
+    * @param allDiscoveredTermsMap discovered terms, grouped by text span.
+    * @param termEncodingMap       encodings for each discovered term.
+    */
+   private void createAnnotations( final JCas jCas,
+                                   final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+                                   final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap ) {
+      _annotationCreator.createAnnotations( jCas, allDiscoveredTermsMap, termEncodingMap, _semanticReassignment );
+   }
+
+
+   private Collection<TermEncoding> getEncodings( final DiscoveredTerm discoveredTerm ) {
+      return EncoderStore.getInstance()
+                         .getEncoders()
+                         .stream()
+                         .map( e -> e.getEncodings( discoveredTerm ) )
+                         .filter( Objects::nonNull )
+                         .flatMap( Collection::stream )
+                         .collect( Collectors.toSet() );
+   }
+
+
+   /**
+    * Looks up the given tokens in every registered dictionary and combines the results.
+    *
+    * @param baseTokens tokens in which terms should be found.
+    * @return map of text spans to the terms discovered at those spans, over all dictionaries.
+    * When more than one dictionary discovers terms at the same span the terms are combined.
+    */
+   public Map<Pair<Integer>, Collection<DiscoveredTerm>> getDiscoveredTerms( final Collection<BaseToken> baseTokens ) {
+      final Map<CasedDictionary, Map<Pair<Integer>, Collection<DiscoveredTerm>>> discoveredTermsMap
+            = findTerms( baseTokens );
+
+      // Without a merge function Collectors.toMap throws IllegalStateException
+      // as soon as two dictionaries report terms for the same text span.
+      return discoveredTermsMap.values()
+                               .stream()
+                               .map( Map::entrySet )
+                               .flatMap( Collection::stream )
+                               .collect( Collectors.toMap( Map.Entry::getKey,
+                                                           Map.Entry::getValue,
+                                                           CasedAnnotationFinder::mergeTerms ) );
+   }
+
+
+   /**
+    * @param terms1 terms discovered at a span by one dictionary.
+    * @param terms2 terms discovered at the same span by another dictionary.
+    * @return a new collection holding the terms of both.  Neither argument is modified.
+    */
+   static private Collection<DiscoveredTerm> mergeTerms( final Collection<DiscoveredTerm> terms1,
+                                                         final Collection<DiscoveredTerm> terms2 ) {
+      final Collection<DiscoveredTerm> merged = new HashSet<>( terms1 );
+      merged.addAll( terms2 );
+      return merged;
+   }
+
+
+   /**
+    * Runs the lookup engine over the given tokens, once for each dictionary in the DictionaryStore.
+    * Newline tokens are dropped and the remaining tokens are ordered by begin offset before lookup.
+    *
+    * @param baseTokens -
+    * @return dictionaries to map of text spans to terms discovered at those text spans.
+    */
+   private Map<CasedDictionary, Map<Pair<Integer>, Collection<DiscoveredTerm>>> findTerms(
+         final Collection<BaseToken> baseTokens ) {
+      final Collection<CasedDictionary> dictionaries = DictionaryStore.getInstance().getDictionaries();
+      final Map<CasedDictionary, Map<Pair<Integer>, Collection<DiscoveredTerm>>> dictionaryTermsMap
+            = new HashMap<>( dictionaries.size() );
+
+      final List<BaseToken> wantedTokens = new ArrayList<>();
+      for ( BaseToken baseToken : baseTokens ) {
+         if ( isWantedToken.test( baseToken ) ) {
+            wantedTokens.add( baseToken );
+         }
+      }
+      wantedTokens.sort( Comparator.comparingInt( Annotation::getBegin ) );
+      final List<LookupToken> lookupTokens = new ArrayList<>( wantedTokens.size() );
+      for ( BaseToken wantedToken : wantedTokens ) {
+         lookupTokens.add( toLookupToken.apply( wantedToken ) );
+      }
+
+      final LookupEngine engine = getLookupEngine();
+      for ( CasedDictionary dictionary : dictionaries ) {
+         dictionaryTermsMap.put( dictionary,
+               engine.findTerms( dictionary, lookupTokens, _consecutiveSkipMax, _totalSkipMax ) );
+      }
+      return dictionaryTermsMap;
+   }
+
+   // Accepts every token that is not a NewlineToken.
+   static private final Predicate<BaseToken> isWantedToken = t -> !(t instanceof NewlineToken);
+
+   // Wraps a BaseToken in a LookupToken, flagged with whether it may be used for lookup (see isValidLookup).
+   private final Function<BaseToken, LookupToken> toLookupToken = b -> new LookupToken( b, isValidLookup( b ) );
+
+
+   /**
+    * @param baseToken some token.
+    * @return true if the token is a word token of at least the minimum span whose part of speech
+    * is either unset or one of the wanted lookup parts of speech.
+    */
+   private boolean isValidLookup( final BaseToken baseToken ) {
+      // Only -words- can be looked up.
+      if ( !(baseToken instanceof WordToken) ) {
+         return false;
+      }
+      // Words must span at least the minimum number of characters.
+      final int span = baseToken.getEnd() - baseToken.getBegin();
+      if ( span < _minLookupSpan ) {
+         return false;
+      }
+      // A word without a part of speech is accepted, otherwise the part of speech must be wanted.
+      final String pos = baseToken.getPartOfSpeech();
+      if ( pos == null ) {
+         return true;
+      }
+      return _lookupPos.contains( pos );
+   }
+
+
+   /**
+    * @return a new LookupEngine.  One is created for each call.
+    */
+   private LookupEngine getLookupEngine() {
+      return new LookupEngine();
+   }
+
+
+   /**
+    * Reads an integer from a parameter value that may be an Integer or a String.
+    *
+    * @param value        parameter value.
+    * @param name         parameter name, used in the warning.
+    * @param defaultValue value to return when the parameter value cannot be used.
+    * @return the integer value, or the default (after logging a warning) if the value is not an integer.
+    */
+   static protected int parseInt( final Object value, final String name, final int defaultValue ) {
+      if ( value instanceof Integer ) {
+         return (Integer)value;
+      }
+      if ( value instanceof String ) {
+         try {
+            return Integer.parseInt( (String)value );
+         } catch ( NumberFormatException ignored ) {
+            // Not swallowed: falls through to the same warning used for any other unusable value.
+         }
+      }
+      LOGGER.warn( "Could not parse " + name + " " + value + " as an integer" );
+      return defaultValue;
+   }
+
+
+   /**
+    * @param text parameter text.
+    * @return true if the text is "yes" or "true", ignoring case.
+    */
+   static private boolean isTrue( final String text ) {
+      // Same test as isParameterTrue; delegate rather than repeat it.
+      return isParameterTrue( text );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AlikeSubsumingAnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,97 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+import org.apache.ctakes.dictionary.cased.wsd.WsdUtil;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+
+import java.util.*;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/19/2020
+ */
+@Immutable
+final public class AlikeSubsumingAnnotationCreator implements AnnotationCreator {
+
+   // log4j logger, registered under the simple class name.  NOTE(review): not used within this class.
+   static private final Logger LOGGER = Logger.getLogger( "AlikeSubsumingAnnotationCreator" );
+
+   /**
+    * Creates an annotation creator.  The class declares no instance fields.
+    */
+   public AlikeSubsumingAnnotationCreator() {
+   }
+
+
+   /**
+    * Creates annotations for discovered terms after removing, per semantic group,
+    * the terms reported as subsumed and the terms reported by word sense disambiguation.
+    * The collections in allDiscoveredTermsMap are modified: removed terms are taken out of them.
+    *
+    * @param jCas                  ye olde ...
+    * @param allDiscoveredTermsMap discovered terms, grouped by text span.
+    * @param termEncodingMap       encodings for each discovered term.
+    * @param reassignSemantics     TUI to Semantic Group overrides.
+    */
+   @Override
+   public void createAnnotations( final JCas jCas,
+                                  final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+                                  final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+
+      final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap
+            = AnnotationCreatorUtil.mapSemanticTerms( termEncodingMap, reassignSemantics );
+
+      final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap
+            = AnnotationCreatorUtil.mapTermSpans( allDiscoveredTermsMap );
+
+
+      for ( SemanticGroup subsumingGroup : SemanticGroup.values() ) {
+         final Collection<DiscoveredTerm> semanticTerms = semanticTermsMap.get( subsumingGroup );
+         if ( semanticTerms == null || semanticTerms.isEmpty() ) {
+            continue;
+         }
+
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedTermsMap
+               = getSubsumedSpanTerms( subsumingGroup, semanticTermsMap, termSpanMap );
+
+         for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> subsumedTerms : subsumedTermsMap.entrySet() ) {
+            removeSpanTerms( allDiscoveredTermsMap, subsumedTerms.getKey(), subsumedTerms.getValue() );
+            // Subsumed terms must be gone from the group before word sense disambiguation runs on it.
+            semanticTerms.removeAll( subsumedTerms.getValue() );
+         }
+
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> wsdedTermsMap
+               = WsdUtil.getSemanticWsdSpanTerms( semanticTerms, termSpanMap );
+
+         for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> wsdedTerms : wsdedTermsMap.entrySet() ) {
+            removeSpanTerms( allDiscoveredTermsMap, wsdedTerms.getKey(), wsdedTerms.getValue() );
+         }
+
+      }
+
+      allDiscoveredTermsMap.forEach(
+            ( k, v ) -> AnnotationCreatorUtil.createAnnotations( jCas, k, v, termEncodingMap, reassignSemantics ) );
+   }
+
+
+   /**
+    * Removes terms from the collection registered for a span, if there is one.
+    * Replaces getOrDefault( .., new HashSet ) which created a throwaway set for every unknown span.
+    *
+    * @param spanTermsMap discovered terms, grouped by text span.
+    * @param span         span at which the terms should be removed.
+    * @param terms        terms to remove.
+    */
+   static private void removeSpanTerms( final Map<Pair<Integer>, Collection<DiscoveredTerm>> spanTermsMap,
+                                        final MagicTextSpan span,
+                                        final Collection<DiscoveredTerm> terms ) {
+      final Collection<DiscoveredTerm> spanTerms = spanTermsMap.get( span.toIntPair() );
+      if ( spanTerms != null ) {
+         spanTerms.removeAll( terms );
+      }
+   }
+
+
+   /**
+    * Collects the terms of a semantic group that SubsumptionUtil reports as fully subsumed
+    * by other terms of the same group.
+    *
+    * @param subsumingGroup   semantic group of interest.
+    * @param semanticTermsMap discovered terms, grouped by semantic group.
+    * @param termSpanMap      text spans for each discovered term.
+    * @return map of text spans to subsumed terms.  Empty if the group has fewer than two spans.
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> getSubsumedSpanTerms(
+         final SemanticGroup subsumingGroup,
+         final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap,
+         final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+      // Spans of the group and the terms at each of those spans.
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> spanTermsMap
+            = SubsumptionUtil.mapSpanTerms( subsumingGroup, semanticTermsMap, termSpanMap );
+      if ( spanTermsMap.isEmpty() ) {
+         // No subsuming Spans.
+         return Collections.emptyMap();
+      }
+      if ( spanTermsMap.size() <= 1 ) {
+         // A single span has nothing to subsume.
+         return new HashMap<>();
+      }
+      // Spans ordered by end character index, as handed to SubsumptionUtil.
+      final List<MagicTextSpan> spansByEnd = new ArrayList<>( spanTermsMap.keySet() );
+      spansByEnd.sort( Comparator.comparingInt( MagicTextSpan::getEnd ) );
+      // Smaller terms of the same semantic group.
+      return new HashMap<>( SubsumptionUtil.mapFullySubsumedTermSpans( spansByEnd, spanTermsMap ) );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,25 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.uima.jcas.JCas;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+public interface AnnotationCreator {
+
+   void createAnnotations( final JCas jCas,
+                           final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+                           final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+                           final Map<SemanticTui, SemanticGroup> reassignSemantics );
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/AnnotationCreatorUtil.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,236 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.CodeSchema;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.ContiguousTextSpan;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+import org.apache.ctakes.dictionary.lookup2.util.TuiCodeUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+
+import java.util.*;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+
+
+/**
+ * Static helpers shared by {@link AnnotationCreator} implementations.  They regroup discovered terms by text span
+ * and by semantic group, and build the {@link IdentifiedAnnotation} and {@link UmlsConcept} objects stored in the cas.
+ * <p>
+ * NOTE(review): {@code @Immutable} is imported from {@code jdk.nashorn.internal}, a JDK-internal package that was
+ * removed together with Nashorn in JDK 15; consider replacing or dropping the annotation.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+@Immutable
+final public class AnnotationCreatorUtil {
+
+   static private final Logger LOGGER = Logger.getLogger( "AnnotationCreatorUtil" );
+
+   // Accepts encodings of the PREFERRED_TEXT schema; those hold a term's preferred text, not a code.
+   static private final Predicate<TermEncoding> isPrefTextEncoding
+         = CodeSchema.PREFERRED_TEXT::isSchema;
+
+   // Accepts encodings of the TUI schema; those hold a term's semantic type, not a code.
+   static private final Predicate<TermEncoding> isTuiEncoding = CodeSchema.TUI::isSchema;
+
+   private AnnotationCreatorUtil() {
+   }
+
+
+   /**
+    * Inverts a span-to-terms map into a term-to-spans map.
+    *
+    * @param allDiscoveredTermsMap discovered terms keyed by the begin,end offsets of their text span
+    * @return for each discovered term, the set of contiguous text spans at which it was found
+    */
+   static public Map<DiscoveredTerm, Collection<MagicTextSpan>> mapTermSpans(
+         final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap ) {
+      final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap = new HashMap<>();
+      for ( Map.Entry<Pair<Integer>, Collection<DiscoveredTerm>> spanTerms : allDiscoveredTermsMap.entrySet() ) {
+         final MagicTextSpan textSpan = new ContiguousTextSpan( spanTerms.getKey() );
+         spanTerms.getValue().forEach( t -> termSpanMap.computeIfAbsent( t, s -> new HashSet<>() ).add( textSpan ) );
+      }
+      return termSpanMap;
+   }
+
+
+   /**
+    * Creates one annotation per discovered term at the given span and adds each to the cas indexes.
+    *
+    * @param jcas              cas that receives the annotations
+    * @param textSpan          begin,end character offsets shared by all of the given terms
+    * @param discoveredTerms   terms found at the span
+    * @param termEncodingMap   encodings known for each discovered term.  A term without an entry is annotated
+    *                          with the unknown semantic group and no ontology concepts.
+    * @param reassignSemantics semantic groups to use for specific tuis in place of their default group
+    */
+   static public void createAnnotations( final JCas jcas,
+                                         final Pair<Integer> textSpan,
+                                         final Collection<DiscoveredTerm> discoveredTerms,
+                                         final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+                                         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      for ( DiscoveredTerm term : discoveredTerms ) {
+         final Collection<TermEncoding> termEncodings = termEncodingMap.get( term );
+         // A missing entry is handled like an empty one so that the UNKNOWN fallbacks apply instead of an NPE.
+         createAnnotation( jcas, textSpan, term,
+               termEncodings == null ? Collections.<TermEncoding>emptyList() : termEncodings,
+               reassignSemantics );
+      }
+   }
+
+   /**
+    * Creates a single annotation for a term, fills its ontology concepts and adds it to the cas indexes.
+    * The annotation type comes from the best semantic group among the term's tuis.  One {@link UmlsConcept} is
+    * created for each pairing of a tui with an encoding that is neither a tui nor a preferred text.
+    */
+   static private void createAnnotation( final JCas jcas,
+                                         final Pair<Integer> textSpan,
+                                         final DiscoveredTerm discoveredTerm,
+                                         final Collection<TermEncoding> termEncodings,
+                                         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final SemanticGroup bestGroup
+            = SemanticGroup.getBestGroup( getSemanticGroups( termEncodings, reassignSemantics ) );
+      final IdentifiedAnnotation annotation = bestGroup
+            .getCreator()
+            .apply( jcas );
+      annotation.setTypeID( bestGroup.getCode() );
+      annotation.setBegin( textSpan.getValue1() );
+      annotation.setEnd( textSpan.getValue2() );
+      annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
+
+      final String cui = CuiCodeUtil.getInstance().getAsCui( discoveredTerm.getCuiCode() );
+      Collection<String> tuis = getTuis( termEncodings );
+      if ( tuis.isEmpty() ) {
+         tuis = Collections.singletonList( SemanticTui.UNKNOWN.name() );
+      }
+      final String prefText = getPreferredText( termEncodings );
+
+      final Collection<UmlsConcept> umlsConcepts = new HashSet<>();
+      for ( String tui : tuis ) {
+         termEncodings.stream()
+                      .filter( isPrefTextEncoding.negate() )
+                      .filter( isTuiEncoding.negate() )
+                      .map( e -> createUmlsConcept( jcas, cui, tui, prefText, e ) )
+                      .forEach( umlsConcepts::add );
+      }
+      final FSArray conceptArr = new FSArray( jcas, umlsConcepts.size() );
+      int arrIdx = 0;
+      for ( UmlsConcept umlsConcept : umlsConcepts ) {
+         conceptArr.set( arrIdx, umlsConcept );
+         arrIdx++;
+      }
+      annotation.setOntologyConceptArr( conceptArr );
+      annotation.addToIndexes();
+      // Per-annotation trace: kept at debug level and only built on demand so normal runs are not flooded.
+      if ( LOGGER.isDebugEnabled() ) {
+         LOGGER.debug( "Created Annotation " + annotation.getCoveredText()
+                       + " of " + bestGroup.getName()
+                       + " with " + termEncodings.stream()
+                                                 .map( t -> t.getSchema() + " " + t.getSchemaCode() )
+                                                 .collect( Collectors.joining( ";" ) )
+                       + " tuis " + String.join( ",", tuis ) );
+      }
+   }
+
+
+   /**
+    * @param termEncodings encodings of a single term
+    * @return the distinct preferred texts among the encodings joined by ';', or an empty string if there are none
+    */
+   static private String getPreferredText( final Collection<TermEncoding> termEncodings ) {
+      return termEncodings.stream()
+                          .filter( CodeSchema.PREFERRED_TEXT::isSchema )
+                          .map( TermEncoding::getSchemaCode )
+                          .map( Object::toString )
+                          .distinct()
+                          .collect( Collectors.joining( ";" ) );
+   }
+
+
+   /**
+    * @param termEncodings encodings of a single term
+    * @return the distinct tui strings of the TUI encodings; empty when there is no TUI encoding
+    */
+   static private Collection<String> getTuis( final Collection<TermEncoding> termEncodings ) {
+      return termEncodings.stream()
+                          .filter( CodeSchema.TUI::isSchema )
+                          .map( TermEncoding::getSchemaCode )
+                          .map( AnnotationCreatorUtil::parseTuiValue )
+                          .map( TuiCodeUtil::getAsTui )
+                          .collect( Collectors.toSet() );
+   }
+
+
+   /**
+    * Builds a concept for one encoding of a term.
+    *
+    * @param jcas          cas in which the concept is created
+    * @param cui           cui of the term
+    * @param tui           tui to store on the concept; not set when null
+    * @param preferredText preferred text to store on the concept; not set when null or empty
+    * @param termEncoding  encoding that supplies the coding scheme and the code
+    * @return the new concept; it is not added to any index here
+    */
+   static private UmlsConcept createUmlsConcept( final JCas jcas,
+                                                 final String cui,
+                                                 final String tui,
+                                                 final String preferredText,
+                                                 final TermEncoding termEncoding ) {
+      final UmlsConcept umlsConcept = new UmlsConcept( jcas );
+      umlsConcept.setCodingScheme( termEncoding.getSchema() );
+      umlsConcept.setCui( cui );
+      if ( tui != null ) {
+         umlsConcept.setTui( tui );
+      }
+      if ( preferredText != null && !preferredText.isEmpty() ) {
+         umlsConcept.setPreferredText( preferredText );
+      }
+      umlsConcept.setCode( termEncoding.getSchemaCode().toString() );
+      return umlsConcept;
+   }
+
+
+   /**
+    * Groups discovered terms by semantic group.
+    *
+    * @param termEncodingMap   encodings known for each discovered term
+    * @param reassignSemantics semantic groups to use for specific tuis in place of their default group
+    * @return terms keyed by semantic group.  A term is listed under every group that one of its tuis maps to,
+    * and under the unknown group when it has no TUI encoding.
+    */
+   static public Map<SemanticGroup, Collection<DiscoveredTerm>> mapSemanticTerms(
+         final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermMap = new EnumMap<>( SemanticGroup.class );
+      for ( Map.Entry<DiscoveredTerm, Collection<TermEncoding>> discoveredEncodings : termEncodingMap.entrySet() ) {
+         getSemanticGroups( discoveredEncodings.getValue(), reassignSemantics )
+               .forEach( g -> semanticTermMap.computeIfAbsent( g, s -> new HashSet<>() )
+                                             .add( discoveredEncodings.getKey() ) );
+      }
+      return semanticTermMap;
+   }
+
+
+   /**
+    * @param termEncodings     encodings of a single term
+    * @param reassignSemantics semantic groups to use for specific tuis in place of their default group
+    * @return the semantic groups of the term's TUI encodings, or only the unknown group when it has none
+    */
+   static private Collection<SemanticGroup> getSemanticGroups(
+         final Collection<TermEncoding> termEncodings,
+         final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final Collection<SemanticGroup> groups = termEncodings.stream()
+                                                            .filter( CodeSchema.TUI::isSchema )
+                                                            .map( e -> getSemanticGroup( e, reassignSemantics ) )
+                                                            .collect( Collectors.toSet() );
+      if ( groups.isEmpty() ) {
+         return Collections.singletonList( SemanticGroup.UNKNOWN );
+      }
+      return groups;
+   }
+
+
+   /**
+    * Resolves the semantic group of a TUI encoding whose code may be an Integer or any object whose
+    * string form is a number.
+    */
+   static private SemanticGroup getSemanticGroup( final TermEncoding tuiEncoding,
+                                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final Object object = tuiEncoding.getSchemaCode();
+      if ( object instanceof Integer ) {
+         return getSemanticGroup( (Integer)object, reassignSemantics );
+      }
+      return getSemanticGroup( parseTuiValue( object ), reassignSemantics );
+   }
+
+   /**
+    * @param tuiCode           numeric tui code
+    * @param reassignSemantics semantic groups to use for specific tuis in place of their default group
+    * @return the reassigned group for the tui when one is configured, otherwise the tui's own group
+    */
+   static private SemanticGroup getSemanticGroup( final int tuiCode,
+                                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final SemanticTui tui = SemanticTui.getTui( tuiCode );
+      if ( !reassignSemantics.isEmpty() ) {
+         final SemanticGroup reassignGroup = reassignSemantics.get( tui );
+         if ( reassignGroup != null ) {
+            return reassignGroup;
+         }
+      }
+      return tui.getGroup();
+   }
+
+
+   /**
+    * @param object tui code in any form whose {@code toString()} is expected to be an integer
+    * @return the parsed code, or the code of the unknown tui when the text is not an integer
+    */
+   static private int parseTuiValue( final Object object ) {
+      try {
+         return Integer.parseInt( object.toString() );
+      } catch ( NumberFormatException nfE ) {
+         return SemanticTui.UNKNOWN.getCode();
+      }
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/NonSubsumingAnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,38 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * {@link AnnotationCreator} that performs no subsumption: every discovered term at every span is annotated.
+ * <p>
+ * NOTE(review): {@code @Immutable} is imported from {@code jdk.nashorn.internal}, a JDK-internal package that was
+ * removed together with Nashorn in JDK 15; consider replacing or dropping the annotation.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/26/2020
+ */
+@Immutable
+final public class NonSubsumingAnnotationCreator implements AnnotationCreator {
+
+   static private final Logger LOGGER = Logger.getLogger( "NonSubsumingAnnotationCreator" );
+
+   public NonSubsumingAnnotationCreator() {
+   }
+
+
+   /**
+    * Hands each span and its terms, unfiltered, to {@link AnnotationCreatorUtil#createAnnotations}.
+    *
+    * @param jCas                  cas that receives the annotations
+    * @param allDiscoveredTermsMap discovered terms keyed by the begin,end offsets of their text span
+    * @param termEncodingMap       encodings known for each discovered term
+    * @param reassignSemantics     semantic groups to use for specific tuis in place of their default group
+    */
+   @Override
+   public void createAnnotations( final JCas jCas,
+                                  final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+                                  final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      for ( Map.Entry<Pair<Integer>, Collection<DiscoveredTerm>> spanTerms : allDiscoveredTermsMap.entrySet() ) {
+         AnnotationCreatorUtil.createAnnotations(
+               jCas, spanTerms.getKey(), spanTerms.getValue(), termEncodingMap, reassignSemantics );
+      }
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SemanticSubsumingAnnotationCreator.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,163 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.core.util.annotation.SemanticTui;
+import org.apache.ctakes.dictionary.cased.encoder.TermEncoding;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+import org.apache.ctakes.dictionary.cased.wsd.WsdUtil;
+import org.apache.log4j.Logger;
+import org.apache.uima.jcas.JCas;
+
+import java.util.*;
+
+import static org.apache.ctakes.core.util.annotation.SemanticGroup.*;
+
+/**
+ * {@link AnnotationCreator} that discards subsumed terms before annotating.  A term can be subsumed by a term of
+ * its own semantic group, or by a term of a group that is registered in {@code SUBSUME_MAP} as subsuming its group.
+ * <p>
+ * NOTE(review): {@code @Immutable} is imported from {@code jdk.nashorn.internal}, a JDK-internal package that was
+ * removed together with Nashorn in JDK 15; consider replacing or dropping the annotation.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/19/2020
+ */
+@Immutable
+final public class SemanticSubsumingAnnotationCreator implements AnnotationCreator {
+
+   static private final Logger LOGGER = Logger.getLogger( "SemanticSubsumingAnnotationCreator" );
+
+   public SemanticSubsumingAnnotationCreator() {
+   }
+
+
+   // Subsuming group -> the other groups whose terms it is allowed to subsume.
+   static private final Map<SemanticGroup, Collection<SemanticGroup>> SUBSUME_MAP
+         = new EnumMap<>( SemanticGroup.class );
+
+   static {
+      SUBSUME_MAP.put( DRUG, EnumSet.of( LAB, PHENOMENON, ENTITY, EVENT ) );
+      SUBSUME_MAP.put( DISORDER, EnumSet.of( DRUG, FINDING, LAB, PHENOMENON, ENTITY, EVENT ) );
+      SUBSUME_MAP.put( FINDING, EnumSet.of( LAB, PHENOMENON, ENTITY, EVENT ) );
+      // "Oral Surgery"
+      SUBSUME_MAP.put( PROCEDURE, EnumSet.of( LAB, PHENOMENON, EVENT ) );
+      SUBSUME_MAP.put( ANATOMY, EnumSet.of( DRUG, DISORDER, FINDING, LAB, PHENOMENON, ENTITY ) );
+      // Rules below are disabled.
+//      SUBSUME_MAP.put( CLINICAL_ATTRIBUTE, EnumSet.of( ENTITY ) );
+      // may be wanted even within procedure, procedure probably wanted within device.  Maybe Anatomy?
+//      SUBSUME_MAP.put( DEVICE, EnumSet.of( ENTITY ) );
+//      SUBSUME_MAP.put( LAB, EnumSet.of( PHENOMENON, ENTITY, EVENT ) );
+//      SUBSUME_MAP.put( PHENOMENON, EnumSet.of( ENTITY ) );
+      // Groups without any rule:
+      //   SUBJECT
+      //   TITLE
+      //   EVENT
+      //   ENTITY
+      //   TIME
+      //   MODIFIER
+      //   LAB_MODIFIER
+   }
+
+
+   /**
+    * Removes subsumed terms, then terms reported by {@code WsdUtil.getSemanticWsdSpanTerms}, and annotates
+    * whatever remains.  Semantic groups are processed in their enum declaration order, and removals made for one
+    * group are visible to the groups processed after it.
+    *
+    * @param jCas                  cas that receives the annotations
+    * @param allDiscoveredTermsMap discovered terms keyed by the begin,end offsets of their text span.
+    *                              Removed terms are deleted from its value collections.
+    * @param termEncodingMap       encodings known for each discovered term
+    * @param reassignSemantics     semantic groups to use for specific tuis in place of their default group
+    */
+   @Override
+   public void createAnnotations( final JCas jCas,
+                                  final Map<Pair<Integer>, Collection<DiscoveredTerm>> allDiscoveredTermsMap,
+                                  final Map<DiscoveredTerm, Collection<TermEncoding>> termEncodingMap,
+                                  final Map<SemanticTui, SemanticGroup> reassignSemantics ) {
+      final Map<SemanticGroup, Collection<DiscoveredTerm>> groupTermsMap
+            = AnnotationCreatorUtil.mapSemanticTerms( termEncodingMap, reassignSemantics );
+      final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap
+            = AnnotationCreatorUtil.mapTermSpans( allDiscoveredTermsMap );
+
+      for ( SemanticGroup subsumingGroup : SemanticGroup.values() ) {
+         final Collection<DiscoveredTerm> groupTerms = groupTermsMap.get( subsumingGroup );
+         if ( groupTerms == null || groupTerms.isEmpty() ) {
+            continue;
+         }
+         final Collection<SemanticGroup> subsumedGroups
+               = SUBSUME_MAP.getOrDefault( subsumingGroup, Collections.emptyList() );
+
+         // Subsumption: drop the subsumed terms from their span, from this group and from the subsumed groups.
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedSpanTerms
+               = getSemanticSubsumedSpanTerms( subsumingGroup, subsumedGroups, groupTermsMap, termSpanMap );
+         for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> subsumed : subsumedSpanTerms.entrySet() ) {
+            final Collection<DiscoveredTerm> lostTerms = subsumed.getValue();
+            removeTerms( allDiscoveredTermsMap, subsumed.getKey().toIntPair(), lostTerms );
+            groupTerms.removeAll( lostTerms );
+            for ( SemanticGroup subsumedGroup : subsumedGroups ) {
+               removeTerms( groupTermsMap, subsumedGroup, lostTerms );
+            }
+         }
+
+         // WSD: drop the terms it reports from their span only.
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> wsdSpanTerms
+               = WsdUtil.getSemanticWsdSpanTerms( groupTerms, termSpanMap );
+         for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> wsded : wsdSpanTerms.entrySet() ) {
+            removeTerms( allDiscoveredTermsMap, wsded.getKey().toIntPair(), wsded.getValue() );
+         }
+      }
+
+      for ( Map.Entry<Pair<Integer>, Collection<DiscoveredTerm>> spanTerms : allDiscoveredTermsMap.entrySet() ) {
+         AnnotationCreatorUtil.createAnnotations(
+               jCas, spanTerms.getKey(), spanTerms.getValue(), termEncodingMap, reassignSemantics );
+      }
+   }
+
+
+   /**
+    * Removes terms from the collection stored under a key; does nothing when the key is absent.
+    */
+   static private void removeTerms( final Map<?, Collection<DiscoveredTerm>> termsMap,
+                                    final Object key,
+                                    final Collection<DiscoveredTerm> terms ) {
+      if ( termsMap.containsKey( key ) ) {
+         termsMap.get( key ).removeAll( terms );
+      }
+   }
+
+
+   /**
+    * Adds every term of the source map to the set stored under the same span in the target map,
+    * creating that set when the span is new to the target.
+    */
+   static private void addSpanTerms( final Map<MagicTextSpan, Collection<DiscoveredTerm>> target,
+                                     final Map<MagicTextSpan, Collection<DiscoveredTerm>> source ) {
+      for ( Map.Entry<MagicTextSpan, Collection<DiscoveredTerm>> spanTerms : source.entrySet() ) {
+         target.computeIfAbsent( spanTerms.getKey(), t -> new HashSet<>() ).addAll( spanTerms.getValue() );
+      }
+   }
+
+
+   /**
+    * Collects the terms subsumed by the terms of one semantic group.
+    *
+    * @param subsumingGroup   group whose spans do the subsuming
+    * @param subsumedGroups   other groups whose terms may be subsumed by the subsuming group
+    * @param semanticTermsMap discovered terms keyed by semantic group
+    * @param termSpanMap      text spans at which each discovered term was found
+    * @return subsumed terms keyed by their span: terms of the subsuming group at spans reported by
+    * {@code SubsumptionUtil.mapFullySubsumedTermSpans}, plus terms of the subsumed groups at spans reported by
+    * {@code SubsumptionUtil.mapSubsumedOrSameTermSpans} for the subsuming spans that were not themselves
+    * subsumed.  An immutable empty map when the subsuming group has no spans.
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> getSemanticSubsumedSpanTerms(
+         final SemanticGroup subsumingGroup,
+         final Collection<SemanticGroup> subsumedGroups,
+         final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap,
+         final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumedSpanTerms = new HashMap<>();
+      // Spans of the subsuming group with the group's terms at each span.
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumingSpanTerms
+            = SubsumptionUtil.mapSpanTerms( subsumingGroup, semanticTermsMap, termSpanMap );
+      if ( subsumingSpanTerms.isEmpty() ) {
+         return Collections.emptyMap();
+      }
+      // Spans of the subsuming group, in no particular order.
+      final List<MagicTextSpan> subsumingSpans = new ArrayList<>( subsumingSpanTerms.keySet() );
+      if ( subsumingSpanTerms.size() > 1 ) {
+         // Same group: spans subsumed by another span of the group.
+         subsumedSpanTerms.putAll(
+               SubsumptionUtil.mapFullySubsumedTermSpans( subsumingSpans, subsumingSpanTerms ) );
+         if ( subsumedGroups.isEmpty() ) {
+            return subsumedSpanTerms;
+         }
+         // A span that was itself subsumed does not subsume terms of the other groups.
+         subsumingSpans.removeAll( subsumedSpanTerms.keySet() );
+      }
+
+      // Other groups: gather all of their spans and terms in a single map.
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> candidateSpanTerms = new HashMap<>();
+      for ( SemanticGroup subsumedGroup : subsumedGroups ) {
+         addSpanTerms( candidateSpanTerms,
+               SubsumptionUtil.mapSpanTerms( subsumedGroup, semanticTermsMap, termSpanMap ) );
+      }
+      if ( candidateSpanTerms.isEmpty() ) {
+         return subsumedSpanTerms;
+      }
+      addSpanTerms( subsumedSpanTerms,
+            SubsumptionUtil.mapSubsumedOrSameTermSpans( subsumingSpans, candidateSpanTerms ) );
+      return subsumedSpanTerms;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/annotation/SubsumptionUtil.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,118 @@
+package org.apache.ctakes.dictionary.cased.annotation;
+
+
+import jdk.nashorn.internal.ir.annotations.Immutable;
+import org.apache.ctakes.core.util.annotation.SemanticGroup;
+import org.apache.ctakes.dictionary.cased.lookup.DiscoveredTerm;
+import org.apache.ctakes.dictionary.cased.util.textspan.MagicTextSpan;
+
+import java.util.*;
+
+/**
+ * Static helpers that find which text spans, and the terms at them, are subsumed by other text spans.
+ * <p>
+ * NOTE(review): {@code @Immutable} is imported from {@code jdk.nashorn.internal}, a JDK-internal package that was
+ * removed together with Nashorn in JDK 15; consider replacing or dropping the annotation.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/24/2020
+ */
+@Immutable
+final public class SubsumptionUtil {
+
+
+   private SubsumptionUtil() {
+   }
+
+   /**
+    * @param semanticGroup    group of interest
+    * @param semanticTermsMap discovered terms keyed by semantic group
+    * @param termSpanMap      text spans at which each discovered term was found
+    * @return the group's terms keyed by span; an immutable empty map when the group has no terms
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> mapSpanTerms(
+         final SemanticGroup semanticGroup,
+         final Map<SemanticGroup, Collection<DiscoveredTerm>> semanticTermsMap,
+         final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+      final Collection<DiscoveredTerm> groupTerms = semanticTermsMap.get( semanticGroup );
+      if ( groupTerms != null && !groupTerms.isEmpty() ) {
+         return mapSpanTerms( groupTerms, termSpanMap );
+      }
+      return Collections.emptyMap();
+   }
+
+   /**
+    * @param discoveredTerms terms of interest
+    * @param termSpanMap     text spans at which each discovered term was found
+    * @return the given terms keyed by span; terms that have no entry in the span map are left out
+    */
+   static private Map<MagicTextSpan, Collection<DiscoveredTerm>> mapSpanTerms(
+         final Collection<DiscoveredTerm> discoveredTerms,
+         final Map<DiscoveredTerm, Collection<MagicTextSpan>> termSpanMap ) {
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> spanTerms = new HashMap<>();
+      for ( DiscoveredTerm term : discoveredTerms ) {
+         final Collection<MagicTextSpan> termSpans = termSpanMap.get( term );
+         if ( termSpans != null ) {
+            for ( MagicTextSpan termSpan : termSpans ) {
+               spanTerms.computeIfAbsent( termSpan, s -> new HashSet<>() ).add( term );
+            }
+         }
+      }
+      return spanTerms;
+   }
+
+
+   /**
+    * @param subsumingSpans         spans that may subsume others
+    * @param subsumableSpanTermsMap terms keyed by the spans that may be subsumed
+    * @return a new map with the entries of the subsumable map whose span satisfies
+    * {@code fullyContainsAll} for at least one subsuming span
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> mapFullySubsumedTermSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumableSpanTermsMap ) {
+      final List<MagicTextSpan> candidateSpans = new ArrayList<>( subsumableSpanTermsMap.keySet() );
+      return copyRetaining( subsumableSpanTermsMap, getFullySubsumedSpans( subsumingSpans, candidateSpans ) );
+   }
+
+   /**
+    * @param subsumingSpans         spans that may subsume others
+    * @param subsumableSpanTermsMap terms keyed by the spans that may be subsumed
+    * @return a new map with the entries of the subsumable map whose span satisfies
+    * {@code containsAll} for at least one subsuming span
+    */
+   static public Map<MagicTextSpan, Collection<DiscoveredTerm>> mapSubsumedOrSameTermSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> subsumableSpanTermsMap ) {
+      final List<MagicTextSpan> candidateSpans = new ArrayList<>( subsumableSpanTermsMap.keySet() );
+      return copyRetaining( subsumableSpanTermsMap, getSubsumedOrSameSpans( subsumingSpans, candidateSpans ) );
+   }
+
+   /**
+    * @param spanTermsMap terms keyed by span
+    * @param spansToKeep  spans whose entries are wanted
+    * @return a copy of the map that holds only the entries for the wanted spans; the values are shared, not copied
+    */
+   static private Map<MagicTextSpan, Collection<DiscoveredTerm>> copyRetaining(
+         final Map<MagicTextSpan, Collection<DiscoveredTerm>> spanTermsMap,
+         final Collection<MagicTextSpan> spansToKeep ) {
+      final Map<MagicTextSpan, Collection<DiscoveredTerm>> retained = new HashMap<>( spanTermsMap );
+      retained.keySet().retainAll( spansToKeep );
+      return retained;
+   }
+
+   /**
+    * Finds the spans covered by a more specific variation: "cancer" within "colon cancer".
+    * The test is {@code MagicTextSpan.fullyContainsAll}, which by the naming of this method is containment
+    * rather than overlap - TODO confirm whether it excludes an identical span.
+    * Every pair is tested, so neither list needs to be sorted.
+    *
+    * @param subsumingSpans        spans that may subsume others
+    * @param possiblySubsumedSpans spans that may be subsumed
+    * @return the possibly subsumed spans for which some subsuming span passes the test
+    */
+   static private Collection<MagicTextSpan> getFullySubsumedSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final List<MagicTextSpan> possiblySubsumedSpans ) {
+      final Collection<MagicTextSpan> subsumedSpans = new HashSet<>();
+      for ( MagicTextSpan candidate : possiblySubsumedSpans ) {
+         for ( MagicTextSpan subsumingSpan : subsumingSpans ) {
+            if ( subsumingSpan.fullyContainsAll( candidate ) ) {
+               subsumedSpans.add( candidate );
+               // One subsuming span is enough.
+               break;
+            }
+         }
+      }
+      return subsumedSpans;
+   }
+
+
+   /**
+    * Finds the spans covered by a more specific variation: "headache" within "headache medicine".
+    * The test is {@code MagicTextSpan.containsAll}, which by the naming of this method also accepts
+    * an identical span - TODO confirm.
+    * Every pair is tested, so neither list needs to be sorted.
+    *
+    * @param subsumingSpans        spans that may subsume others
+    * @param possiblySubsumedSpans spans that may be subsumed
+    * @return the possibly subsumed spans for which some subsuming span passes the test
+    */
+   static public Collection<MagicTextSpan> getSubsumedOrSameSpans(
+         final List<MagicTextSpan> subsumingSpans,
+         final List<MagicTextSpan> possiblySubsumedSpans ) {
+      final Collection<MagicTextSpan> subsumedSpans = new HashSet<>();
+      for ( MagicTextSpan candidate : possiblySubsumedSpans ) {
+         for ( MagicTextSpan subsumingSpan : subsumingSpans ) {
+            if ( subsumingSpan.containsAll( candidate ) ) {
+               subsumedSpans.add( candidate );
+               // One subsuming span is enough.
+               break;
+            }
+         }
+      }
+      return subsumedSpans;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,102 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.cased.util.bsv.BsvFileParser;
+import org.apache.ctakes.dictionary.cased.util.bsv.BsvObjectCreator;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTerm;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTermMapper;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+
+/**
+ * Dictionary populated from a bar-separated-value (bsv) file.
+ * The file is parsed once at construction and all lookups are delegated to an {@link InMemoryDictionary}.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+final public class BsvDictionary implements CasedDictionary {
+
+   static public final String DICTIONARY_TYPE = "BSV";
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvDictionary" );
+
+   final private CasedDictionary _delegateDictionary;
+
+   /**
+    * Uses the value of the setting named <code>name + "_file"</code> as the path to the bsv file.
+    *
+    * @param name        unique name for dictionary
+    * @param uimaContext context handed to {@link EnvironmentVariable#getEnv} to resolve the file path
+    */
+   public BsvDictionary( final String name, final UimaContext uimaContext ) {
+      this( name, EnvironmentVariable.getEnv( name + "_file", uimaContext ) );
+   }
+
+   /**
+    * Parses the bsv file and builds the upper, mixed and lower case term maps for the delegate dictionary.
+    * If the file cannot be read the error is logged and the dictionary is left without terms.
+    *
+    * @param name    unique name for dictionary
+    * @param bsvPath path to bsv file containing synonyms and cuis
+    */
+   public BsvDictionary( final String name, final String bsvPath ) {
+      final Collection<TokenizedTerm> tokenizedTerms = parseBsvFile( bsvPath );
+      final Map<String, Collection<CandidateTerm>> upperWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> mixedWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> lowerWordTermMap = new HashMap<>();
+      TokenizedTermMapper.createTermMap( tokenizedTerms, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+      _delegateDictionary = new InMemoryDictionary( name, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _delegateDictionary.getName();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      return _delegateDictionary.getCandidateTerms( lookupToken );
+   }
+
+   /**
+    * Create a collection of {@link TokenizedTerm} Objects
+    * by parsing a bsv file.  Each row of the file must have exactly two columns:
+    * <p>
+    * CUI|Text
+    * </p>
+    *
+    * @param bsvFilePath path to file containing term rows and bsv columns
+    * @return collection of all valid terms read from the bsv file, or an empty list if the file could not be read
+    */
+   static private Collection<TokenizedTerm> parseBsvFile( final String bsvFilePath ) {
+      try {
+         return BsvFileParser.parseBsvFile( bsvFilePath, new TokenizedTermCreator() );
+      } catch ( IOException ioE ) {
+         // Best effort: the dictionary is still created, just without terms.
+         // Log the path and the full exception so that the cause of an empty dictionary can be traced.
+         LOGGER.error( "Could not parse bsv file " + bsvFilePath + " : " + ioE.getMessage(), ioE );
+      }
+      return Collections.emptyList();
+   }
+
+
+   /**
+    * Creates a {@link TokenizedTerm} from the two columns of a bsv row.
+    */
+   static private class TokenizedTermCreator implements BsvObjectCreator<TokenizedTerm> {
+      /**
+       * @param columns the columns of a single bsv row
+       * @return a term built from the trimmed columns, or null if the row does not have exactly two columns
+       */
+      @Override
+      public TokenizedTerm createBsvObject( final String[] columns ) {
+         if ( columns.length != 2 ) {
+            return null;
+         }
+         return new TokenizedTerm( columns[ 0 ].trim(), columns[ 1 ].trim() );
+      }
+   }
+
+
+}
+

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/BsvListDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,96 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.core.util.StringUtil;
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTerm;
+import org.apache.ctakes.dictionary.cased.util.tokenize.TokenizedTermMapper;
+import org.apache.ctakes.utils.env.EnvironmentVariable;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.util.*;
+
+
+/**
+ * Dictionary populated from a single String holding a bar-separated list of terms.
+ * The list is parsed once at construction and all lookups are delegated to an {@link InMemoryDictionary}.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+final public class BsvListDictionary implements CasedDictionary {
+
+   static public final String DICTIONARY_TYPE = "BSV_LIST";
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvListDictionary" );
+
+   final private CasedDictionary _delegateDictionary;
+
+   /**
+    * Uses the value of the setting named <code>name + "_list"</code> as the term list.
+    *
+    * @param name        unique name for dictionary
+    * @param uimaContext context handed to {@link EnvironmentVariable#getEnv} to resolve the term list
+    */
+   public BsvListDictionary( final String name, final UimaContext uimaContext ) {
+      this( name, EnvironmentVariable.getEnv( name + "_list", uimaContext ) );
+   }
+
+   /**
+    * Parses the term list and builds the upper, mixed and lower case term maps for the delegate dictionary.
+    *
+    * @param name    unique name for dictionary
+    * @param bsvList list containing synonyms and cuis
+    */
+   public BsvListDictionary( final String name, final String bsvList ) {
+      final Collection<TokenizedTerm> tokenizedTerms = parseList( name, bsvList );
+      LOGGER.info( "Parsed " + tokenizedTerms.size() + " terms for dictionary " + name );
+      final Map<String, Collection<CandidateTerm>> upperWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> mixedWordTermMap = new HashMap<>();
+      final Map<String, Collection<CandidateTerm>> lowerWordTermMap = new HashMap<>();
+      TokenizedTermMapper.createTermMap( tokenizedTerms, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+      _delegateDictionary = new InMemoryDictionary( name, upperWordTermMap, mixedWordTermMap, lowerWordTermMap );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _delegateDictionary.getName();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      return _delegateDictionary.getCandidateTerms( lookupToken );
+   }
+
+   /**
+    * Create a collection of {@link TokenizedTerm} Objects by parsing a list held in a single String.
+    * Terms are separated by '|' and each term is a pair separated by ':' :
+    * <p>
+    * Key : Value | Key : Value | ...
+    * </p>
+    * Key and Value are trimmed and passed to {@link TokenizedTerm} in that order.
+    * Entries that do not split into exactly two parts are logged and skipped.
+    *
+    * @param name     name of the dictionary, only used in log messages
+    * @param termList list containing synonyms and cuis
+    * @return collection of all valid terms read from the list, empty if the list is null or empty
+    */
+   static private Collection<TokenizedTerm> parseList( final String name, final String termList ) {
+      // A list that was never configured may arrive as null; treat it like an empty list instead of failing.
+      if ( termList == null || termList.isEmpty() ) {
+         LOGGER.error( "List of terms is empty for " + name );
+         return Collections.emptyList();
+      }
+      final Collection<TokenizedTerm> tokenizedTerms = new HashSet<>();
+      for ( String term : StringUtil.fastSplit( termList, '|' ) ) {
+         final String[] keyValue = StringUtil.fastSplit( term, ':' );
+         if ( keyValue.length != 2 ) {
+            LOGGER.warn( "Improper Key : Value pair for Dictionary Term " + term );
+            continue;
+         }
+         tokenizedTerms.add( new TokenizedTerm( keyValue[ 0 ].trim(), keyValue[ 1 ].trim() ) );
+      }
+      return tokenizedTerms;
+   }
+
+
+}
+

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/CasedDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,35 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+
+import java.util.Collection;
+
+/**
+ * A dictionary whose terms are looked up by the most rare word that they contain.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+public interface CasedDictionary {
+
+   /**
+    * Dictionaries are kept in a collection and told apart by Type identifier and Name,
+    * so where possible no two dictionaries should share the same Type and Name combination.
+    *
+    * @return simple name for the dictionary
+    */
+   String getName();
+
+   /**
+    * Find the dictionary terms in which a single token appears.  A token may appear in no term, one term or
+    * several terms, either in its -own- form or as an alternate canonical variant; both are checked.
+    *
+    * @param lookupToken a single-word token
+    * @return zero or more terms that contain the lookup token
+    */
+   Collection<CandidateTerm> getCandidateTerms( LookupToken lookupToken );
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/DictionaryStore.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,42 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+/**
+ * Singleton holder for the dictionaries that have been registered.
+ * Dictionaries are identified by name; a second dictionary with an already registered name is rejected.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+public enum DictionaryStore {
+   INSTANCE;
+
+   /**
+    * @return the single store instance
+    */
+   static public DictionaryStore getInstance() {
+      return INSTANCE;
+   }
+
+
+   // Every read and write of this list is guarded by synchronizing on the list itself.
+   private final Collection<CasedDictionary> _dictionaries = new ArrayList<>();
+
+   /**
+    * Register a dictionary unless one with the same name is already present.
+    *
+    * @param dictionary dictionary to register
+    * @return true if the dictionary was added, false if a dictionary with the same name already exists
+    */
+   public boolean addDictionary( final CasedDictionary dictionary ) {
+      final String name = dictionary.getName();
+      synchronized ( _dictionaries ) {
+         final boolean present = _dictionaries.stream()
+                                              .map( CasedDictionary::getName )
+                                              .anyMatch( name::equals );
+         if ( present ) {
+            // Dictionary with given name already exists.
+            return false;
+         }
+         _dictionaries.add( dictionary );
+         return true;
+      }
+   }
+
+
+   /**
+    * @return a snapshot of the registered dictionaries, in registration order.
+    * Changes made to the returned collection do not affect the store; use {@link #addDictionary} to register.
+    */
+   public Collection<CasedDictionary> getDictionaries() {
+      // Copy under the same lock used by addDictionary so that a caller iterating the result
+      // cannot collide with a concurrent registration.
+      synchronized ( _dictionaries ) {
+         return new ArrayList<>( _dictionaries );
+      }
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/InMemoryDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,70 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Dictionary backed by three in-memory maps, one each for uppercase, mixed case and lowercase index words.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/13/2020
+ */
+final public class InMemoryDictionary implements CasedDictionary {
+
+   private final String _name;
+
+   // Map of rare tokens to terms that contain those tokens.  Used like "First Word Token Lookup" but faster.
+   // Consulted with the original token text when the token is all uppercase.
+   private final Map<String, Collection<CandidateTerm>> _upperTermMap;
+   // Same kind of map, consulted with the original token text when the token is neither all upper nor all lower.
+   private final Map<String, Collection<CandidateTerm>> _mixedTermMap;
+   // Same kind of map, consulted with the lowercased token text as the fallback for every token.
+   private final Map<String, Collection<CandidateTerm>> _lowerTermMap;
+
+   /**
+    * The maps are stored as given, not copied.
+    *
+    * @param name         unique name for dictionary
+    * @param upperTermMap Map with a case-sensitive Rare Word (tokens) as key, and CandidateTerm Collection as value
+    * @param mixedTermMap Map with a case-sensitive Rare Word (tokens) as key, and CandidateTerm Collection as value
+    * @param lowerTermMap Map with a lowercase Rare Word (tokens) as key, and CandidateTerm Collection as value
+    */
+   public InMemoryDictionary( final String name,
+                              final Map<String, Collection<CandidateTerm>> upperTermMap,
+                              final Map<String, Collection<CandidateTerm>> mixedTermMap,
+                              final Map<String, Collection<CandidateTerm>> lowerTermMap ) {
+      _name = name;
+      _upperTermMap = upperTermMap;
+      _mixedTermMap = mixedTermMap;
+      _lowerTermMap = lowerTermMap;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+   /**
+    * {@inheritDoc}
+    * <p>
+    * A case-specific match (uppercase or mixed case) takes precedence; the lowercase map is only consulted
+    * when the case-specific map has no entry for the token.  Never returns null.
+    * </p>
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      if ( lookupToken.isAllUpperCase() ) {
+         final Collection<CandidateTerm> cased = _upperTermMap.get( lookupToken.getText() );
+         if ( cased != null ) {
+            return cased;
+         }
+      } else if ( !lookupToken.isAllLowerCase() ) {
+         final Collection<CandidateTerm> mixed = _mixedTermMap.get( lookupToken.getText() );
+         if ( mixed != null ) {
+            return mixed;
+         }
+      }
+      final Collection<CandidateTerm> lower = _lowerTermMap.get( lookupToken.getLowerText() );
+      if ( lower == null ) {
+         // The interface promises "zero or more terms", so a miss is reported as an empty collection, not null.
+         return java.util.Collections.emptyList();
+      }
+      return lower;
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java?rev=1881994&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/cased/dictionary/JdbcDictionary.java Fri Sep 25 00:59:37 2020
@@ -0,0 +1,236 @@
+package org.apache.ctakes.dictionary.cased.dictionary;
+
+
+import org.apache.ctakes.dictionary.cased.lookup.CandidateTerm;
+import org.apache.ctakes.dictionary.cased.lookup.LookupToken;
+import org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+import static org.apache.ctakes.dictionary.cased.table.column.Synonym.*;
+import static org.apache.ctakes.dictionary.cased.util.jdbc.JdbcUtil.*;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 8/14/2020
+ */
+final public class JdbcDictionary implements CasedDictionary {
+
+   static public final String DICTIONARY_TYPE = "JDBC";
+
+   static private final Logger LOGGER = Logger.getLogger( "JdbcDictionary" );
+
+   static private final String snomed_rxnorm_2020aa_url
+         = "jdbc:hsqldb:file:resources/org/apache/ctakes/dictionary/lookup/cased/sno_rx_2020aa/sno_rx_2020aa";
+   static private final String snomed_rxnorm_2020aa_driver = "org.hsqldb.jdbcDriver";
+   static private final String snomed_rxnorm_2020aa_user = "sa";
+   static private final String snomed_rxnorm_2020aa_pass = "";
+
+   private final String _name;
+
+
+   private final PreparedStatement _selectUpperCall;
+   private final PreparedStatement _selectMixedCall;
+   private final PreparedStatement _selectLowerCall;
+
+
+   /**
+    * @param name        unique name for dictionary
+    * @param uimaContext -
+    */
+   public JdbcDictionary( final String name, final UimaContext uimaContext ) throws SQLException {
+      this( name,
+            getParameterValue( name, "driver", uimaContext, HSQL_DRIVER ),
+            getParameterValue( name, "url", uimaContext, "" ),
+            getParameterValue( name, "upper", uimaContext, UPPER_TABLE ),
+            getParameterValue( name, "mixed", uimaContext, MIXED_TABLE ),
+            getParameterValue( name, "lower", uimaContext, LOWER_TABLE ),
+            getParameterValue( name, "user", uimaContext, DEFAULT_USER ),
+            getParameterValue( name, "pass", uimaContext, DEFAULT_PASS ) );
+   }
+
+   /**
+    * @param name       unique name for dictionary
+    * @param jdbcDriver -
+    * @param jdbcUrl    -
+    * @param upperName  Name of table containing uppercase-only terms
+    * @param mixedName  Name of table containing mixed case terms
+    * @param lowerName  Name of table containing lowercase-only terms
+    * @param jdbcUser   -
+    * @param jdbcPass   -
+    */
+   public JdbcDictionary( final String name,
+                          final String jdbcDriver,
+                          final String jdbcUrl,
+                          final String upperName,
+                          final String mixedName,
+                          final String lowerName,
+                          final String jdbcUser,
+                          final String jdbcPass ) throws SQLException {
+      _name = name;
+      _selectUpperCall = JdbcUtil.createPreparedStatement( name,
+            jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, upperName, INDEX_WORD.name() );
+      LOGGER.info( "Connected to " + name + " table " + upperName );
+      _selectMixedCall = JdbcUtil.createPreparedStatement( name,
+            jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, mixedName, INDEX_WORD.name() );
+      LOGGER.info( "Connected to " + name + " table " + mixedName );
+      _selectLowerCall = JdbcUtil.createPreparedStatement( name,
+            jdbcDriver, jdbcUrl, jdbcUser, jdbcPass, lowerName, INDEX_WORD.name() );
+      LOGGER.info( "Connected to " + name + " table " + lowerName );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<CandidateTerm> getCandidateTerms( final LookupToken lookupToken ) {
+      final Collection<CandidateTerm> candidates = new HashSet<>();
+      if ( lookupToken.isAllUpperCase() ) {
+         candidates.addAll( getUpperTerms( lookupToken.getText() ) );
+//         final Collection<CandidateTerm> cased = getUpperTerms( lookupToken.getText() );
+//         if ( !cased.isEmpty() ) {
+//            LOGGER.info( "Token " + lookupToken.getText() + " UPPER " + cased.stream()
+//                                                                             .map( CandidateTerm::getTokens )
+//                                                                             .map( t -> String.join( " ", t ) )
+//                                                                             .collect( Collectors.joining( " ; " ) ) );
+//            return cased;
+//         }
+      }
+      if ( !lookupToken.isAllLowerCase() ) {
+         candidates.addAll( getMixedTerms( lookupToken.getText() ) );
+//         final Collection<CandidateTerm> mixed = getMixedTerms( lookupToken.getText() );
+//         if ( !mixed.isEmpty() ) {
+//            LOGGER.info( "Token " + lookupToken.getText() + " MIXED " + mixed.stream()
+//                                                                             .map( CandidateTerm::getTokens )
+//                                                                             .map( t -> String.join( " ", t ) )
+//                                                                             .collect( Collectors.joining( " ; " ) ) );
+//            return mixed;
+//         }
+      }
+      candidates.addAll( getLowerTerms( lookupToken.getLowerText() ) );
+//      final Collection<CandidateTerm> lower = getLowerTerms( lookupToken.getLowerText() );
+//      if ( !lower.isEmpty() ) {
+//         LOGGER.info( "Token " + lookupToken.getText() + " LOWER " + lower.stream()
+//                                                                          .map( CandidateTerm::getTokens )
+//                                                                          .map( t -> String.join( " ", t ) )
+//                                                                          .collect( Collectors.joining( " ; " ) ) );
+//         return lower;
+//      }
+//      LOGGER.info( "Token " + lookupToken.getText() + " NOTHING " );
+
+      //      return getLowerTerms( lookupToken.getLowerText() );
+      return candidates;
+   }
+
+
+   /**
+    * @param text to lookup
+    * @return uppercase candidate terms
+    */
+   public Collection<CandidateTerm> getUpperTerms( final String text ) {
+      final List<CandidateTerm> candidateTerms = new ArrayList<>();
+      try {
+         JdbcUtil.fillSelectCall( _selectUpperCall, text );
+         final ResultSet resultSet = _selectUpperCall.executeQuery();
+         while ( resultSet.next() ) {
+            final CandidateTerm candidateTerm = new CandidateTerm(
+                  resultSet.getLong( CUI.getColumn() ),
+                  resultSet.getString( PREFIX.getColumn() ),
+                  resultSet.getString( INDEX_WORD.getColumn() ),
+                  resultSet.getString( SUFFIX.getColumn() ),
+                  true,
+                  false,
+                  resultSet.getInt( RANK.getColumn() ),
+                  resultSet.getInt( INSTANCES.getColumn() ) );
+            candidateTerms.add( candidateTerm );
+         }
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ...  historically some drivers have not done so
+         resultSet.close();
+      } catch ( SQLException e ) {
+         LOGGER.error( e.getMessage() );
+      }
+      return candidateTerms;
+   }
+
+   /**
+    * @param text to lookup
+    * @return mixed case candidate terms
+    */
+   public Collection<CandidateTerm> getMixedTerms( final String text ) {
+      final List<CandidateTerm> candidateTerms = new ArrayList<>();
+      try {
+         JdbcUtil.fillSelectCall( _selectMixedCall, text );
+         final ResultSet resultSet = _selectMixedCall.executeQuery();
+         while ( resultSet.next() ) {
+            final CandidateTerm candidateTerm = new CandidateTerm(
+                  resultSet.getLong( CUI.getColumn() ),
+                  resultSet.getString( PREFIX.getColumn() ),
+                  resultSet.getString( INDEX_WORD.getColumn() ),
+                  resultSet.getString( SUFFIX.getColumn() ),
+                  false,
+                  false,
+                  resultSet.getInt( RANK.getColumn() ),
+                  resultSet.getInt( INSTANCES.getColumn() ) );
+            candidateTerms.add( candidateTerm );
+         }
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ...  historically some drivers have not done so
+         resultSet.close();
+      } catch ( SQLException e ) {
+         LOGGER.error( e.getMessage() );
+      }
+      return candidateTerms;
+   }
+
+
+   /**
+    * @param text to lookup
+    * @return lowercase candidate terms
+    */
+   public Collection<CandidateTerm> getLowerTerms( final String text ) {
+      final List<CandidateTerm> candidateTerms = new ArrayList<>();
+      try {
+         JdbcUtil.fillSelectCall( _selectLowerCall, text );
+         final ResultSet resultSet = _selectLowerCall.executeQuery();
+         while ( resultSet.next() ) {
+            final CandidateTerm candidateTerm = new CandidateTerm(
+                  resultSet.getLong( CUI.getColumn() ),
+                  resultSet.getString( PREFIX.getColumn() ),
+                  resultSet.getString( INDEX_WORD.getColumn() ),
+                  resultSet.getString( SUFFIX.getColumn() ),
+                  false,
+                  true,
+                  resultSet.getInt( RANK.getColumn() ),
+                  resultSet.getInt( INSTANCES.getColumn() ) );
+            candidateTerms.add( candidateTerm );
+         }
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ...  historically some drivers have not done so
+         resultSet.close();
+      } catch ( SQLException e ) {
+         LOGGER.error( e.getMessage() );
+      }
+      return candidateTerms;
+   }
+
+
+}



Mime
View raw message