ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1771026 - /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
Date Wed, 23 Nov 2016 18:57:49 GMT
Author: seanfinan
Date: Wed Nov 23 18:57:49 2016
New Revision: 1771026

URL: http://svn.apache.org/viewvc?rev=1771026&view=rev
Log:
improved Subsumption for sections

Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java?rev=1771026&r1=1771025&r2=1771026&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java Wed Nov 23 18:57:49 2016
@@ -225,8 +225,7 @@ abstract public class RegexSectionizer e
          docSegment.addToIndexes();
          return;
       }
-      final List<Pair<Integer>> boundsList = new ArrayList<>( sectionTags.keySet()
);
-      boundsList.sort( ( p1, p2 ) -> p1.getValue1() - p2.getValue2() );
+      final List<Pair<Integer>> boundsList = createBoundsList( sectionTags.keySet()
);
       Pair<Integer> leftBounds = boundsList.get( 0 );
       int sectionEnd;
       if ( leftBounds.getValue1() > 0 ) {
@@ -259,7 +258,7 @@ abstract public class RegexSectionizer e
             // Section has no text, parsing would be pointless
             continue;
          }
-         while ( docText.charAt( sectionBegin ) == ' ' ) {
+         while ( Character.isWhitespace( docText.charAt( sectionBegin ) ) ) {
             sectionBegin++;
          }
          final SectionTag leftTag = sectionTags.get( leftBounds );
@@ -277,6 +276,32 @@ abstract public class RegexSectionizer e
       }
    }
 
+
+   /**
+    * Builds a list of bounds sorted by their first value, with bounds that are subsumed by another removed.
+    * The first value of each pair is treated as a begin offset and the second as an end offset.
+    *
+    * @param bounds pairs to sort and filter; the given collection itself is not modified
+    * @return new list of the bounds in ascending order of first value, without the bounds marked as subsumed
+    */
+   static private List<Pair<Integer>> createBoundsList( final Collection<Pair<Integer>> bounds ) {
+      final List<Pair<Integer>> boundsList = new ArrayList<>( bounds );
+      // Sort on the first value of both pairs.  Comparing value1 against value2 is not a consistent ordering,
+      // and the early break in the inner loop below relies on ascending first values.
+      // Integer.compare also avoids the overflow that subtraction can produce.
+      boundsList.sort( ( p1, p2 ) -> Integer.compare( p1.getValue1(), p2.getValue1() ) );
+      // Removal is deferred until after the scan so that indices stay stable while iterating.
+      final Collection<Pair<Integer>> removalBounds = new HashSet<>();
+      for ( int i = 0; i < boundsList.size() - 1; i++ ) {
+         final Pair<Integer> pairI = boundsList.get( i );
+         for ( int j = i + 1; j < boundsList.size(); j++ ) {
+            final Pair<Integer> pairJ = boundsList.get( j );
+            if ( pairJ.getValue1() >= pairI.getValue2() ) {
+               // pairJ starts at or after the end of pairI, and so does every pair sorted after it
+               break;
+            }
+            if ( pairI.getValue2() >= pairJ.getValue2() ) {
+               // pairJ lies within pairI
+               // NOTE(review): this break stops after the first subsumed pair, so later pairs that also
+               // lie within pairI are not removed here - confirm whether that is intended
+               removalBounds.add( pairJ );
+               break;
+            } else if ( pairI.getValue1() >= pairJ.getValue1() && pairJ.getValue2() > pairI.getValue2() ) {
+               // same first value but pairJ extends further, so pairI lies within pairJ
+               removalBounds.add( pairI );
+               break;
+            }
+         }
+      }
+      boundsList.removeAll( removalBounds );
+      return boundsList;
+   }
+
+
    /**
     * @param text -
     * @return true if the text to lower case is "true" or "false"



Mime
View raw message