ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1768376 [2/2] - in /ctakes/trunk/ctakes-core/src: main/java/org/apache/ctakes/core/ae/ main/java/org/apache/ctakes/core/cc/ main/java/org/apache/ctakes/core/cr/ main/java/org/apache/ctakes/core/util/ main/java/org/apache/ctakes/core/util/r...
Date Sun, 06 Nov 2016 19:04:52 GMT
Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/TimeoutMatcher.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/TimeoutMatcher.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/TimeoutMatcher.java
(added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/TimeoutMatcher.java
Sun Nov  6 19:04:52 2016
@@ -0,0 +1,161 @@
+package org.apache.ctakes.core.util.regex;
+
+import org.apache.log4j.Logger;
+
+import java.io.Closeable;
+import java.util.concurrent.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Class that can / should be used to find text spans using regular expressions.
+ * It runs Matcher find {@link Matcher#find()} in a separate thread so that it may be interrupted
at a set timeout.
+ * This prevents infinite loop problems that can be caused by poorly-built expressions or
unexpected text contents.
+ * The timeout can be specified in milliseconds between 100 and 10,000.  Large timeouts are
unadvised.  If a large
+ * amount of text needs to be parsed then it is better to split up the text logically and
use smaller timeouts.
+ * The default timeout is 1000 milliseconds.
+ * Extending Matcher would be better, but it is final
+ * <p>
+ * <p>
+ * Proper usage is:
+ * try ( TimeoutMatcher finder = new TimeoutMatcher( "\\s+", "Hello World !" ) ) {
+ * Matcher matcher = finder.find();
+ * while ( matcher != null ) {
+ * ...
+ * matcher = finder.find();
+ * }
+ * } catch ( IllegalArgumentException iaE ) {
+ * ...
+ * }
+ * </p>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 11/5/2016
+ */
+public class TimeoutMatcher implements Closeable {
+
+   static private final Logger LOGGER = Logger.getLogger( "TimeoutMatcher" );
+
+   static private final int DEFAULT_TIMEOUT_MILLIS = 1000;
+   static private final int MIN_TIMEOUT_MILLIS = 100;
+   static private final int MAX_TIMEOUT_MILLIS = 10000;
+
+   private final ExecutorService _executor;
+   private final int _timeoutMillis;
+   private final Matcher _matcher;
+
+
+   /**
+    * Uses the default timeout of 1000 milliseconds
+    *
+    * @param regex regular expression
+    * @param text  text to parse
+    * @throws IllegalArgumentException if the regular expression is null or malformed
+    */
+   public TimeoutMatcher( final String regex, final String text ) throws IllegalArgumentException
{
+      this( Pattern.compile( regex ), text );
+   }
+
+   /**
+    * @param regex         regular expression
+    * @param text          text to parse
+    * @param timeoutMillis milliseconds at which the regex match should abort, between 100
and 10000
+    * @throws IllegalArgumentException if the regular expression is null or malformed
+    */
+   public TimeoutMatcher( final String regex, final String text, final int timeoutMillis
)
+         throws IllegalArgumentException {
+      this( Pattern.compile( regex ), text, timeoutMillis );
+   }
+
+   /**
+    * Uses the default timeout of 1000 milliseconds
+    *
+    * @param pattern Pattern compiled from a regular expression
+    * @param text    text to parse
+    * @throws IllegalArgumentException if the pattern is null or malformed
+    */
+   public TimeoutMatcher( final Pattern pattern, final String text ) throws IllegalArgumentException
{
+      this( pattern, text, DEFAULT_TIMEOUT_MILLIS );
+   }
+
+   /**
+    * Uses the default timeout of 1000 milliseconds
+    *
+    * @param pattern       Pattern compiled from a regular expression
+    * @param text          text to parse
+    * @param timeoutMillis milliseconds at which the regex match should abort, between 100
and 10000
+    * @throws IllegalArgumentException if the pattern is null or malformed
+    */
+   public TimeoutMatcher( final Pattern pattern, final String text, final int timeoutMillis
)
+         throws IllegalArgumentException {
+      if ( pattern == null ) {
+         throw new PatternSyntaxException( "Pattern cannot be null", "", -1 );
+      }
+      if ( timeoutMillis < MIN_TIMEOUT_MILLIS || timeoutMillis > MAX_TIMEOUT_MILLIS
) {
+         throw new IllegalArgumentException( "Timeout must be between "
+                                             + MIN_TIMEOUT_MILLIS + " and " + MAX_TIMEOUT_MILLIS
);
+      }
+      _matcher = pattern.matcher( new ThreadString( text ) );
+      _timeoutMillis = timeoutMillis;
+      _executor = Executors.newSingleThreadExecutor();
+   }
+
+
+   /**
+    * @return a matcher representing the next call to {@link Matcher#find()}
+    */
+   public Matcher nextMatch() {
+      final Callable<Matcher> callable = new RegexCallable();
+      final Future<Matcher> future = _executor.submit( callable );
+      try {
+         return future.get( _timeoutMillis, TimeUnit.MILLISECONDS );
+      } catch ( InterruptedException | ExecutionException | TimeoutException multE ) {
+         LOGGER.debug( "Timeout for " + _matcher.pattern() );
+         if ( !future.cancel( true ) ) {
+            LOGGER.error( "Timed out but could not be cancelled while detecting " + _matcher.pattern()
);
+         }
+      }
+      if ( future.isCancelled() ) {
+         LOGGER.error( "Cancelled while detecting " + _matcher.pattern() );
+      } else if ( !future.isDone() ) {
+         LOGGER.error( "Not cancelled but didn't complete while detecting " + _matcher.pattern()
);
+      }
+      return null;
+   }
+
+
+   /**
+    * shut down the executor
+    * {@inheritDoc}
+    */
+   @Override
+   public void close() {
+      _executor.shutdownNow();
+   }
+
+
+   /**
+    * Simple Callable that runs a {@link Matcher} on text
+    */
+   private final class RegexCallable implements Callable<Matcher> {
+
+      private RegexCallable() {
+      }
+
+      /**
+       * {@inheritDoc}
+       *
+       * @return matcher if there is another find, else null
+       */
+      @Override
+      public Matcher call() {
+         if ( _matcher.find() ) {
+            return _matcher;
+         }
+         return null;
+      }
+   }
+
+}

Added: ctakes/trunk/ctakes-core/src/test/java/org/apache/ctakes/core/cr/FileTreeReaderTester.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/test/java/org/apache/ctakes/core/cr/FileTreeReaderTester.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/test/java/org/apache/ctakes/core/cr/FileTreeReaderTester.java
(added)
+++ ctakes/trunk/ctakes-core/src/test/java/org/apache/ctakes/core/cr/FileTreeReaderTester.java
Sun Nov  6 19:04:52 2016
@@ -0,0 +1,131 @@
+package org.apache.ctakes.core.cr;
+
+import org.apache.log4j.Logger;
+import org.junit.Test;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+
+import static org.junit.Assert.*;
+
+/**
+ * Unit tests for the static helpers of FileTreeReader that are called here:
+ * createValidExtensions, isExtensionValid and createDocumentID.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 2/10/2016
+ */
+public class FileTreeReaderTester {
+
+   static private final Logger LOGGER = Logger.getLogger( "FileTreeReaderTester" );
+
+   static private final String DOCUMENT_ID = "someFile";
+   static private final String PREFIX_SHORT = "subdir";
+   static private final String PREFIX_LONG = "parent/child";
+
+   static private final File TXT_EXTENDED = new File( "/home/subdir/someFile.txt" );
+   static private final File BSV_EXTENDED = new File( "/home/subdir/someFile.bsv" );
+   static private final File JPG_EXTENDED = new File( "/home/parent/child/someFile.jpg" );
+   static private final File NOT_EXTENDED = new File( "/home/parent/child/someFile" );
+   static private final File TXTXMI_EXTENDED = new File( "/home/parent/child/someFile.txt.xmi" );
+
+   static private final Collection<String> TXT_BSV_EXTENSIONS = Arrays.asList( ".txt", ".bsv" );
+   static private final Collection<String> TXT_XMI_EXTENSIONS = Arrays.asList( ".txt", ".xmi" );
+   static private final Collection<String> TXT_TXTXMI_EXTENSIONS = Arrays.asList( ".txt", ".txt.xmi" );
+
+
+   //
+   //    Extension collection creation
+   //
+
+   /**
+    * An empty array, "*" and ".*" are each expected to yield an empty extension collection.
+    */
+   @Test
+   public void testCreateEmptyExtensions() {
+      // JUnit order is ( message, expected, actual )
+      assertEquals( "Empty wanted extension array should create empty extension collection",
+            0, FileTreeReader.createValidExtensions( new String[ 0 ] ).size() );
+      assertEquals( "Star * wanted extension array should create empty extension collection",
+            0, FileTreeReader.createValidExtensions( "*" ).size() );
+      assertEquals( "dot Star .* wanted extension array should create empty extension collection",
+            0, FileTreeReader.createValidExtensions( ".*" ).size() );
+   }
+
+   /**
+    * A single extension given with or without its leading dot is expected to be stored with the dot.
+    */
+   @Test
+   public void testCreateSimpleExtensions() {
+      assertTrue( "txt array should produce collection containing .txt",
+            FileTreeReader.createValidExtensions( "txt" ).contains( ".txt" ) );
+      assertTrue( ".txt array should produce collection containing .txt",
+            FileTreeReader.createValidExtensions( ".txt" ).contains( ".txt" ) );
+   }
+
+   /**
+    * A two-part extension given with or without its leading dot is expected to be stored whole, with the dot.
+    */
+   @Test
+   public void testCreateComplexExtensions() {
+      assertTrue( "txt.xmi array should produce collection containing .txt.xmi",
+            FileTreeReader.createValidExtensions( "txt.xmi" ).contains( ".txt.xmi" ) );
+      assertTrue( ".txt.xmi array should produce collection containing .txt.xmi",
+            FileTreeReader.createValidExtensions( ".txt.xmi" ).contains( ".txt.xmi" ) );
+   }
+
+   /**
+    * Every extension of a multi-extension request is expected to appear in the collection.
+    */
+   @Test
+   public void testCreateMultiExtensions() {
+      assertTrue( ".txt .bsv array should produce collection containing .txt",
+            FileTreeReader.createValidExtensions( ".txt", ".bsv" ).contains( ".txt" ) );
+      assertTrue( ".txt .bsv array should produce collection containing .bsv",
+            FileTreeReader.createValidExtensions( ".txt", ".bsv" ).contains( ".bsv" ) );
+   }
+
+   //
+   //    Extension validity
+   //
+
+   /**
+    * A file without an extension is expected to be valid only when no extensions are required.
+    */
+   @Test
+   public void testNoExtension() {
+      assertTrue( "no-extension Files should be valid when extension list is empty",
+            FileTreeReader.isExtensionValid( NOT_EXTENDED, Collections.emptyList() ) );
+      assertFalse( "no-extension Files should be invalid when extension list is not empty",
+            FileTreeReader.isExtensionValid( NOT_EXTENDED, TXT_BSV_EXTENSIONS ) );
+   }
+
+   /**
+    * A file is expected to be valid when the list is empty or contains the file's extension.
+    */
+   @Test
+   public void testRightExtension() {
+      assertTrue( ".txt extension Files should be valid when extension list is empty",
+            FileTreeReader.isExtensionValid( TXT_EXTENDED, Collections.emptyList() ) );
+      assertTrue( ".txt extension Files should be valid when extension list contains .txt",
+            FileTreeReader.isExtensionValid( TXT_EXTENDED, TXT_BSV_EXTENSIONS ) );
+      assertTrue( ".bsv extension Files should be valid when extension list contains .bsv",
+            FileTreeReader.isExtensionValid( BSV_EXTENDED, TXT_BSV_EXTENSIONS ) );
+   }
+
+   /**
+    * A file is expected to be invalid when a non-empty list lacks the file's extension.
+    */
+   @Test
+   public void testWrongExtension() {
+      assertFalse( ".jpg extension Files should be invalid when extension list does not contain .jpg",
+            FileTreeReader.isExtensionValid( JPG_EXTENDED, TXT_BSV_EXTENSIONS ) );
+   }
+
+   //
+   //    Document Id
+   //
+
+   /**
+    * Checks the document id created for several file / extension-list combinations.
+    * The last case expects only the trailing .xmi to be stripped when .txt.xmi is not itself listed.
+    */
+   @Test
+   public void testCreateDocId() {
+      checkDocumentId( NOT_EXTENDED, Collections.emptyList() );
+      checkDocumentId( TXT_EXTENDED, Collections.emptyList() );
+      checkDocumentId( JPG_EXTENDED, Collections.emptyList() );
+
+      checkDocumentId( NOT_EXTENDED, TXT_BSV_EXTENSIONS );
+      checkDocumentId( TXT_EXTENDED, TXT_BSV_EXTENSIONS );
+      checkDocumentId( JPG_EXTENDED, TXT_BSV_EXTENSIONS );
+
+      checkDocumentId( TXTXMI_EXTENDED, TXT_TXTXMI_EXTENSIONS );
+
+      assertEquals( "Document ID for " + TXTXMI_EXTENDED.getPath() + " should be " + DOCUMENT_ID + ".txt with " +
+                    TXT_XMI_EXTENSIONS,
+            DOCUMENT_ID + ".txt", FileTreeReader.createDocumentID( TXTXMI_EXTENDED, TXT_XMI_EXTENSIONS ) );
+   }
+
+   /**
+    * Asserts that the document id created for the file with the given extensions equals DOCUMENT_ID.
+    *
+    * @param file       file for which a document id is created
+    * @param extensions extensions handed to FileTreeReader.createDocumentID
+    */
+   static private void checkDocumentId( final File file, final Collection<String> extensions ) {
+      assertEquals( "Document ID for " + file.getPath() + " should be " + DOCUMENT_ID + " with " + extensions,
+            DOCUMENT_ID, FileTreeReader.createDocumentID( file, extensions ) );
+   }
+
+   // TODO createDocumentIdPrefix(..)
+
+}



Mime
View raw message