ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1768376 [1/2] - in /ctakes/trunk/ctakes-core/src: main/java/org/apache/ctakes/core/ae/ main/java/org/apache/ctakes/core/cc/ main/java/org/apache/ctakes/core/cr/ main/java/org/apache/ctakes/core/util/ main/java/org/apache/ctakes/core/util/r...
Date Sun, 06 Nov 2016 19:04:52 GMT
Author: seanfinan
Date: Sun Nov  6 19:04:52 2016
New Revision: 1768376

URL: http://svn.apache.org/viewvc?rev=1768376&view=rev
Log:
Adding RegexSectionizer, BsvRegexSectionizer
Adding Regex Timeout utilities
Adding ListAnnotator
Adding ParagraphAnnotator, ParagraphSentenceFixer
Adding FileTreeReader
Adding FileTreeXmiWriter, AbstractOutputFileWriter, XMISerializer
Adding DotLogger
Adding Pair<T>

Added:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/BsvRegexSectionizer.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ListAnnotator.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphAnnotator.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphSentenceFixer.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/AbstractOutputFileWriter.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FileTreeXmiWriter.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XMISerializer.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DotLogger.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/Pair.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/RegexSpanFinder.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/ThreadString.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/TimeoutMatcher.java
    ctakes/trunk/ctakes-core/src/test/java/org/apache/ctakes/core/cr/
    ctakes/trunk/ctakes-core/src/test/java/org/apache/ctakes/core/cr/FileTreeReaderTester.java

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/BsvRegexSectionizer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/BsvRegexSectionizer.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/BsvRegexSectionizer.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/BsvRegexSectionizer.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,102 @@
+package org.apache.ctakes.core.ae;
+
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 10/5/2016
+ */
+public class BsvRegexSectionizer extends RegexSectionizer {
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvRegexSectionizer" );
+
+
+   static public final String SECTION_TYPES_PATH = "SectionsBsv";
+   static public final String SECTION_TYPES_DESC
+         = "path to a BSV file containing a list of regular expressions and corresponding section types.";
+
+   @ConfigurationParameter(
+         name = SECTION_TYPES_PATH,
+         description = SECTION_TYPES_DESC
+   )
+   private String _sectionTypesPath;
+
+   /**
+    * Reads the BSV file named by the configured section types path and passes every line to
+    * {@link #parseBsvLine(String)}.  When no path is configured an error is logged and nothing is loaded.
+    * {@inheritDoc}
+    *
+    * @throws ResourceInitializationException wrapping any IOException raised while opening or reading the file
+    */
+   @Override
+   protected void loadSections() throws ResourceInitializationException {
+      if ( _sectionTypesPath == null ) {
+         LOGGER.error( "No " + SECTION_TYPES_DESC );
+         return;
+      }
+      LOGGER.info( "Parsing " + _sectionTypesPath );
+      try ( BufferedReader bsvReader
+                  = new BufferedReader( new InputStreamReader( FileLocator.getAsStream( _sectionTypesPath ) ) ) ) {
+         for ( String bsvLine = bsvReader.readLine(); bsvLine != null; bsvLine = bsvReader.readLine() ) {
+            parseBsvLine( bsvLine );
+         }
+      } catch ( IOException ioE ) {
+         throw new ResourceInitializationException( ioE );
+      }
+      LOGGER.info( "Finished Parsing" );
+   }
+
+   /**
+    * Parses one line of the section definition file and registers the section type it describes.
+    * Blank lines and lines whose first non-whitespace characters are "#" or "//" are ignored.
+    * A line with fewer than two columns, or with a boolean in the header-regex column,
+    * is reported with a warning and skipped.
+    *
+    * @param line double-bar separated text: NAME||HEADER_REGEX[||FOOTER_REGEX][||SHOULD_PARSE]
+    */
+   static private void parseBsvLine( final String line ) {
+      // Decide blank/comment on the trimmed text so that indented comments are not parsed as
+      // definitions and whitespace-only lines do not trigger the bad-definition warning.
+      final String trimmedLine = line.trim();
+      if ( trimmedLine.isEmpty() || trimmedLine.startsWith( "#" ) || trimmedLine.startsWith( "//" ) ) {
+         // blank or comment
+         return;
+      }
+      final String[] splits = line.split( "\\|\\|" );
+      if ( splits.length < 2 || isBoolean( splits[ 1 ] ) ) {
+         LOGGER.warn( "Bad Section definition: " + line + " ; please use one of the following:\n" +
+                      "NAME||HEADER_REGEX\n" +
+                      "NAME||HEADER_REGEX||SHOULD_PARSE(true/false)\n" +
+                      "NAME||HEADER_REGEX||FOOTER_REGEX\n" +
+                      "NAME||HEADER_REGEX||FOOTER_REGEX||SHOULD_PARSE(true/false)\n" +
+                      "The regex may contain \"(?<SECTION_NAME>regex_for_custom_section_name)\"" );
+         return;
+      }
+      // Section Name is always first
+      final String name = splits[ 0 ].trim();
+      // Should parse flag is always last if specified; anything other than "false" means true
+      final String lastColumn = splits[ splits.length - 1 ].trim();
+      final boolean shouldParse = !"false".equalsIgnoreCase( lastColumn );
+      // header regex is always the second column
+      final String headerRegex = splits[ 1 ].trim();
+      // footer regex is the third column unless that column holds the should-parse flag
+      String footerRegex = null;
+      if ( splits.length > 2 && !isBoolean( splits[ 2 ] ) ) {
+         footerRegex = splits[ 2 ].trim();
+      }
+      final RegexSectionizer.SectionType sectionType
+            = new RegexSectionizer.SectionType( name, headerRegex, footerRegex, shouldParse );
+      addSectionType( sectionType );
+   }
+
+
+   /**
+    * Builds an engine description for this sectionizer with the {@link #SECTION_TYPES_PATH} parameter set.
+    *
+    * @param sectionTypesPath value to assign to the {@link #SECTION_TYPES_PATH} configuration parameter
+    * @return description created by {@link AnalysisEngineFactory} for {@code BsvRegexSectionizer}
+    * @throws ResourceInitializationException propagated from {@link AnalysisEngineFactory}
+    */
+   static public AnalysisEngineDescription createEngineDescription( final String sectionTypesPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription( BsvRegexSectionizer.class,
+            SECTION_TYPES_PATH, sectionTypesPath );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ListAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ListAnnotator.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ListAnnotator.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ListAnnotator.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,305 @@
+package org.apache.ctakes.core.ae;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.regex.RegexSpanFinder;
+import org.apache.ctakes.typesystem.type.textspan.ListEntry;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.FSCollectionFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/26/2016
+ */
+final public class ListAnnotator extends JCasAnnotator_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "ListAnnotator" );
+
+
+   static public final String LIST_TYPES_PATH = "LIST_TYPES_PATH";
+   static private final String LIST_TYPES_DESC
+         = "path to a file containing a list of regular expressions and corresponding list types.";
+
+   /**
+    * classic ctakes default segment id
+    */
+   static private final String DEFAULT_LIST_ID = "SIMPLE_LIST";
+
+
+   @ConfigurationParameter(
+         name = LIST_TYPES_PATH,
+         description = LIST_TYPES_DESC
+   )
+   private String _listTypesPath;
+
+
+   /**
+    * Holder for a list type as defined in the user's specification bsv file: a name,
+    * a pattern that locates whole lists and a pattern that locates the separators between entries.
+    * A null regex produces a null pattern; non-null regexes are compiled with {@link Pattern#MULTILINE}.
+    */
+   static private final class ListType {
+      private final String __name;
+      private final Pattern __listPattern;
+      private final Pattern __entrySeparator;
+
+      private ListType( final String name, final String listRegex, final String entrySplitRegex ) {
+         __name = name;
+         __listPattern = compileOrNull( listRegex );
+         __entrySeparator = compileOrNull( entrySplitRegex );
+      }
+
+      /**
+       * @param regex regular expression text, possibly null
+       * @return null for a null regex, otherwise the regex compiled in multiline mode
+       */
+      static private Pattern compileOrNull( final String regex ) {
+         if ( regex == null ) {
+            return null;
+         }
+         return Pattern.compile( regex, Pattern.MULTILINE );
+      }
+   }
+
+   private final Collection<ListType> _listTypes = new HashSet<>();
+
+//   private final ExecutorService _executor = Executors.newSingleThreadExecutor();
+
+   /**
+    * Reads the list type definitions from the configured file, one definition per line.
+    * When no path is configured an error is logged and no list types are loaded.
+    * {@inheritDoc}
+    *
+    * @throws ResourceInitializationException wrapping any IOException raised while opening or reading the file
+    */
+   @Override
+   public void initialize( final UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      if ( _listTypesPath == null ) {
+         LOGGER.error( "No " + LIST_TYPES_DESC );
+         return;
+      }
+      LOGGER.info( "Parsing " + _listTypesPath );
+      try ( BufferedReader bsvReader
+                  = new BufferedReader( new InputStreamReader( FileLocator.getAsStream( _listTypesPath ) ) ) ) {
+         for ( String bsvLine = bsvReader.readLine(); bsvLine != null; bsvLine = bsvReader.readLine() ) {
+            parseBsvLine( bsvLine );
+         }
+      } catch ( IOException ioE ) {
+         throw new ResourceInitializationException( ioE );
+      }
+      LOGGER.info( "Finished Parsing" );
+   }
+
+   /**
+    * For every Segment in the cas: finds list spans in the segment text, drops overlapping spans
+    * and creates the list annotations.  Does nothing but log when no list types are defined.
+    * {@inheritDoc}
+    */
+   @Override
+   public void process( final JCas jcas ) throws AnalysisEngineProcessException {
+      LOGGER.info( "Starting processing" );
+      if ( _listTypes.isEmpty() ) {
+         LOGGER.info( "Finished processing, no list types defined" );
+         return;
+      }
+      for ( Segment section : JCasUtil.select( jcas, Segment.class ) ) {
+         final String sectionText = section.getCoveredText();
+         final Map<Pair<Integer>, ListType> uniqueSpans = getUniqueListTypes( findListTypes( sectionText ) );
+         createLists( jcas, uniqueSpans, sectionText, section.getBegin() );
+      }
+      LOGGER.info( "Finished processing" );
+   }
+
+
+   /**
+    * Applies the list pattern of every known list type to the given text.
+    *
+    * @param text text to search
+    * @return map of each matched span to the list type whose pattern matched it
+    */
+   private Map<Pair<Integer>, ListType> findListTypes( final String text ) {
+      final Map<Pair<Integer>, ListType> spanTypes = new HashMap<>();
+      for ( ListType candidate : _listTypes ) {
+         final Pattern listPattern = candidate.__listPattern;
+         if ( listPattern != null ) {
+            try ( RegexSpanFinder spanFinder = new RegexSpanFinder( listPattern ) ) {
+               for ( Pair<Integer> span : spanFinder.findSpans( text ) ) {
+                  spanTypes.put( span, candidate );
+               }
+            }
+         }
+      }
+      return spanTypes;
+   }
+
+
+   /**
+    * Get rid of list overlaps.  Spans are ordered from longest to shortest; a shorter span that starts
+    * or ends within a longer span is removed, and any part of it that sticks out past the longer span
+    * is kept as a new span.  The new span is given the list type of the longer span.
+    * The process repeats until a pass removes nothing or a single span remains.
+    * The given map is modified in place.
+    *
+    * @param listTypes map of span to list type; may be null
+    * @return the same map instance, containing only list types that don't overlap
+    */
+   static private Map<Pair<Integer>, ListType> getUniqueListTypes( final Map<Pair<Integer>, ListType> listTypes ) {
+      if ( listTypes == null || listTypes.size() <= 1 ) {
+         return listTypes;
+      }
+      final Collection<Pair<Integer>> removalTypeBounds = new HashSet<>();
+      // key: leftover span to add, value: the dominant span whose list type the leftover receives
+      final Map<Pair<Integer>, Pair<Integer>> newTypeBounds = new HashMap<>();
+      while ( true ) {
+         // longest span first
+         final List<Pair<Integer>> sortedBounds = listTypes.keySet().stream()
+               .sorted( ( p1, p2 ) -> (p2.getValue2() - p2.getValue1()) - (p1.getValue2() - p1.getValue1()) )
+               .collect( Collectors.toList() );
+         for ( int i = 0; i < sortedBounds.size() - 1; i++ ) {
+            final Pair<Integer> boundsI = sortedBounds.get( i );
+            // boundsI is larger than boundsJ, therefore dominant
+            for ( int j = i + 1; j < sortedBounds.size(); j++ ) {
+               final Pair<Integer> boundsJ = sortedBounds.get( j );
+               if ( boundsJ.getValue1() >= boundsI.getValue1() && boundsJ.getValue1() <= boundsI.getValue2() ) {
+                  // boundsJ starts inside boundsI
+                  removalTypeBounds.add( boundsJ );
+                  if ( boundsJ.getValue2() > boundsI.getValue2() ) {
+                     // keep the tail of boundsJ that lies after boundsI
+                     newTypeBounds.put( new Pair<>( boundsI.getValue2(), boundsJ.getValue2() ), boundsI );
+                  }
+               } else if ( boundsJ.getValue2() >= boundsI.getValue1() && boundsJ.getValue2() <= boundsI.getValue2() ) {
+                  // boundsJ ends inside boundsI
+                  removalTypeBounds.add( boundsJ );
+                  if ( boundsJ.getValue1() < boundsI.getValue1() ) {
+                     // keep the head of boundsJ that lies before boundsI: begin must precede end,
+                     // otherwise the span is inverted and cannot be used to take a substring
+                     newTypeBounds.put( new Pair<>( boundsJ.getValue1(), boundsI.getValue1() ), boundsI );
+                  }
+               }
+            }
+         }
+         if ( removalTypeBounds.isEmpty() ) {
+            return listTypes;
+         }
+         for ( Map.Entry<Pair<Integer>, Pair<Integer>> pairEntry : newTypeBounds.entrySet() ) {
+            listTypes.put( pairEntry.getKey(), listTypes.get( pairEntry.getValue() ) );
+         }
+         listTypes.keySet().removeAll( removalTypeBounds );
+         if ( listTypes.size() == 1 ) {
+            return listTypes;
+         }
+         newTypeBounds.clear();
+         removalTypeBounds.clear();
+      }
+   }
+
+
+   /**
+    * Collects the span of every match of the entry separator pattern within the list text.
+    *
+    * @param listText       text of a single list
+    * @param entrySeparator pattern that matches the separator between list entries
+    * @return spans (match start, match end exclusive) relative to the start of the list text
+    */
+   static private Collection<Pair<Integer>> findEntrySeparators( final String listText, final Pattern entrySeparator ) {
+      final Collection<Pair<Integer>> separatorSpans = new HashSet<>();
+      final Matcher separatorMatcher = entrySeparator.matcher( listText );
+      for ( boolean found = separatorMatcher.find(); found; found = separatorMatcher.find() ) {
+         separatorSpans.add( new Pair<>( separatorMatcher.start(), separatorMatcher.end() ) );
+      }
+      return separatorSpans;
+   }
+
+
+   static private Collection<ListEntry> findListEntries( final JCas jCas, final Pair<Integer> listBounds,
+                                                         final String listText,
+                                                         final int offset, final Pattern entrySeparator ) {
+      final Collection<Pair<Integer>> separators = findEntrySeparators( listText, entrySeparator );
+      final int listBegin = listBounds.getValue1();
+      final int listEnd = listBounds.getValue2();
+      if ( separators.isEmpty() ) {
+         // whole text is simple entry
+         final ListEntry listEntry = new ListEntry( jCas,
+               offset + listBounds.getValue1(), offset + listBounds.getValue2() );
+         listEntry.addToIndexes();
+         LOGGER.warn( "One List Entry for " + listText );
+         return Collections.singletonList( listEntry );
+      }
+      final Collection<ListEntry> listEntries = new ArrayList<>( separators.size() + 1 );
+      final List<Pair<Integer>> boundsList = new ArrayList<>( separators );
+      boundsList.sort( ( p1, p2 ) -> p1.getValue1() - p2.getValue2() );
+      Pair<Integer> leftBounds;
+      int previousEntryEnd = listBegin;
+      final int length = boundsList.size();
+      // add entries 1 -> n
+      for ( int i = 0; i < length; i++ ) {
+         leftBounds = boundsList.get( i );
+         final int entryBegin = previousEntryEnd;
+         final int entryEnd = listBegin + leftBounds.getValue2();
+         if ( entryEnd - entryBegin <= 0 ) {
+            continue;
+         }
+         final ListEntry listEntry = new ListEntry( jCas, offset + entryBegin, offset + entryEnd );
+         listEntry.addToIndexes();
+         listEntries.add( listEntry );
+         previousEntryEnd = entryEnd;
+      }
+      if ( previousEntryEnd < listEnd ) {
+         // add an entry for the end of the list
+         final ListEntry listEntry = new ListEntry( jCas, offset + previousEntryEnd, offset + listEnd );
+         listEntry.addToIndexes();
+         listEntries.add( listEntry );
+      }
+      return listEntries;
+   }
+
+
+   /**
+    * Creates a List annotation, with its ListEntry items, for every given list span.
+    * The list id is set to the name of the list type.
+    *
+    * @param jcas      -
+    * @param listTypes map of list span (relative to the given text) to the type of list found there
+    * @param text      text in which the list spans were found
+    * @param offset    offset of the given text within the document
+    */
+   static private void createLists( final JCas jcas,
+                                    final Map<Pair<Integer>, ListType> listTypes, final String text,
+                                    final int offset ) {
+      if ( listTypes == null || listTypes.isEmpty() ) {
+         return;
+      }
+      for ( Map.Entry<Pair<Integer>, ListType> spanAndType : listTypes.entrySet() ) {
+         final Pair<Integer> bounds = spanAndType.getKey();
+         final ListType type = spanAndType.getValue();
+         final int begin = bounds.getValue1();
+         final int end = bounds.getValue2();
+         final String listText = text.substring( begin, end );
+         final Collection<ListEntry> entries
+               = findListEntries( jcas, bounds, listText, offset, type.__entrySeparator );
+         final FSList items = FSCollectionFactory.createFSList( jcas, entries );
+         items.addToIndexes();
+         final org.apache.ctakes.typesystem.type.textspan.List list
+               = new org.apache.ctakes.typesystem.type.textspan.List( jcas, offset + begin, offset + end );
+         list.setId( type.__name );
+         list.setItems( items );
+         list.addToIndexes();
+      }
+   }
+
+
+   /**
+    * Parses one line of the list definition file and stores the list type it describes.
+    * Blank lines and lines whose first non-whitespace characters are "#" or "//" are ignored.
+    * A line with fewer than three columns, or with a boolean in the list-regex column,
+    * is reported with a warning and skipped.
+    *
+    * @param line double-bar separated text: NAME||LIST_REGEX||ENTRY_SEPARATOR_REGEX
+    */
+   private void parseBsvLine( final String line ) {
+      // Decide blank/comment on the trimmed text so that indented comments are not parsed as
+      // definitions and whitespace-only lines do not trigger the bad-definition warning.
+      final String trimmedLine = line.trim();
+      if ( trimmedLine.isEmpty() || trimmedLine.startsWith( "#" ) || trimmedLine.startsWith( "//" ) ) {
+         // blank or comment
+         return;
+      }
+      final String[] splits = line.split( "\\|\\|" );
+      if ( splits.length < 3 || isBoolean( splits[ 1 ] ) ) {
+         LOGGER.warn( "Bad List definition: " + line + " ; please use one of the following:\n" +
+                      "NAME||LIST_REGEX||ENTRY_SEPARATOR_REGEX" );
+         return;
+      }
+      // List Name is always first, followed by the list regex and the entry separator regex
+      final String name = splits[ 0 ].trim();
+      final String listRegex = splits[ 1 ].trim();
+      final String separatorRegex = splits[ 2 ].trim();
+      final ListType listType = new ListType( name, listRegex, separatorRegex );
+      _listTypes.add( listType );
+   }
+
+   /**
+    * @param text some text
+    * @return true if the text, ignoring surrounding whitespace and case, is "true" or "false"
+    */
+   static private boolean isBoolean( final String text ) {
+      final String candidate = text.trim().toLowerCase();
+      if ( candidate.equalsIgnoreCase( "true" ) ) {
+         return true;
+      }
+      return candidate.equalsIgnoreCase( "false" );
+   }
+
+
+   /**
+    * Builds an engine description for this annotator with the {@link #LIST_TYPES_PATH} parameter set.
+    *
+    * @param sectionTypesPath value to assign to the {@link #LIST_TYPES_PATH} configuration parameter
+    * @return description created by {@link AnalysisEngineFactory} for {@code ListAnnotator}
+    * @throws ResourceInitializationException propagated from {@link AnalysisEngineFactory}
+    */
+   static public AnalysisEngineDescription createEngineDescription( final String sectionTypesPath )
+         throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngineDescription( ListAnnotator.class,
+            LIST_TYPES_PATH, sectionTypesPath );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphAnnotator.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphAnnotator.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphAnnotator.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,198 @@
+package org.apache.ctakes.core.ae;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.regex.RegexSpanFinder;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.*;
+import java.util.regex.Pattern;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/23/2016
+ */
+final public class ParagraphAnnotator extends JCasAnnotator_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "ParagraphAnnotator" );
+
+
+   static public final String PARAGRAPH_TYPES_PATH = "PARAGRAPH_TYPES_PATH";
+   static public final String PARAGRAPH_TYPES_DESC
+         = "path to a file containing a list of regular expressions and corresponding paragraph types.";
+
+
+   @ConfigurationParameter(
+         name = PARAGRAPH_TYPES_PATH,
+         description = PARAGRAPH_TYPES_DESC,
+         mandatory = false
+   )
+   private String _paragraphTypesPath;
+
+   static private final String DEFAULT_PARAGRAPH = "Default Paragraph||(?:(?:\\r?\\n){2,})";
+
+   /**
+    * Holder for a paragraph type as defined in the user's specification bsv file:
+    * a name and the pattern that matches the separator between paragraphs.
+    * A null regex produces a null pattern; a non-null regex is compiled with {@link Pattern#MULTILINE}.
+    */
+   static private final class ParagraphType {
+      private final String __name;
+      private final Pattern __separatorPattern;
+
+      private ParagraphType( final String name, final String separatorRegex ) {
+         __name = name;
+         if ( separatorRegex == null ) {
+            __separatorPattern = null;
+         } else {
+            __separatorPattern = Pattern.compile( separatorRegex, Pattern.MULTILINE );
+         }
+      }
+   }
+
+   private final Collection<ParagraphType> _paragraphTypes = new HashSet<>();
+
+
+   /**
+    * Loads the paragraph type definitions.  Without a configured path the built-in default
+    * definition is used; otherwise every line of the configured file is parsed.
+    * {@inheritDoc}
+    *
+    * @throws ResourceInitializationException wrapping any IOException raised while opening or reading the file
+    */
+   @Override
+   public void initialize( final UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      if ( _paragraphTypesPath == null ) {
+         LOGGER.info( "No " + PARAGRAPH_TYPES_DESC );
+         LOGGER.info( "Using default paragraph separator: two newlines" );
+         parseBsvLine( DEFAULT_PARAGRAPH );
+         return;
+      }
+      LOGGER.info( "Parsing " + _paragraphTypesPath );
+      try ( BufferedReader bsvReader
+                  = new BufferedReader( new InputStreamReader( FileLocator.getAsStream( _paragraphTypesPath ) ) ) ) {
+         for ( String bsvLine = bsvReader.readLine(); bsvLine != null; bsvLine = bsvReader.readLine() ) {
+            parseBsvLine( bsvLine );
+         }
+      } catch ( IOException ioE ) {
+         throw new ResourceInitializationException( ioE );
+      }
+      LOGGER.info( "Finished Parsing" );
+   }
+
+   /**
+    * Creates Paragraph annotations for the cas.  Does nothing but log when no paragraph types are defined.
+    * {@inheritDoc}
+    */
+   @Override
+   public void process( final JCas jcas ) throws AnalysisEngineProcessException {
+      LOGGER.info( "Starting processing" );
+      if ( _paragraphTypes.isEmpty() ) {
+         // this annotator works with paragraph types, not section types
+         LOGGER.info( "Finished processing, no paragraph types defined" );
+         return;
+      }
+      createParagraphs( jcas );
+      LOGGER.info( "Finished processing" );
+   }
+
+
+   /**
+    * Collects the separator spans found by every known paragraph type that has a separator pattern.
+    *
+    * @param docText text to search
+    * @return spans of all paragraph separators found in the text
+    */
+   private Collection<Pair<Integer>> findSeparators( final String docText ) {
+      final Collection<Pair<Integer>> allSeparators = new HashSet<>();
+      for ( ParagraphType type : _paragraphTypes ) {
+         final Pattern separatorPattern = type.__separatorPattern;
+         if ( separatorPattern != null ) {
+            allSeparators.addAll( findSeparators( docText, separatorPattern ) );
+         }
+      }
+      return allSeparators;
+   }
+
+   // package protected for unit tests
+   static Collection<Pair<Integer>> findSeparators( final String docText,
+                                                    final Pattern pattern ) {
+      // the start tag of this tag is the start of the current match
+      // the end tag of this tag is the end of the current match, exclusive
+      try ( RegexSpanFinder finder = new RegexSpanFinder( pattern ) ) {
+         return finder.findSpans( docText );
+      } catch ( IllegalArgumentException iaE ) {
+         LOGGER.error( iaE.getMessage() );
+      }
+      return Collections.emptyList();
+   }
+
+
+   /**
+    * Creates Paragraph annotations within every Segment of the cas.
+    * The segment text is split at the paragraph separators: text before the first separator
+    * becomes a paragraph, as does the text between each pair of consecutive separators and the text
+    * after the last separator.  A segment without separators becomes a single paragraph.
+    * Candidate paragraphs of length one or less are skipped.
+    *
+    * @param jcas -
+    */
+   private void createParagraphs( final JCas jcas ) {
+      final Collection<Segment> sections = JCasUtil.select( jcas, Segment.class );
+      for ( Segment section : sections ) {
+         final int offset = section.getBegin();
+         final String text = section.getCoveredText();
+         final Collection<Pair<Integer>> separators = findSeparators( text );
+         if ( separators.isEmpty() ) {
+            // whole text is simple paragraph
+            final Paragraph paragraph = new Paragraph( jcas, offset, section.getEnd() );
+            paragraph.addToIndexes();
+            continue;
+         }
+         final List<Pair<Integer>> boundsList = new ArrayList<>( separators );
+         // Order separators by their start.  Comparing a start against an end (as was done before)
+         // is not a valid ordering: an element would not even compare equal to itself.
+         Collections.sort( boundsList, ( p1, p2 ) -> Integer.compare( p1.getValue1(), p2.getValue1() ) );
+         final int firstSeparatorBegin = boundsList.get( 0 ).getValue1();
+         if ( firstSeparatorBegin > 0 ) {
+            // Add unspecified generic first paragraph: the text before the first separator
+            final Paragraph paragraph = new Paragraph( jcas, offset, offset + firstSeparatorBegin );
+            paragraph.addToIndexes();
+         }
+         final int length = boundsList.size();
+         // add paragraphs 1 -> n, each starting where a separator ends
+         for ( int i = 0; i < length; i++ ) {
+            final int paragraphBegin = boundsList.get( i ).getValue2();
+            final int paragraphEnd;
+            if ( i + 1 < length ) {
+               paragraphEnd = boundsList.get( i + 1 ).getValue1();
+            } else {
+               // the last paragraph
+               paragraphEnd = text.length();
+            }
+            if ( paragraphEnd - paragraphBegin <= 1 ) {
+               // a length <= 1 means that we have one tag right after another, so the paragraph is empty
+               continue;
+            }
+            final Paragraph paragraph = new Paragraph( jcas, offset + paragraphBegin, offset + paragraphEnd );
+            paragraph.addToIndexes();
+         }
+      }
+   }
+
+
+   /**
+    * Parses one paragraph definition and stores the paragraph type it describes.
+    * Blank lines and lines whose first non-whitespace characters are "#" or "//" are ignored.
+    * A line with fewer than two columns is reported with a warning and skipped.
+    *
+    * @param line double-bar separated text: NAME||SEPARATOR_REGEX
+    */
+   private void parseBsvLine( final String line ) {
+      // Decide blank/comment on the trimmed text so that indented comments are not parsed as
+      // definitions and whitespace-only lines do not trigger the bad-definition warning.
+      final String trimmedLine = line.trim();
+      if ( trimmedLine.isEmpty() || trimmedLine.startsWith( "#" ) || trimmedLine.startsWith( "//" ) ) {
+         // blank or comment
+         return;
+      }
+      final String[] splits = line.split( "\\|\\|" );
+      if ( splits.length < 2 ) {
+         LOGGER.warn( "Bad Paragraph definition: " + line + " ; please use the following:\n" +
+                      "NAME||SEPARATOR_REGEX" );
+         return;
+      }
+      // paragraph Name is always first
+      final String name = splits[ 0 ].trim();
+      // separator regex
+      final String separatorRegex = splits[ 1 ].trim();
+      final ParagraphType paragraphType = new ParagraphType( name, separatorRegex );
+      _paragraphTypes.add( paragraphType );
+   }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphSentenceFixer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphSentenceFixer.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphSentenceFixer.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/ParagraphSentenceFixer.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,109 @@
+package org.apache.ctakes.core.ae;
+
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.function.Predicate;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 10/6/2016
+ */
+final public class ParagraphSentenceFixer extends JCasAnnotator_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "ParagraphSentenceFixer" );
+
+
+   /**
+    * Where Sentence annotations cross Paragraph boundaries, the Sentences are cut up or removed.
+    * Nothing is changed when the cas holds no Paragraph annotations.
+    * {@inheritDoc}
+    */
+   @Override
+   public void process( final JCas jcas ) throws AnalysisEngineProcessException {
+      LOGGER.info( "Starting Processing" );
+      final Collection<Paragraph> paragraphs = JCasUtil.select( jcas, Paragraph.class );
+      if ( paragraphs != null && !paragraphs.isEmpty() ) {
+         adjustParagraphSentences( jcas, paragraphs );
+      }
+      LOGGER.info( "Finished Processing" );
+   }
+
+   /**
+    * Replaces every Sentence that crosses a Paragraph boundary with shorter Sentences that do not.
+    * Such a sentence is cut at every paragraph begin and end that lies strictly inside it, so each
+    * resulting piece lies completely within or completely outside each paragraph whose boundary it crossed.
+    * Pieces that cover only whitespace are dropped.
+    * <p>
+    * Previously only the text before the first paragraph, within the first and last paragraphs when the
+    * sentence extended beyond them, and between paragraphs was kept.  A sentence that started inside one
+    * paragraph and ended inside a following paragraph therefore lost all of its text.
+    * </p>
+    *
+    * @param jCas       -
+    * @param paragraphs all paragraphs in the cas
+    */
+   static private void adjustParagraphSentences( final JCas jCas, final Collection<Paragraph> paragraphs ) {
+      final Collection<Sentence> allSentences = JCasUtil.select( jCas, Sentence.class );
+      // gather map of sentences that cross paragraph boundaries to the paragraphs that they cross
+      final Map<Sentence, Collection<Paragraph>> boundarySentences = new HashMap<>();
+      for ( Paragraph paragraph : paragraphs ) {
+         for ( Sentence sentence : allSentences ) {
+            if ( (sentence.getBegin() < paragraph.getBegin() && sentence.getEnd() > paragraph.getBegin())
+                 || (sentence.getEnd() > paragraph.getEnd() && sentence.getBegin() < paragraph.getEnd()) ) {
+               // sentence overlaps but isn't contained
+               boundarySentences.computeIfAbsent( sentence, s -> new HashSet<>() ).add( paragraph );
+            }
+         }
+      }
+      // cut up the boundary sentences at every paragraph boundary that falls inside the sentence
+      final Collection<Pair<Integer>> newBounds = new HashSet<>();
+      for ( Map.Entry<Sentence, Collection<Paragraph>> boundarySentence : boundarySentences.entrySet() ) {
+         final int sentenceBegin = boundarySentence.getKey().getBegin();
+         final int sentenceEnd = boundarySentence.getKey().getEnd();
+         // sorted, duplicate-free cut offsets strictly inside the sentence
+         final java.util.SortedSet<Integer> cuts = new java.util.TreeSet<>();
+         for ( Paragraph paragraph : boundarySentence.getValue() ) {
+            if ( paragraph.getBegin() > sentenceBegin && paragraph.getBegin() < sentenceEnd ) {
+               cuts.add( paragraph.getBegin() );
+            }
+            if ( paragraph.getEnd() > sentenceBegin && paragraph.getEnd() < sentenceEnd ) {
+               cuts.add( paragraph.getEnd() );
+            }
+         }
+         int pieceBegin = sentenceBegin;
+         for ( int cut : cuts ) {
+            newBounds.add( new Pair<>( pieceBegin, cut ) );
+            pieceBegin = cut;
+         }
+         newBounds.add( new Pair<>( pieceBegin, sentenceEnd ) );
+      }
+      // adjust the cas
+      boundarySentences.keySet().forEach( Sentence::removeFromIndexes );
+      boundarySentences.keySet().forEach( jCas::removeFsFromIndexes );
+      newBounds.stream()
+            .filter( p -> p.getValue2() - p.getValue1() > 0 )
+            .map( p -> new Sentence( jCas, p.getValue1(), p.getValue2() ) )
+            .filter( notEmpty )
+            .forEach( Sentence::addToIndexes );
+   }
+
+   static private final Pattern WHITESPACE = Pattern.compile( "\\s+" );
+   static private final Predicate<Sentence> notEmpty
+         = s -> WHITESPACE.matcher( s.getCoveredText() ).replaceAll( " " ).trim().length() > 0;
+
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/ae/RegexSectionizer.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,285 @@
+package org.apache.ctakes.core.ae;
+
+
+import org.apache.ctakes.core.util.Pair;
+import org.apache.ctakes.core.util.regex.TimeoutMatcher;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/20/2016
+ */
+abstract public class RegexSectionizer extends JCasAnnotator_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "RegexSectionizer" );
+
+   /**
+    * classic ctakes default segment id
+    */
+   static private final String DEFAULT_SEGMENT_ID = "SIMPLE_SEGMENT";
+   // Name of the regex named group that, when present and non-empty in a tag match, supplies the section name
+   static private final String SECTION_NAME_EX = "SECTION_NAME";
+
+   // Marks whether a discovered tag was matched by a header pattern or by a footer pattern
+   private enum TagType {
+      HEADER, FOOTER
+   }
+
+
+   /**
+    * Immutable holder for a section type as defined in the user's specification bsv file:
+    * a name, optional header and footer patterns, and a flag telling whether such sections should be parsed.
+    */
+   static protected final class SectionType {
+      static private final SectionType DEFAULT_TYPE = new SectionType( DEFAULT_SEGMENT_ID, null, null, true );
+      private final String __name;
+      private final Pattern __headerPattern;
+      private final Pattern __footerPattern;
+      private final boolean __shouldParse;
+
+      /**
+       * @param name        name of the section type
+       * @param headerRegex regular expression for section headers, or null if the type has no header
+       * @param footerRegex regular expression for section footers, or null if the type has no footer
+       * @param shouldParse false if sections of this type carry the "don't parse" flag
+       */
+      public SectionType( final String name, final String headerRegex, final String footerRegex,
+                          final boolean shouldParse ) {
+         __name = name;
+         __headerPattern = compileMultiline( headerRegex );
+         __footerPattern = compileMultiline( footerRegex );
+         __shouldParse = shouldParse;
+      }
+
+      /**
+       * @param regex regular expression text, possibly null
+       * @return the expression compiled with {@link Pattern#MULTILINE}, or null if there is no expression
+       */
+      static private Pattern compileMultiline( final String regex ) {
+         if ( regex == null ) {
+            return null;
+         }
+         return Pattern.compile( regex, Pattern.MULTILINE );
+      }
+   }
+
+   /**
+    * Holder for information about a section tag discovered in text:
+    * the name of the section, the name of the section type whose pattern matched,
+    * and whether the matching pattern was a header or a footer.
+    */
+   static final class SectionTag {
+      private final String __name;
+      private final String __typeName;
+      private final TagType __tagType;
+
+      /**
+       * @param name     name for the section
+       * @param typeName name of the section type
+       * @param tagType  header or footer
+       */
+      private SectionTag( final String name, final String typeName, final TagType tagType ) {
+         __name = name;
+         __typeName = typeName;
+         __tagType = tagType;
+      }
+   }
+
+   /**
+    * Normally I would put this in a singleton but I'm not sure that a singleton will work well with/as uima ae.
+    * Ids that are not registered fall back to the default section type, which is parsed.
+    *
+    * @param segmentId id of a section / segment
+    * @return false iff a section by the given id is known and was assigned the "don't parse" flag
+    */
+   static public boolean shouldParseSegment( final String segmentId ) {
+      return _sectionTypes.getOrDefault( segmentId, SectionType.DEFAULT_TYPE ).__shouldParse;
+   }
+
+
+   // ugly, and I wouldn't normally do this, but ...
+   // Static registry of section types keyed by section type name
+   static private final Map<String, SectionType> _sectionTypes = new ConcurrentHashMap<>();
+
+   /**
+    * Register a section type under its name, replacing any type previously registered with that name.
+    *
+    * @param sectionType section type to register
+    */
+   static protected void addSectionType( final SectionType sectionType ) {
+      _sectionTypes.put( sectionType.__name, sectionType );
+   }
+
+   /**
+    * @return an unmodifiable view of the registered section types, keyed by section type name
+    */
+   static public Map<String, SectionType> getSectionTypes() {
+      return Collections.unmodifiableMap( _sectionTypes );
+   }
+
+   /**
+    * Performs the standard initialization and then loads section types through {@link #loadSections()}.
+    * {@inheritDoc}
+    */
+   @Override
+   public void initialize( final UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      loadSections();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void process( final JCas jcas ) throws AnalysisEngineProcessException {
+      LOGGER.info( "Starting processing" );
+      if ( _sectionTypes.isEmpty() ) {
+         LOGGER.info( "Finished processing, no section types defined" );
+         return;
+      }
+      final String docText = jcas.getDocumentText();
+      final Map<Pair<Integer>, SectionTag> headerTags = findHeaderTags( docText );
+      if ( headerTags.isEmpty() ) {
+         LOGGER.debug( "No section headers found" );
+      }
+      final Map<Pair<Integer>, SectionTag> footerTags = findFooterTags( docText );
+      createSegments( jcas, headerTags, footerTags );
+      LOGGER.info( "Finished processing" );
+   }
+
+   /**
+    * Load Sections in a manner appropriate for the Regex Sectionizer.
+    * Called once from {@link #initialize(UimaContext)}.
+    *
+    * @throws ResourceInitializationException -
+    */
+   abstract protected void loadSections() throws ResourceInitializationException;
+
+   /**
+    * Collect the header tags of every registered section type that defines a header pattern.
+    *
+    * @param docText -
+    * @return section tags mapped to index pairs
+    */
+   static private Map<Pair<Integer>, SectionTag> findHeaderTags( final String docText ) {
+      final Map<Pair<Integer>, SectionTag> tags = new HashMap<>();
+      _sectionTypes.values().stream()
+            .filter( type -> type.__headerPattern != null )
+            .map( type -> findSectionTags( docText, type.__name, type.__headerPattern, TagType.HEADER ) )
+            .forEachOrdered( tags::putAll );
+      return tags;
+   }
+
+   /**
+    * Collect the footer tags of every registered section type that defines a footer pattern.
+    *
+    * @param docText -
+    * @return section tags mapped to index pairs
+    */
+   static private Map<Pair<Integer>, SectionTag> findFooterTags( final String docText ) {
+      final Map<Pair<Integer>, SectionTag> tags = new HashMap<>();
+      _sectionTypes.values().stream()
+            .filter( type -> type.__footerPattern != null )
+            .map( type -> findSectionTags( docText, type.__name, type.__footerPattern, TagType.FOOTER ) )
+            .forEachOrdered( tags::putAll );
+      return tags;
+   }
+
+   /**
+    * Find every match of a tag pattern in the text and describe each match as a section tag.
+    * Matches are requested from the finder until it yields null.
+    * An IllegalArgumentException raised while finding is logged and the tags found so far are returned.
+    *
+    * @param docText    -
+    * @param typeName   section type name
+    * @param tagPattern regex pattern for section type
+    * @param tagType    header or footer
+    * @return section tags mapped to index pairs
+    */
+   static Map<Pair<Integer>, SectionTag> findSectionTags( final String docText,
+                                                          final String typeName,
+                                                          final Pattern tagPattern,
+                                                          final TagType tagType ) {
+      final Map<Pair<Integer>, SectionTag> tagsBySpan = new HashMap<>();
+      try ( TimeoutMatcher finder = new TimeoutMatcher( tagPattern, docText ) ) {
+         for ( Matcher match = finder.nextMatch(); match != null; match = finder.nextMatch() ) {
+            // a tag spans from the start of the match to the end of the match, exclusive
+            final Pair<Integer> span = new Pair<>( match.start(), match.end() );
+            final String name = getSectionName( match, typeName );
+            tagsBySpan.put( span, new SectionTag( name, typeName, tagType ) );
+         }
+      } catch ( IllegalArgumentException iaE ) {
+         LOGGER.error( iaE.getMessage() );
+      }
+      return tagsBySpan;
+   }
+
+   /**
+    * @param tagMatcher matcher positioned on a tag match
+    * @param typeName   section type name, used as the fallback
+    * @return the text of the section name group, or the type name if the pattern has no such group
+    * or the group did not capture any text
+    */
+   static private String getSectionName( final Matcher tagMatcher, final String typeName ) {
+      try {
+         final String name = tagMatcher.group( SECTION_NAME_EX );
+         return name == null || name.isEmpty() ? typeName : name;
+      } catch ( IllegalArgumentException iaE ) {
+         // the pattern does not declare the section name group
+         return typeName;
+      }
+   }
+
+   /**
+    * Create a {@link Segment} for each stretch of text that lies between consecutive section tags.
+    * All tags are treated equally as segment bounds, whether header or footer.
+    * Text in front of the first tag becomes a {@link #DEFAULT_SEGMENT_ID} segment.
+    * When there are no tags at all the whole text becomes a single {@link #DEFAULT_SEGMENT_ID} segment.
+    * Stretches that are at most one character long or that contain only whitespace are skipped.
+    *
+    * @param jcas       -
+    * @param headerTags segment names are assigned based upon preceding headers
+    * @param footerTags footers reset segment names to {@link #DEFAULT_SEGMENT_ID}
+    */
+   static private void createSegments( final JCas jcas,
+                                       final Map<Pair<Integer>, SectionTag> headerTags,
+                                       final Map<Pair<Integer>, SectionTag> footerTags ) {
+      final String docText = jcas.getDocumentText();
+      final Map<Pair<Integer>, SectionTag> sectionTags = new HashMap<>( headerTags.size() + footerTags.size() );
+      sectionTags.putAll( headerTags );
+      sectionTags.putAll( footerTags );
+      if ( sectionTags.isEmpty() ) {
+         // whole text is simple segment.  The end offset is exclusive, as for every other segment
+         // created below, so it is the text length and not length - 1
+         addSegment( jcas, 0, docText.length(), DEFAULT_SEGMENT_ID, DEFAULT_SEGMENT_ID );
+         return;
+      }
+      final List<Pair<Integer>> boundsList = new ArrayList<>( sectionTags.keySet() );
+      // order tags by begin offset, then by end offset.  Comparing like with like keeps the comparator
+      // consistent (a tag compares equal to itself) and Integer.compare cannot overflow
+      boundsList.sort( ( p1, p2 ) -> {
+         final int byBegin = Integer.compare( p1.getValue1(), p2.getValue1() );
+         return byBegin != 0 ? byBegin : Integer.compare( p1.getValue2(), p2.getValue2() );
+      } );
+      final int firstTagBegin = boundsList.get( 0 ).getValue1();
+      if ( firstTagBegin > 0 && !docText.substring( 0, firstTagBegin ).trim().isEmpty() ) {
+         // Add unspecified generic first segment for the text in front of the first tag
+         addSegment( jcas, 0, firstTagBegin, DEFAULT_SEGMENT_ID, DEFAULT_SEGMENT_ID );
+      }
+      final int length = boundsList.size();
+      // add segments 1 -> n
+      for ( int i = 0; i < length; i++ ) {
+         final Pair<Integer> leftBounds = boundsList.get( i );
+         // a segment begins where its tag ends and runs to the begin of the next tag, or to the end of the text
+         final int sectionBegin = leftBounds.getValue2();
+         final int sectionEnd = i + 1 < length ? boundsList.get( i + 1 ).getValue1() : docText.length();
+         if ( sectionEnd - sectionBegin <= 1 ) {
+            // a length <= 1 means that we have one tag right after another, so the segment is empty
+            continue;
+         }
+         if ( docText.substring( sectionBegin, sectionEnd ).trim().isEmpty() ) {
+            // Section has no text, parsing would be pointless
+            continue;
+         }
+         final SectionTag leftTag = sectionTags.get( leftBounds );
+         if ( leftTag.__tagType == TagType.HEADER ) {
+            // this tag is for a header, so the following segment has a defined name
+            addSegment( jcas, sectionBegin, sectionEnd, leftTag.__typeName, leftTag.__name );
+         } else {
+            // this tag is for a footer, so the following segment is generic
+            addSegment( jcas, sectionBegin, sectionEnd, DEFAULT_SEGMENT_ID, DEFAULT_SEGMENT_ID );
+         }
+      }
+   }
+
+   /**
+    * Create a segment and add it to the cas indexes.
+    *
+    * @param jcas          -
+    * @param begin         begin offset of the segment
+    * @param end           end offset of the segment, exclusive
+    * @param id            segment id
+    * @param preferredText preferred text of the segment
+    */
+   static private void addSegment( final JCas jcas, final int begin, final int end,
+                                   final String id, final String preferredText ) {
+      final Segment segment = new Segment( jcas, begin, end );
+      segment.setId( id );
+      segment.setPreferredText( preferredText );
+      segment.addToIndexes();
+   }
+
+   /**
+    * Check whether text spells out a boolean value.
+    * Leading and trailing whitespace is ignored and the comparison is case-insensitive.
+    *
+    * @param text - may be null
+    * @return true if the trimmed text is "true" or "false" in any letter case; false otherwise, including for null
+    */
+   static protected boolean isBoolean( final String text ) {
+      if ( text == null ) {
+         return false;
+      }
+      // equalsIgnoreCase already ignores case, so no locale-dependent lower-casing is needed
+      final String trimmed = text.trim();
+      return trimmed.equalsIgnoreCase( "true" ) || trimmed.equalsIgnoreCase( "false" );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/AbstractOutputFileWriter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/AbstractOutputFileWriter.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/AbstractOutputFileWriter.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/AbstractOutputFileWriter.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,160 @@
+package org.apache.ctakes.core.cc;
+
+
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.typesystem.type.structured.DocumentIdPrefix;
+import org.apache.ctakes.typesystem.type.structured.DocumentPath;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.fit.component.CasConsumer_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collection;
+
+/**
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/16/2016
+ */
+abstract public class AbstractOutputFileWriter extends CasConsumer_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "AbstractOutputFileWriter" );
+
+
+   /**
+    * Name of configuration parameter that must be set to the path of a directory into which the
+    * output files will be written.
+    */
+   public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+   // Root output directory, injected from the PARAM_OUTPUTDIR configuration parameter
+   @ConfigurationParameter( name = PARAM_OUTPUTDIR,
+         description = "Root output directory to write files" )
+   private File _outputRootDir;
+
+
+   /**
+    * Performs the standard initialization and makes sure that the root output directory exists,
+    * creating it and any missing parents when necessary.
+    * {@inheritDoc}
+    *
+    * @throws ResourceInitializationException if the root output directory does not exist and cannot be created,
+    *                                         or if its path exists but is not a directory
+    */
+   @Override
+   public void initialize( final UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      // mkdirs() also returns false when something else created the directory in the meantime,
+      // so the directory is checked again before giving up
+      if ( !_outputRootDir.isDirectory() && !_outputRootDir.mkdirs() && !_outputRootDir.isDirectory() ) {
+         throw new ResourceInitializationException(
+               new IOException( "Could not create output directory " + _outputRootDir.getPath() ) );
+      }
+   }
+
+   /**
+    * Resolves the document id, output directory and file name for the cas and hands them to
+    * {@link #writeFile(JCas, String, String, String)}.
+    * {@inheritDoc}
+    *
+    * @throws AnalysisEngineProcessException if the jcas cannot be obtained or the file cannot be written
+    */
+   @Override
+   public void process( final CAS cas ) throws AnalysisEngineProcessException {
+      try {
+         final JCas jcas = cas.getJCas();
+         final String documentId = DocumentIDAnnotationUtil.getDocumentIdForFile( jcas );
+         final String outputDir = getOutputDirectory( jcas, _outputRootDir.getPath(), documentId );
+         final String fileName = getSourceFileName( jcas, documentId );
+         writeFile( jcas, outputDir, documentId, fileName );
+      } catch ( CASException | IOException caughtE ) {
+         throw new AnalysisEngineProcessException( caughtE );
+      }
+   }
+
+
+   /**
+    * Write information into a file named based upon the document id and located based upon the document id prefix.
+    * Called by {@link #process(CAS)} once for each cas.
+    *
+    * @param jCas       the cas being processed
+    * @param outputDir  output directory
+    * @param documentId some id for the cas document
+    * @param fileName   name for the output file
+    * @throws IOException if anything goes wrong
+    */
+   abstract public void writeFile( final JCas jCas,
+                                   final String outputDir,
+                                   final String documentId,
+                                   final String fileName ) throws IOException;
+
+
+   /**
+    * Resolve the directory for a document's output, creating the subdirectory when one applies.
+    *
+    * @param jcas       the cas being processed
+    * @param rootPath   the root path for all output subdirectories and files
+    * @param documentId some id for the cas document
+    * @return the full output path up to but not including the fileName
+    */
+   protected String getOutputDirectory( final JCas jcas, final String rootPath, final String documentId ) {
+      final String subDir = getSubdirectory( jcas, documentId );
+      final boolean hasSubDir = subDir != null && !subDir.isEmpty();
+      if ( hasSubDir ) {
+         final File directory = new File( rootPath + "/" + subDir );
+         directory.mkdirs();
+         return directory.getPath();
+      }
+      // no subdirectory information, write directly in the root
+      return rootPath;
+   }
+
+   /**
+    * @param jCas       ye olde
+    * @param documentId some id for the cas document
+    * @return a subdirectory based upon the {@link DocumentIdPrefix} stored in the cas, or none if none
+    */
+   protected String getSubdirectory( final JCas jCas, final String documentId ) {
+      String subDirectory = "";
+      final Collection<DocumentIdPrefix> prefices = JCasUtil.select( jCas, DocumentIdPrefix.class );
+      if ( prefices == null || prefices.isEmpty() ) {
+         LOGGER.debug( "No subdirectory information for " + documentId );
+         return "";
+      }
+      for ( DocumentIdPrefix prefix : prefices ) {
+         subDirectory = prefix.getDocumentIdPrefix();
+         if ( subDirectory != null && !subDirectory.isEmpty() ) {
+            return subDirectory;
+         }
+      }
+      LOGGER.debug( "No subdirectory information for " + documentId );
+      return "";
+   }
+
+   /**
+    * Use the first non-empty {@link DocumentPath} value in the cas as the source file path.
+    *
+    * @param jCas the cas being processed
+    * @return the full path to the file containing the processed text, or an empty string ("") if unknown
+    */
+   protected String getSourceFilePath( final JCas jCas ) {
+      final Collection<DocumentPath> documentPaths = JCasUtil.select( jCas, DocumentPath.class );
+      if ( documentPaths == null ) {
+         return "";
+      }
+      return documentPaths.stream()
+                          .map( DocumentPath::getDocumentPath )
+                          .filter( path -> path != null && !path.isEmpty() )
+                          .findFirst()
+                          .orElse( "" );
+   }
+
+   /**
+    * Name the output after the source file when its path is known, otherwise after the document id.
+    *
+    * @param jcas       the cas being processed
+    * @param documentId some id for the cas document
+    * @return the name of the source file, or the documentId if the source file path is unknown
+    */
+   protected String getSourceFileName( final JCas jcas, final String documentId ) {
+      final String sourcePath = getSourceFilePath( jcas );
+      final boolean pathUnknown = sourcePath == null || sourcePath.isEmpty();
+      return pathUnknown ? documentId : new File( sourcePath ).getName();
+   }
+
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FileTreeXmiWriter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FileTreeXmiWriter.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FileTreeXmiWriter.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FileTreeXmiWriter.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,57 @@
+package org.apache.ctakes.core.cc;
+
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.xml.sax.SAXException;
+
+import java.io.*;
+
+/**
+ * Write xmi files in a directory tree mimicking that of the input files
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 2/12/2016
+ */
+final public class FileTreeXmiWriter extends AbstractOutputFileWriter {
+
+   static private final Logger LOGGER = Logger.getLogger( "FileTreeXmiWriter" );
+
+   /**
+    * Serialize the cas to an xmi file named fileName + ".xmi" in the output directory.
+    *
+    * @param jCas       the cas to serialize
+    * @param outputDir  output directory
+    * @param documentId some id for the cas document, not used by this writer
+    * @param fileName   name for the output file, without the .xmi extension
+    * @throws IOException if the file cannot be written or the cas cannot be serialized
+    */
+   @Override
+   public void writeFile( final JCas jCas, final String outputDir,
+                          final String documentId, final String fileName ) throws IOException {
+      final File xmiFile = new File( outputDir, fileName + ".xmi" );
+      try {
+         writeXmi( jCas.getCas(), xmiFile );
+      } catch ( SAXException saxE ) {
+         // only the sax failure needs translating; an IOException propagates as it is
+         // instead of being wrapped in a second IOException
+         throw new IOException( saxE );
+      }
+   }
+
+   /**
+    * Serialize a CAS to a file in XMI format
+    *
+    * @param cas  CAS to serialize
+    * @param file output file
+    * @throws IOException  -
+    * @throws SAXException -
+    */
+   static private void writeXmi( final CAS cas, final File file ) throws IOException, SAXException {
+      try ( OutputStream outputStream = new BufferedOutputStream( new FileOutputStream( file ) ) ) {
+         XmiCasSerializer casSerializer = new XmiCasSerializer( cas.getTypeSystem() );
+         XMISerializer xmiSerializer = new XMISerializer( outputStream );
+         casSerializer.serialize( cas, xmiSerializer.getContentHandler() );
+      }
+   }
+
+   /**
+    * Create an engine for this writer with its output directory parameter set.
+    *
+    * @param outputDirectory value for the {@link #PARAM_OUTPUTDIR} parameter
+    * @return an analysis engine created from this writer class
+    * @throws ResourceInitializationException if the engine cannot be created
+    */
+   public static AnalysisEngine createEngine( final String outputDirectory ) throws ResourceInitializationException {
+      return AnalysisEngineFactory.createEngine( FileTreeXmiWriter.class, PARAM_OUTPUTDIR, outputDirectory );
+   }
+
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XMISerializer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XMISerializer.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XMISerializer.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XMISerializer.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,232 @@
+package org.apache.ctakes.core.cc;
+
+import org.apache.uima.UIMARuntimeException;
+import org.apache.uima.internal.util.XMLUtils;
+import org.xml.sax.*;
+
+import javax.xml.transform.OutputKeys;
+import javax.xml.transform.Result;
+import javax.xml.transform.Transformer;
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.SAXTransformerFactory;
+import javax.xml.transform.sax.TransformerHandler;
+import javax.xml.transform.stream.StreamResult;
+import java.io.OutputStream;
+import java.io.Writer;
+
+/*
+ * This class mimics UIMA's XMLSerializer, but it explicitly uses the internal xalan
+ * for xml handling, rather than any potential 3rd party such as Saxon.
+ * The code is duplicated because the UIMA setter is private.
+ * We also want to allow both to co-exist in the same environment for use cases such as
+ * FHIR, which depends on SAXON-HE.
+ */
+
+final public class XMISerializer {
+
+   private SAXTransformerFactory transformerFactory;
+   private TransformerHandler mHandler;
+   private Transformer mTransformer;
+
+   /**
+    * Create a serializer without any output.
+    * The transformer factory is requested by the class name of the jdk internal xalan implementation
+    * instead of being left to the default factory lookup.
+    *
+    * @throws UIMARuntimeException if the transformer handler cannot be created
+    */
+   public XMISerializer() {
+      try {
+         transformerFactory = (SAXTransformerFactory)SAXTransformerFactory
+               .newInstance(
+                     "com.sun.org.apache.xalan.internal.xsltc.trax.TransformerFactoryImpl",
+                     this.getClass().getClassLoader() );
+
+         mHandler = transformerFactory.newTransformerHandler();
+         mTransformer = mHandler.getTransformer();
+
+      } catch ( TransformerConfigurationException e ) {
+         throw new UIMARuntimeException( e );
+      }
+   }
+
+   /**
+    * Create a serializer that writes to the given stream.
+    *
+    * @param aOutputStream stream that receives the serialized xml
+    */
+   public XMISerializer( OutputStream aOutputStream ) {
+      this();
+      setOutputStream( aOutputStream );
+   }
+
+   private OutputStream mOutputStream;
+   private Writer mWriter;
+
+   /**
+    * Direct the output to a stream, discarding any writer that was set before.
+    *
+    * @param aOutputStream stream that receives the serialized xml
+    */
+   public void setOutputStream( OutputStream aOutputStream ) {
+      mWriter = null;
+      mOutputStream = aOutputStream;
+      mHandler.setResult( createSaxResultObject() );
+   }
+
+   /**
+    * Direct the output to a writer, discarding any output stream that was set before.
+    *
+    * @param aWriter writer that receives the serialized xml
+    */
+   public void setWriter( Writer aWriter ) {
+      mOutputStream = null;
+      mWriter = aWriter;
+      mHandler.setResult( createSaxResultObject() );
+   }
+
+   /**
+    * @return a result wrapping the output stream if one is set, else wrapping the writer if one is set, else null
+    */
+   private Result createSaxResultObject() {
+      if ( mOutputStream != null ) {
+         return new StreamResult( mOutputStream );
+      }
+      return mWriter == null ? null : new StreamResult( mWriter );
+   }
+
+
+   /**
+    * @return a handler that validates characters against the xml version of the transformer output
+    * and then passes the events on to the transformer handler.  A missing version is treated as xml 1.0
+    */
+   public ContentHandler getContentHandler() {
+      final String version = mTransformer.getOutputProperty( OutputKeys.VERSION );
+      final boolean xml11 = version != null && !"1.0".equals( version );
+      return new CharacterValidatingContentHandler( xml11, mHandler );
+   }
+
+   /**
+    * Content handler that checks attribute values and character data for characters that are not valid
+    * in the target xml version, and otherwise delegates every event to the wrapped handler.
+    */
+   static class CharacterValidatingContentHandler implements ContentHandler {
+      final ContentHandler mHandler;
+      final boolean mXml11;
+
+      /**
+       * @param xml11             true to validate against xml 1.1, false to validate against xml 1.0
+       * @param serializerHandler handler that receives the events
+       */
+      CharacterValidatingContentHandler( boolean xml11,
+                                         ContentHandler serializerHandler ) {
+         mHandler = serializerHandler;
+         mXml11 = xml11;
+      }
+
+      /**
+       * Validates every attribute value before delegating.
+       * {@inheritDoc}
+       */
+      @Override
+      public void startElement( String uri, String localName, String qName,
+                                Attributes atts ) throws SAXException {
+         for ( int i = 0; i < atts.getLength(); i++ ) {
+            checkForInvalidXmlChars( atts.getValue( i ), mXml11 );
+         }
+         mHandler.startElement( uri, localName, qName, atts );
+      }
+
+      /**
+       * Validates the characters before delegating.
+       * {@inheritDoc}
+       */
+      @Override
+      public void characters( char[] ch, int start, int length )
+            throws SAXException {
+         checkForInvalidXmlChars( ch, start, length, mXml11 );
+         mHandler.characters( ch, start, length );
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void endDocument() throws SAXException {
+         mHandler.endDocument();
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void endElement( String uri, String localName, String qName )
+            throws SAXException {
+         mHandler.endElement( uri, localName, qName );
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void endPrefixMapping( String prefix ) throws SAXException {
+         mHandler.endPrefixMapping( prefix );
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void ignorableWhitespace( char[] ch, int start, int length )
+            throws SAXException {
+         mHandler.ignorableWhitespace( ch, start, length );
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void processingInstruction( String target, String data )
+            throws SAXException {
+         mHandler.processingInstruction( target, data );
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void setDocumentLocator( Locator locator ) {
+         mHandler.setDocumentLocator( locator );
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void skippedEntity( String name ) throws SAXException {
+         mHandler.skippedEntity( name );
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void startDocument() throws SAXException {
+         mHandler.startDocument();
+      }
+
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public void startPrefixMapping( String prefix, String uri )
+            throws SAXException {
+         mHandler.startPrefixMapping( prefix, uri );
+      }
+
+      /**
+       * @param s     text to check
+       * @param xml11 true to check against xml 1.1, false to check against xml 1.0
+       * @throws SAXParseException naming the first character that is not valid in the xml version
+       */
+      static private void checkForInvalidXmlChars( String s, boolean xml11 )
+            throws SAXParseException {
+         final int index = XMLUtils.checkForNonXmlCharacters( s, xml11 );
+         if ( index >= 0 ) {
+            throw new SAXParseException( "Trying to serialize non-XML "
+                                         + (xml11 ? "1.1" : "1.0") + " character: "
+                                         + s.charAt( index ) + ", 0x"
+                                         + Integer.toHexString( s.charAt( index ) ), null );
+         }
+      }
+
+      /**
+       * @param ch     characters to check
+       * @param start  index of the first character to check
+       * @param length number of characters to check
+       * @param xml11  true to check against xml 1.1, false to check against xml 1.0
+       * @throws SAXParseException naming the first character that is not valid in the xml version
+       */
+      static private void checkForInvalidXmlChars( char[] ch, int start,
+                                                   int length, boolean xml11 ) throws SAXParseException {
+         final int index = XMLUtils.checkForNonXmlCharacters( ch, start,
+               length, xml11 );
+         if ( index >= 0 ) {
+            throw new SAXParseException( "Trying to serialize non-XML "
+                                         + (xml11 ? "1.1" : "1.0") + " character: " + ch[ index ]
+                                         + ", 0x" + Integer.toHexString( ch[ index ] ), null );
+         }
+      }
+   }
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,283 @@
+package org.apache.ctakes.core.cr;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.ctakes.typesystem.type.structured.DocumentIdPrefix;
+import org.apache.ctakes.typesystem.type.structured.DocumentPath;
+import org.apache.log4j.Logger;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.fit.factory.CollectionReaderFactory;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+
+/**
+ * Recursively reads a directory tree of files.  The files of a directory are listed before the files of its
+ * subdirectories (root first).  For each file the DocumentID is created from the file name, the
+ * DocumentIdPrefix from the subdirectory path between the root and the leaf file, and the DocumentPath
+ * from the absolute path of the file.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 2/10/2016
+ */
+final public class FileTreeReader extends CollectionReader_ImplBase {
+
+   static private final Logger LOGGER = Logger.getLogger( "FileTreeReader" );
+
+   /**
+    * Name of configuration parameter that must be set to the path of
+    * a directory containing input files.
+    */
+   public static final String PARAM_INPUTDIR = "InputDirectory";
+
+   /**
+    * Name of configuration parameter that contains the character encoding used
+    * by the input files.  If not specified, the default system encoding will
+    * be used.
+    */
+   public static final String PARAM_ENCODING = "Encoding";
+
+   /**
+    * Name of optional configuration parameter that specifies the extensions
+    * of the files that the collection reader will read.  Values for this
+    * parameter should not begin with a dot <code>'.'</code>.
+    */
+   public static final String PARAM_EXTENSIONS = "Extensions";
+
+   // Size, in characters, of the buffer used while reading a file.
+   static private final int READ_BUFFER_SIZE = 8192;
+
+   private List<File> _files;
+   private String _encoding;
+   private Collection<String> _validExtensions;
+   private File _rootDir;
+   private int _currentIndex;
+
+   /**
+    * Locates the input directory, reads the encoding and extension parameters and collects every file
+    * that will be served by {@link #getNext(CAS)}.
+    * {@inheritDoc}
+    *
+    * @throws ResourceInitializationException if the input directory cannot be located
+    */
+   @Override
+   public void initialize() throws ResourceInitializationException {
+      try {
+         _rootDir = FileLocator.locateFile( (String)getConfigParameterValue( PARAM_INPUTDIR ) );
+      } catch ( FileNotFoundException fnfE ) {
+         throw new ResourceInitializationException( fnfE );
+      }
+      _encoding = (String)getConfigParameterValue( PARAM_ENCODING );
+      final String[] explicitExtensions = (String[])getConfigParameterValue( PARAM_EXTENSIONS );
+      _validExtensions = createValidExtensions( explicitExtensions );
+
+      _currentIndex = 0;
+      _files = getDescendentFiles( _rootDir, _validExtensions );
+   }
+
+   /**
+    * @param explicitExtensions array of file extensions as specified in the uima parameters
+    * @return a collection of dot-prefixed extensions, or an empty collection if {@code explicitExtensions}
+    * is null, empty, or consists of the single wildcard {@code "*"} or {@code ".*"}
+    */
+   static Collection<String> createValidExtensions( final String... explicitExtensions ) {
+      if ( explicitExtensions == null || explicitExtensions.length == 0 ) {
+         return Collections.emptyList();
+      }
+      if ( explicitExtensions.length == 1
+           && (explicitExtensions[ 0 ].equals( "*" ) || explicitExtensions[ 0 ].equals( ".*" )) ) {
+         return Collections.emptyList();
+      }
+      final Collection<String> validExtensions = new ArrayList<>( explicitExtensions.length );
+      for ( String extension : explicitExtensions ) {
+         if ( extension.startsWith( "." ) ) {
+            validExtensions.add( extension );
+         } else {
+            validExtensions.add( '.' + extension );
+         }
+      }
+      return validExtensions;
+   }
+
+   /**
+    * @param parentDir       directory in which the search starts
+    * @param validExtensions collection of valid extensions or empty collection if all extensions are valid
+    * @return List of non-hidden files descending from the parent directory; files directly in a directory
+    * precede the files of its subdirectories
+    */
+   static private List<File> getDescendentFiles( final File parentDir, final Collection<String> validExtensions ) {
+      final File[] children = parentDir.listFiles();
+      if ( children == null || children.length == 0 ) {
+         return Collections.emptyList();
+      }
+      final Collection<File> childDirs = new ArrayList<>();
+      final List<File> descendentFiles = new ArrayList<>();
+      for ( File child : children ) {
+         if ( child.isDirectory() ) {
+            // Subdirectories are deferred so that the files of this level are listed first.
+            childDirs.add( child );
+            continue;
+         }
+         if ( isExtensionValid( child, validExtensions ) && !child.isHidden() ) {
+            descendentFiles.add( child );
+         }
+      }
+      for ( File childDir : childDirs ) {
+         descendentFiles.addAll( getDescendentFiles( childDir, validExtensions ) );
+      }
+      return descendentFiles;
+   }
+
+   /**
+    * @param file            file whose name is checked
+    * @param validExtensions dot-prefixed extensions, or an empty collection to accept every file
+    * @return true if validExtensions is empty or contains an extension belonging to the given file.
+    * A file whose whole name is the first matching extension is discarded with a warning
+    */
+   static boolean isExtensionValid( final File file, final Collection<String> validExtensions ) {
+      if ( validExtensions.isEmpty() ) {
+         return true;
+      }
+      final String fileName = file.getName();
+      for ( String extension : validExtensions ) {
+         if ( fileName.endsWith( extension ) ) {
+            if ( fileName.equals( extension ) ) {
+               LOGGER.warn( "File " + file.getPath() + " is named as extension " + extension + " ; discarded" );
+               return false;
+            }
+            return true;
+         }
+      }
+      return false;
+   }
+
+   /**
+    * @param file            file for which an id is wanted
+    * @param validExtensions dot-prefixed extensions, possibly empty
+    * @return the file name with the longest valid extension removed.  If no valid extension matches then
+    * everything from the last dot onward is removed, and a name without any dot is returned unchanged
+    */
+   static String createDocumentID( final File file, final Collection<String> validExtensions ) {
+      final String fileName = file.getName();
+      String maxExtension = "";
+      for ( String extension : validExtensions ) {
+         if ( fileName.endsWith( extension ) && extension.length() > maxExtension.length() ) {
+            maxExtension = extension;
+         }
+      }
+      int lastDot = fileName.lastIndexOf( '.' );
+      if ( !maxExtension.isEmpty() ) {
+         lastDot = fileName.length() - maxExtension.length();
+      }
+      if ( lastDot < 0 ) {
+         return fileName;
+      }
+      return fileName.substring( 0, lastDot );
+   }
+
+   /**
+    * @param file    file below the root directory
+    * @param rootDir root directory of the tree
+    * @return the subdirectory path between the root directory and the file, or an empty string if the
+    * file sits directly in the root directory or is not below the root directory at all
+    */
+   static private String createDocumentIdPrefix( final File file, final File rootDir ) {
+      final String parentPath = file.getParent();
+      final String rootPath = rootDir.getPath();
+      if ( parentPath == null || parentPath.equals( rootPath ) || !parentPath.startsWith( rootPath ) ) {
+         return "";
+      }
+      final String relativePath = parentPath.substring( rootPath.length() );
+      if ( rootPath.endsWith( File.separator ) ) {
+         // A root such as "/" already ends with the separator, so nothing more must be skipped.
+         return relativePath;
+      }
+      if ( !relativePath.startsWith( File.separator ) ) {
+         // Sibling directory that merely shares a name prefix with the root, e.g. "in" and "input".
+         return "";
+      }
+      return relativePath.substring( File.separator.length() );
+   }
+
+   /**
+    * Reads the whole file as text.  The bytes are decoded by a single reader so that a multi-byte
+    * character that straddles two buffer fills is still decoded correctly.
+    *
+    * @param file     file to read
+    * @param encoding name of the character encoding, or null to use the default system encoding
+    * @return the text content of the file
+    * @throws IOException if the file cannot be read or the encoding is not supported
+    */
+   static private String readFileText( final File file, final String encoding ) throws IOException {
+      final char[] buffer = new char[ READ_BUFFER_SIZE ];
+      final StringBuilder sb = new StringBuilder();
+      // The stream is declared on its own so that it is closed even if the reader cannot be created.
+      try ( final InputStream inputStream = new FileInputStream( file );
+            final Reader reader = encoding != null
+                                  ? new InputStreamReader( inputStream, encoding )
+                                  : new InputStreamReader( inputStream ) ) {
+         int length = reader.read( buffer );
+         while ( length >= 0 ) {
+            sb.append( buffer, 0, length );
+            length = reader.read( buffer );
+         }
+      }
+      return sb.toString();
+   }
+
+   /**
+    * Gets the total number of documents that will be returned by this
+    * collection reader.  This is not part of the general collection reader
+    * interface.
+    *
+    * @return the number of documents in the collection
+    */
+   public int getNumberOfDocuments() {
+      return _files.size();
+   }
+
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public boolean hasNext() {
+      return _currentIndex < _files.size();
+   }
+
+   /**
+    * Fills the cas with the text of the next file and with its DocumentID, DocumentIdPrefix and
+    * DocumentPath annotations.  The reader always advances to the next file, even on failure.
+    * {@inheritDoc}
+    */
+   @Override
+   public void getNext( final CAS cas ) throws IOException, CollectionException {
+      JCas jcas;
+      try {
+         jcas = cas.getJCas();
+      } catch ( CASException casE ) {
+         _currentIndex++;
+         throw new IOException( casE );
+      }
+      final File file = _files.get( _currentIndex );
+      _currentIndex++;
+      // put document text and id annotations in CAS (assume CAS)
+      jcas.setDocumentText( readFileText( file, _encoding ) );
+      final DocumentID documentId = new DocumentID( jcas );
+      final String id = createDocumentID( file, _validExtensions );
+      documentId.setDocumentID( id );
+      documentId.addToIndexes();
+      final DocumentIdPrefix documentIdPrefix = new DocumentIdPrefix( jcas );
+      final String idPrefix = createDocumentIdPrefix( file, _rootDir );
+      documentIdPrefix.setDocumentIdPrefix( idPrefix );
+      documentIdPrefix.addToIndexes();
+      final DocumentPath documentPath = new DocumentPath( jcas );
+      documentPath.setDocumentPath( file.getAbsolutePath() );
+      documentPath.addToIndexes();
+   }
+
+   /**
+    * Nothing is held open between calls, so there is nothing to release.
+    * {@inheritDoc}
+    */
+   @Override
+   public void close() throws IOException {
+   }
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return a single progress entry counting the files already served against the total number of files
+    */
+   @Override
+   public Progress[] getProgress() {
+      return new Progress[] {
+            new ProgressImpl( _currentIndex, _files.size(), Progress.ENTITIES )
+      };
+   }
+
+
+   /**
+    * Convenience factory for a reader that only needs an input directory.
+    *
+    * @param inputDirectory path of the root directory, set as the {@link #PARAM_INPUTDIR} parameter
+    * @return a FileTreeReader created by the uimafit CollectionReaderFactory
+    * @throws ResourceInitializationException if the reader cannot be created
+    */
+   public static CollectionReader createReader( final String inputDirectory ) throws ResourceInitializationException {
+      // Use this class' own parameter name: it is the one that initialize() reads.
+      return CollectionReaderFactory.createReader( FileTreeReader.class,
+            PARAM_INPUTDIR,
+            inputDirectory );
+   }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DotLogger.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DotLogger.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DotLogger.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DotLogger.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,52 @@
+package org.apache.ctakes.core.util;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Timer;
+import java.util.TimerTask;
+
+/**
+ * Dot Logger usable in try-with-resources blocks.
+ * While open it logs a dot three times per second to the "ProgressAppender" logger, and after every
+ * thirty dots it logs the approximate number of elapsed seconds to the "ProgressDone" logger.
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 3/18/2016
+ */
+final public class DotLogger implements Closeable {
+
+   static private final org.apache.log4j.Logger DOT_LOGGER = org.apache.log4j.Logger.getLogger( "ProgressAppender" );
+   static private final org.apache.log4j.Logger EOL_LOGGER = org.apache.log4j.Logger.getLogger( "ProgressDone" );
+
+   // Delay before the first dot and between consecutive dots.
+   static private final long DOT_PERIOD_MILLIS = 333;
+   // Number of dots logged per second at the period above; used to turn a dot count into seconds.
+   static private final int DOTS_PER_SECOND = 3;
+   // Number of dots after which the elapsed seconds are logged.
+   static private final int DOTS_PER_LINE = 30;
+
+   private final Timer _timer;
+
+   /**
+    * Starts the Dot Logging.
+    * The timer thread is a daemon so that a DotLogger that is never closed cannot keep the JVM alive.
+    */
+   public DotLogger() {
+      _timer = new Timer( "DotLogger", true );
+      _timer.scheduleAtFixedRate( new DotPlotter(), DOT_PERIOD_MILLIS, DOT_PERIOD_MILLIS );
+   }
+
+   /**
+    * Stops the Dot Logging and logs an empty message to the "ProgressDone" logger.
+    * {@inheritDoc}
+    */
+   @Override
+   public void close() throws IOException {
+      _timer.cancel();
+      // NOTE(review): error level is presumably used so that the final message is never filtered - confirm.
+      EOL_LOGGER.error( "" );
+   }
+
+   /**
+    * Task that logs one dot per run and the elapsed seconds after every full line of dots.
+    */
+   static private class DotPlotter extends TimerTask {
+      private int _count = 0;
+
+      @Override
+      public void run() {
+         DOT_LOGGER.info( "." );
+         _count++;
+         if ( _count % DOTS_PER_LINE == 0 ) {
+            EOL_LOGGER.info( " " + (_count / DOTS_PER_SECOND) );
+         }
+      }
+   }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/Pair.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/Pair.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/Pair.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/Pair.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,67 @@
+package org.apache.ctakes.core.util;
+
+
+import javax.annotation.Nonnull;
+import javax.annotation.concurrent.Immutable;
+
+/**
+ * Simple immutable holder of two non-null values of the same type.
+ * Why oh why is there not a simple immutable class representing a pair of values in the jdk ?
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 7/22/2016
+ */
+@Immutable
+final public class Pair<T> {
+
+   private final T _value1;
+   private final T _value2;
+
+   /**
+    * @param value1 not null
+    * @param value2 not null
+    * @throws NullPointerException if either value is null
+    */
+   public Pair( @Nonnull final T value1, @Nonnull final T value2 ) {
+      final boolean hasNull = value1 == null || value2 == null;
+      if ( hasNull ) {
+         throw new NullPointerException( "Cannot pass null value to Pair: " + value1 + "," + value2 );
+      }
+      _value1 = value1;
+      _value2 = value2;
+   }
+
+   /**
+    * @return the first value in the Pair
+    */
+   public T getValue1() {
+      return _value1;
+   }
+
+   /**
+    * @return the second value in the Pair
+    */
+   public T getValue2() {
+      return _value2;
+   }
+
+   /**
+    * @return the text forms of the first and second values, separated by a comma
+    */
+   @Override
+   public String toString() {
+      // Neither value can be null, so plain concatenation yields the same text as explicit toString() calls.
+      return _value1 + "," + _value2;
+   }
+
+   /**
+    * @return the hash code of the first value plus thirteen times the hash code of the second value
+    */
+   @Override
+   public int hashCode() {
+      final int hash1 = _value1.hashCode();
+      final int hash2 = _value2.hashCode();
+      return hash1 + 13 * hash2;
+   }
+
+   /**
+    * @param other -
+    * @return true iff the other object is a Pair and its values equal this Pair's values
+    */
+   @Override
+   public boolean equals( final Object other ) {
+      if ( !(other instanceof Pair) ) {
+         return false;
+      }
+      final Pair<?> that = (Pair<?>)other;
+      return that._value1.equals( _value1 ) && that._value2.equals( _value2 );
+   }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/RegexSpanFinder.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/RegexSpanFinder.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/RegexSpanFinder.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/RegexSpanFinder.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,165 @@
+package org.apache.ctakes.core.util.regex;
+
+
+import org.apache.ctakes.core.util.Pair;
+import org.apache.log4j.Logger;
+
+import java.io.Closeable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ * Class that can / should be used to find text spans using regular expressions.
+ * It runs {@link Matcher#find()} in a separate thread so that it may be interrupted at a set timeout.
+ * This prevents infinite loop problems that can be caused by poorly-built expressions or unexpected text contents.
+ * The timeout can be specified in milliseconds between 100 and 10,000.  Large timeouts are unadvised.  If a large
+ * amount of text needs to be parsed then it is better to split up the text logically and use smaller timeouts.
+ * The default timeout is 1000 milliseconds.
+ * <p>
+ * Proper usage is:
+ * </p>
+ * <pre>{@code
+ * try ( RegexSpanFinder finder = new RegexSpanFinder( "\\s+" ) ) {
+ *    final List<Pair<Integer>> spans = finder.findSpans( "Hello World !" );
+ *    ...
+ * } catch ( IllegalArgumentException iaE ) {
+ *    ...
+ * }
+ * }</pre>
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 11/5/2016
+ */
+final public class RegexSpanFinder implements Closeable {
+
+   static private final Logger LOGGER = Logger.getLogger( "RegexSpanFinder" );
+
+   static private final int DEFAULT_TIMEOUT_MILLIS = 1000;
+   static private final int MIN_TIMEOUT_MILLIS = 100;
+   static private final int MAX_TIMEOUT_MILLIS = 10000;
+
+   private final ExecutorService _executor;
+   private final Pattern _pattern;
+   private final int _timeoutMillis;
+
+   /**
+    * Uses the default timeout of 1000 milliseconds
+    *
+    * @param regex regular expression
+    * @throws IllegalArgumentException if the regular expression is null or malformed
+    */
+   public RegexSpanFinder( final String regex ) throws IllegalArgumentException {
+      this( compile( regex ) );
+   }
+
+   /**
+    * @param regex         regular expression
+    * @param timeoutMillis milliseconds at which the regex match should abort, between 100 and 10000
+    * @throws IllegalArgumentException if the regular expression is null or malformed,
+    *                                  or if the timeout is out of range
+    */
+   public RegexSpanFinder( final String regex, final int timeoutMillis ) throws IllegalArgumentException {
+      this( compile( regex ), timeoutMillis );
+   }
+
+   /**
+    * Uses the default timeout of 1000 milliseconds
+    *
+    * @param pattern Pattern compiled from a regular expression
+    * @throws IllegalArgumentException if the pattern is null
+    */
+   public RegexSpanFinder( final Pattern pattern ) throws IllegalArgumentException {
+      this( pattern, DEFAULT_TIMEOUT_MILLIS );
+   }
+
+   /**
+    * @param pattern       Pattern compiled from a regular expression
+    * @param timeoutMillis milliseconds at which the regex match should abort, between 100 and 10000
+    * @throws IllegalArgumentException if the pattern is null or if the timeout is out of range
+    */
+   public RegexSpanFinder( final Pattern pattern, final int timeoutMillis ) throws IllegalArgumentException {
+      if ( pattern == null ) {
+         throw new PatternSyntaxException( "Pattern cannot be null", "", -1 );
+      }
+      if ( timeoutMillis < MIN_TIMEOUT_MILLIS || timeoutMillis > MAX_TIMEOUT_MILLIS ) {
+         throw new IllegalArgumentException( "Timeout must be between "
+                                             + MIN_TIMEOUT_MILLIS + " and " + MAX_TIMEOUT_MILLIS );
+      }
+      _pattern = pattern;
+      _timeoutMillis = timeoutMillis;
+      _executor = Executors.newSingleThreadExecutor();
+   }
+
+   /**
+    * Compiles a regular expression, reporting a null expression the same way as a null pattern so that
+    * callers only ever need to handle IllegalArgumentException.
+    *
+    * @param regex regular expression
+    * @return the compiled pattern
+    * @throws PatternSyntaxException if the regular expression is null or malformed
+    */
+   static private Pattern compile( final String regex ) {
+      if ( regex == null ) {
+         throw new PatternSyntaxException( "Regular expression cannot be null", "", -1 );
+      }
+      return Pattern.compile( regex );
+   }
+
+
+   /**
+    * Runs the find in the executor thread and waits for it at most the configured timeout.
+    *
+    * @param text text in which a find should be conducted
+    * @return List of Integer Pairs representing text span begin and end offsets, or an empty list if the
+    * text is null or empty, or if the find timed out, was interrupted or failed
+    */
+   public List<Pair<Integer>> findSpans( final String text ) {
+      if ( text == null || text.isEmpty() ) {
+         return Collections.emptyList();
+      }
+      // ThreadString makes the matcher notice an interruption of the worker thread.
+      final Callable<List<Pair<Integer>>> callable = new RegexCallable( new ThreadString( text ), _pattern );
+      final Future<List<Pair<Integer>>> future = _executor.submit( callable );
+      try {
+         return future.get( _timeoutMillis, TimeUnit.MILLISECONDS );
+      } catch ( TimeoutException timeE ) {
+         LOGGER.debug( "Timeout for " + _pattern );
+         cancelFind( future, "Timed out" );
+      } catch ( InterruptedException intE ) {
+         // Restore the flag so that the code calling this method can also react to the interruption.
+         Thread.currentThread().interrupt();
+         LOGGER.debug( "Interrupted for " + _pattern );
+         cancelFind( future, "Interrupted" );
+      } catch ( ExecutionException exE ) {
+         // The find itself failed; the task is already done so there is nothing to cancel.
+         LOGGER.error( "Failed while detecting " + _pattern, exE.getCause() );
+      }
+      return Collections.emptyList();
+   }
+
+   /**
+    * Cancels a find that did not deliver its result and logs the outcome of the cancellation.
+    *
+    * @param future handle of the running find
+    * @param reason why the find is cancelled, used as the start of the log message
+    */
+   private void cancelFind( final Future<?> future, final String reason ) {
+      if ( !future.cancel( true ) ) {
+         LOGGER.error( reason + " but could not be cancelled while detecting " + _pattern );
+      }
+      if ( future.isCancelled() ) {
+         LOGGER.error( "Cancelled while detecting " + _pattern );
+      } else if ( !future.isDone() ) {
+         LOGGER.error( "Not cancelled but didn't complete while detecting " + _pattern );
+      }
+   }
+
+   /**
+    * shut down the executor
+    * {@inheritDoc}
+    */
+   @Override
+   public void close() {
+      _executor.shutdownNow();
+   }
+
+
+   /**
+    * Simple Callable that runs a {@link Matcher} on text to find text span begin and end offsets
+    */
+   static private final class RegexCallable implements Callable<List<Pair<Integer>>> {
+      final private CharSequence __text;
+      final private Pattern __pattern;
+
+      private RegexCallable( final CharSequence text, final Pattern pattern ) {
+         __text = text;
+         __pattern = pattern;
+      }
+
+      /**
+       * Collects the bounds of every match until no further match is found or the thread is interrupted.
+       * {@inheritDoc}
+       *
+       * @return text span begin and end offsets
+       */
+      @Override
+      public List<Pair<Integer>> call() {
+         final List<Pair<Integer>> listBounds = new ArrayList<>();
+         final Matcher matcher = __pattern.matcher( __text );
+         while ( matcher.find() && !Thread.currentThread().isInterrupted() ) {
+            final Pair<Integer> bounds = new Pair<>( matcher.start(), matcher.end() );
+            listBounds.add( bounds );
+         }
+         return listBounds;
+      }
+   }
+
+}

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/ThreadString.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/ThreadString.java?rev=1768376&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/ThreadString.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/regex/ThreadString.java Sun Nov  6 19:04:52 2016
@@ -0,0 +1,36 @@
+package org.apache.ctakes.core.util.regex;
+
+/**
+ * A character sequence wrapper that checks the thread reading it for interruptions.
+ * Every charAt(..) call first tests the interrupt status of the calling thread, which allows a break
+ * within tight charAt(..) calling loops that can otherwise become infinite in a corrupt find.
+ */
+final class ThreadString implements CharSequence {
+   private final CharSequence _text;
+
+   /**
+    * @param delegate character sequence to which all calls are forwarded
+    */
+   ThreadString( final CharSequence delegate ) {
+      _text = delegate;
+   }
+
+   /**
+    * @param index index of the wanted character
+    * @return the character of the wrapped sequence at the given index
+    * @throws RuntimeException wrapping an InterruptedException if the calling thread is interrupted
+    */
+   @Override
+   public char charAt( final int index ) {
+      failIfInterrupted();
+      return _text.charAt( index );
+   }
+
+   /**
+    * @return the length of the wrapped sequence
+    */
+   @Override
+   public int length() {
+      return _text.length();
+   }
+
+   /**
+    * @param start begin index, inclusive
+    * @param end   end index, exclusive
+    * @return the requested part of the wrapped sequence, itself wrapped in a ThreadString
+    */
+   @Override
+   public CharSequence subSequence( final int start, final int end ) {
+      final CharSequence slice = _text.subSequence( start, end );
+      return new ThreadString( slice );
+   }
+
+   /**
+    * @return the text form of the wrapped sequence
+    */
+   @Override
+   public String toString() {
+      return _text.toString();
+   }
+
+   /**
+    * Throws if the calling thread is interrupted; the interrupt status itself is left untouched.
+    */
+   static private void failIfInterrupted() {
+      if ( Thread.currentThread().isInterrupted() ) {
+         throw new RuntimeException( new InterruptedException() );
+      }
+   }
+}



Mime
View raw message