ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1865052 - in /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr: FileTreeReader.java FilesInDirectoryCollectionReader.java
Date Tue, 13 Aug 2019 17:25:02 GMT
Author: seanfinan
Date: Tue Aug 13 17:25:01 2019
New Revision: 1865052

URL: http://svn.apache.org/viewvc?rev=1865052&view=rev
Log:
Deprecated FilesInDirectoryCollectionReader in favor of FileTreeReader

Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java?rev=1865052&r1=1865051&r2=1865052&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FileTreeReader.java Tue Aug 13 17:25:01 2019
@@ -28,7 +28,7 @@ import java.util.stream.Stream;
  * @since 2/10/2016
  */
 @PipeBitInfo(
-      name = "Files in Dir Tree Reader",
+      name = "File Tree Reader",
       description = "Reads document texts from text files in a directory tree.",
       role = PipeBitInfo.Role.READER,
       products = { PipeBitInfo.TypeProduct.DOCUMENT_ID, PipeBitInfo.TypeProduct.DOCUMENT_ID_PREFIX }

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java?rev=1865052&r1=1865051&r2=1865052&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cr/FilesInDirectoryCollectionReader.java Tue Aug 13 17:25:01 2019
@@ -43,6 +43,7 @@ import org.apache.ctakes.core.config.Con
 import org.apache.ctakes.core.pipeline.PipeBitInfo;
 import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.log4j.Logger;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
 import org.apache.uima.collection.CollectionException;
@@ -59,246 +60,221 @@ import java.util.List;
 
 //import org.apache.uima.jcas.tcas.DocumentAnnotation;
 
+/**
+ * @deprecated Please use FileTreeReader.
+ * FileTreeReader is more robust plus it can handle patient names, doc types, doc times, doc id prefixes, etc.
+ */
 @PipeBitInfo(
-      name = "Files in Dir Reader",
-      description = "Reads document texts from text files in a directory.",
-      role = PipeBitInfo.Role.READER,
-      products = { PipeBitInfo.TypeProduct.DOCUMENT_ID }
+		name = "Files in Dir Reader",
+		description = "Reads document texts from text files in a directory.",
+		role = PipeBitInfo.Role.READER,
+		products = { PipeBitInfo.TypeProduct.DOCUMENT_ID }
 )
 public class FilesInDirectoryCollectionReader extends CollectionReader_ImplBase
 {
-	  /**
-	   * Name of configuration parameter that contains the character encoding used
-	   * by the input files.  If not specified, the default system encoding will
-	   * be used.
-	   */
-	  public static final String PARAM_ENCODING = "Encoding";
-
-	  /**
-	   * Name of optional configuration parameter that contains the language of
-	   * the documents in the input directory.  If specified this information will
-	   * be added to the CAS.
-	   */
-	  public static final String PARAM_LANGUAGE = "Language";
-
-	  /**Name of optional configuration parameter that specifies the extensions
-	     * of the files that the collection reader will read.  Values for this
-	     * parameter should not begin with a dot <code>'.'</code>.
-	     */
-	    
-	  public static final String PARAM_EXTENSIONS = "Extensions";
-	    
-	  public static final String PARAM_RECURSE = "Recurse";
-	  
-	  protected ArrayList<File> iv_files;
-	  private String iv_encoding;
-	  private String iv_language;
-	  private static String[] iv_extensions; 
-
-      protected int iv_currentIndex;
-	  
-      private boolean iv_recurse = false;
-      
-      private String iv_rootPath = "";
-      
-	  /**
-	   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
-	   */
-	  @Override
-	public void initialize() throws ResourceInitializationException
-	{
-      final String inputDirPath = (String)getConfigParameterValue( ConfigParameterConstants.PARAM_INPUTDIR );
-      File directory;
-      try {
+	/**
+	 * Name of configuration parameter that contains the character encoding used
+	 * by the input files.  If not specified, the default system encoding will
+	 * be used.
+	 */
+	public static final String PARAM_ENCODING = "Encoding";
+
+	/**
+	 * Name of optional configuration parameter that contains the language of
+	 * the documents in the input directory.  If specified this information will
+	 * be added to the CAS.
+	 */
+	public static final String PARAM_LANGUAGE = "Language";
+
+	/**Name of optional configuration parameter that specifies the extensions
+	 * of the files that the collection reader will read.  Values for this
+	 * parameter should not begin with a dot <code>'.'</code>.
+	 */
+
+	public static final String PARAM_EXTENSIONS = "Extensions";
+
+	public static final String PARAM_RECURSE = "Recurse";
+
+	protected ArrayList<File> iv_files;
+	private String iv_encoding;
+	private String iv_language;
+	private static String[] iv_extensions;
+
+	protected int iv_currentIndex;
+
+	private boolean iv_recurse = false;
+
+	private String iv_rootPath = "";
+
+	/**
+	 * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+	 */
+	@Override
+	public void initialize() throws ResourceInitializationException {
+		Logger.getLogger( getClass().getSimpleName() ).warn( "Deprecated.  Please use FileTreeReader instead." );
+		final String inputDirPath = (String)getConfigParameterValue( ConfigParameterConstants.PARAM_INPUTDIR );
+		File directory;
+		try {
 			directory = FileLocator.getFile( inputDirPath );
 		} catch ( IOException ioE ) {
 			throw new ResourceInitializationException(
 					ResourceConfigurationException.DIRECTORY_NOT_FOUND,
-               new Object[] { ConfigParameterConstants.PARAM_INPUTDIR, getMetaData().getName(), inputDirPath } );
-      }
-      iv_encoding = (String)getConfigParameterValue( PARAM_ENCODING );
+					new Object[] { ConfigParameterConstants.PARAM_INPUTDIR, getMetaData().getName(), inputDirPath } );
+		}
+		iv_encoding = (String)getConfigParameterValue( PARAM_ENCODING );
 		iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE);
-	    iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS);
-	    
-	    iv_currentIndex = 0;
-
-	    iv_recurse = false;
-	    Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE);
-	    if(recurse != null)
-	    	iv_recurse = recurse.booleanValue();
-	    iv_rootPath = directory.getPath();
-    	
+		iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS);
+
+		iv_currentIndex = 0;
+
+		iv_recurse = false;
+		Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE);
+		if(recurse != null)
+			iv_recurse = recurse.booleanValue();
+		iv_rootPath = directory.getPath();
+
 		//if input directory does not exist or is not a directory, throw exception
-		if (!directory.exists() || !directory.isDirectory())
-		{
+		if (!directory.exists() || !directory.isDirectory()) {
 			throw new ResourceInitializationException(
-				ResourceConfigurationException.DIRECTORY_NOT_FOUND,
-               new Object[] { ConfigParameterConstants.PARAM_INPUTDIR, this.getMetaData().getName(),
-                              directory.getPath() } );
-      }
-
-
-      //get list of files (not subdirectories) in the specified directory
-	    iv_files = new ArrayList<File>();
-	    if(!iv_recurse)
-	    {
-	    	File[] files = directory.listFiles();
-	    	for (int i = 0; i < files.length; i++)
-	    	{
-	    		if (!files[i].isDirectory() && hasValidExtension(files[i]))
-	    		{
-	    			iv_files.add(files[i]);  
-	    		}
-	    	}
-	    }
-	    else
-	    {
-	    	try
-	    	{
-	    		collectFiles(directory, iv_files);
-	    		System.out.println("iv_files.size()="+iv_files.size());
-	    	}
-	    	catch(IOException ioe)
-	    	{
-	    		throw new ResourceInitializationException(ioe);
-	    	}
-	    }
-    }
-	
-    private void collectFiles(File directory, List<File> files) throws IOException
-    {
-        File[] dirFiles = directory.listFiles();
-        for(int i=0; i<dirFiles.length;i++)
-        {
-        	if(dirFiles[i].isDirectory())
-        	{
-                collectFiles(dirFiles[i], files);
-            }
-        	else if(hasValidExtension(dirFiles[i]))
-        	{
-        		files.add(dirFiles[i]);	
-        	}
-        }
-    }
-
-	
-    private boolean hasValidExtension(File file)
-    {
-	    if(iv_extensions == null) return true;
-	    for (int i = 0; i < iv_extensions.length; i++) 
-	    {
-		    if(file.getName().endsWith("."+iv_extensions[i]))
-		    {
-			    return true;
-		    }
-	    }
-	    return false;
-    }
-	 
-	
-    /**
-     * @see org.apache.uima.collection.CollectionReader#hasNext()
-     */
-    @Override
-    public boolean hasNext() {
-       return iv_currentIndex < iv_files.size();
-	}
+					ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+					new Object[] { ConfigParameterConstants.PARAM_INPUTDIR, this.getMetaData().getName(),
+										directory.getPath() } );
+		}
 
-	  /**
-	   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
-	   */
-     @Override
-     public void getNext( CAS aCAS ) throws IOException, CollectionException {
-        JCas jcas;
-	  	InputStream fileInputStream = null;
-	  	Reader fileReader = null;
-	  	
-	  	try
-	    {
-	        jcas = aCAS.getJCas();
-	   	
-	  		//open input stream to file
-          File file = iv_files.get( iv_currentIndex );
-          fileInputStream = new FileInputStream( file );
-          fileReader = new BufferedReader(new InputStreamReader(fileInputStream));
-
-	      	DocumentID documentIDAnnotation = new DocumentID(jcas);
-		    String docID = createDocID(file);
-		    documentIDAnnotation.setDocumentID(docID);
-		    documentIDAnnotation.addToIndexes();
-
-				byte[] contents = new byte[(int)file.length() ];
-				fileInputStream.read( contents );   
-				String text;
-				if (iv_encoding != null)
-				{   
-					text = new String(contents, iv_encoding);
-				}
-				else
-				{ 
-					text = new String(contents); 
+
+		//get list of files (not subdirectories) in the specified directory
+		iv_files = new ArrayList<File>();
+		if(!iv_recurse) {
+			File[] files = directory.listFiles();
+			for ( int i = 0; i < files.length; i++) {
+				if (!files[i].isDirectory() && hasValidExtension(files[i])) {
+					iv_files.add(files[i]);
 				}
-				//put document in CAS (assume CAS)
-				jcas.setDocumentText(text);
+			}
+		} else {
+			try {
+				collectFiles(directory, iv_files);
+				System.out.println("iv_files.size()="+iv_files.size());
+			} catch(IOException ioe) {
+				throw new ResourceInitializationException(ioe);
+			}
+		}
+	}
+
+	private void collectFiles(File directory, List<File> files) throws IOException {
+		File[] dirFiles = directory.listFiles();
+		for( int i=0; i<dirFiles.length; i++) {
+			if(dirFiles[i].isDirectory()) {
+				collectFiles(dirFiles[i], files);
+			} else if(hasValidExtension(dirFiles[i])) {
+				files.add(dirFiles[i]);
+			}
+		}
+	}
+
+
+	private boolean hasValidExtension(File file) {
+		if(iv_extensions == null) return true;
+		for ( int i = 0; i < iv_extensions.length; i++) {
+			if(file.getName().endsWith("."+iv_extensions[i])) {
+				return true;
+			}
+		}
+		return false;
+	}
 
-			 //set language if it was explicitly specified as a configuration parameter
-		    if (iv_language != null)
-		    {
-		//      ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
-		    }
-
-	    }		
-	    catch (CASException e)
-	    {
-	      throw new CollectionException(e);
-	    }
-	  	finally
-		{
+
+	/**
+	 * @see org.apache.uima.collection.CollectionReader#hasNext()
+	 */
+	@Override
+	public boolean hasNext() {
+		return iv_currentIndex < iv_files.size();
+	}
+
+	/**
+	 * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+	 */
+	@Override
+	public void getNext( CAS aCAS ) throws IOException, CollectionException {
+		JCas jcas;
+		InputStream fileInputStream = null;
+		Reader fileReader = null;
+
+		try {
+			jcas = aCAS.getJCas();
+
+			//open input stream to file
+			File file = iv_files.get( iv_currentIndex );
+			fileInputStream = new FileInputStream( file );
+			fileReader = new BufferedReader(new InputStreamReader(fileInputStream));
+
+			DocumentID documentIDAnnotation = new DocumentID(jcas);
+			String docID = createDocID(file);
+			documentIDAnnotation.setDocumentID(docID);
+			documentIDAnnotation.addToIndexes();
+
+			byte[] contents = new byte[(int)file.length() ];
+			fileInputStream.read( contents );
+			String text;
+			if (iv_encoding != null) {
+				text = new String(contents, iv_encoding);
+			} else {
+				text = new String(contents);
+			}
+			//put document in CAS (assume CAS)
+			jcas.setDocumentText(text);
+
+			//set language if it was explicitly specified as a configuration parameter
+			if (iv_language != null) {
+				//      ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
+			}
+
+		} catch (CASException e) {
+			throw new CollectionException(e);
+		} finally {
 			if (fileInputStream != null)
 				fileInputStream.close();
-			iv_currentIndex++;	
-		}  
-	  }
-
-	  private String createDocID(File file)
-	  {
-		    String docID = file.getPath();
-		    if(iv_rootPath.endsWith(""+File.separator) ||
-	           iv_rootPath.equals(""))
-	        {
-	            docID = docID.substring(iv_rootPath.length());
-	        }
-	        else
-	            docID = docID.substring(iv_rootPath.length()+1);
-		   return docID;
-	  }
-	  /**
-	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
-	   */
-     @Override
-     public void close() throws IOException {
-     }
-
-	  /**
-	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
-	   */
-     @Override
-     public Progress[] getProgress() {
-        return new Progress[]{
-	       new ProgressImpl(iv_currentIndex, iv_files.size(),Progress.ENTITIES)};
-	  }
-
-	  /**
-	   * Gets the total number of documents that will be returned by this
-	   * collection reader.  This is not part of the general collection reader
-	   * interface.
-	   * 
-	   * @return the number of documents in the collection
-	   */
-	  public int getNumberOfDocuments()
-	  {
-	    return iv_files.size();
-	  }
+			iv_currentIndex++;
+		}
+	}
+
+	private String createDocID(File file) {
+		String docID = file.getPath();
+		if( iv_rootPath.endsWith(""+File.separator) ||
+			 iv_rootPath.equals("")) {
+			docID = docID.substring(iv_rootPath.length());
+		} else
+			docID = docID.substring(iv_rootPath.length()+1);
+		return docID;
+	}
+
+	/**
+	 * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+	 */
+	@Override
+	public void close() throws IOException {
+	}
+
+	/**
+	 * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+	 */
+	@Override
+	public Progress[] getProgress() {
+		return new Progress[]{
+				new ProgressImpl(iv_currentIndex, iv_files.size(),Progress.ENTITIES)};
+	}
+
+	/**
+	 * Gets the total number of documents that will be returned by this
+	 * collection reader.  This is not part of the general collection reader
+	 * interface.
+	 *
+	 * @return the number of documents in the collection
+	 */
+	public int getNumberOfDocuments() {
+		return iv_files.size();
+	}
 
 
 }



Mime
View raw message