ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c...@apache.org
Subject svn commit: r1799255 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: nn/data/ArgContextProvider.java utils/TokenPreprocForWord2Vec.java
Date Mon, 19 Jun 2017 18:35:29 GMT
Author: clin
Date: Mon Jun 19 18:35:29 2017
New Revision: 1799255

URL: http://svn.apache.org/viewvc?rev=1799255&view=rev
Log:
For token sequence output, completely tokenize arguments, including events and time expressions,
using Dima's Token processing logic 

Added:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TokenPreprocForWord2Vec.java
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/data/ArgContextProvider.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/data/ArgContextProvider.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/data/ArgContextProvider.java?rev=1799255&r1=1799254&r2=1799255&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/data/ArgContextProvider.java
(original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/nn/data/ArgContextProvider.java
Mon Jun 19 18:35:29 2017
@@ -3,7 +3,11 @@ package org.apache.ctakes.temporal.nn.da
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.ctakes.temporal.utils.TokenPreprocForWord2Vec;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.TimeMention;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
@@ -214,27 +218,47 @@ public class ArgContextProvider {
 		List<String> tokens = new ArrayList<>();
 		for(BaseToken baseToken :  JCasUtil.selectPreceding(jCas, BaseToken.class, left, contextSize))
{
 			if(sent.getBegin() <= baseToken.getBegin()) {
-				tokens.add(baseToken.getCoveredText()); 
+				//				if(!(baseToken instanceof NewlineToken)){
+				String stringValue = TokenPreprocForWord2Vec.tokenToString(baseToken);
+				tokens.add(stringValue);//baseToken.getCoveredText()); 
+				//				}
 			}
 		}
 		tokens.add("<" + leftType + ">");
-		tokens.add(left.getCoveredText());
+		//tokens.add(left.getCoveredText());
+		for(BaseToken base : JCasUtil.selectCovered(jCas, BaseToken.class, left)){
+			String stringValue = TokenPreprocForWord2Vec.tokenToString(base);
+			tokens.add(stringValue);
+		}
+
 		tokens.add("</" + leftType + ">");
 		for(BaseToken baseToken : JCasUtil.selectBetween(jCas, BaseToken.class, left, right)) {
-			tokens.add(baseToken.getCoveredText());
+			//			if(!(baseToken instanceof NewlineToken)){
+			String stringValue = TokenPreprocForWord2Vec.tokenToString(baseToken);
+			tokens.add(stringValue);//baseToken.getCoveredText()); 
+			//			}
 		}
 		tokens.add("<" + rightType + ">");
-		tokens.add(right.getCoveredText());
+		//tokens.add(right.getCoveredText());
+		for(BaseToken base : JCasUtil.selectCovered(jCas, BaseToken.class, right)){
+			String stringValue = TokenPreprocForWord2Vec.tokenToString(base);
+			tokens.add(stringValue);
+		}
+
 		tokens.add("</" + rightType + ">");
 		for(BaseToken baseToken : JCasUtil.selectFollowing(jCas, BaseToken.class, right, contextSize))
{
 			if(baseToken.getEnd() <= sent.getEnd()) {
-				tokens.add(baseToken.getCoveredText());
+				//				if(!(baseToken instanceof NewlineToken)){
+				String stringValue = TokenPreprocForWord2Vec.tokenToString(baseToken);
+				tokens.add(stringValue);//baseToken.getCoveredText()); 
+				//				}
 			}
 		}
 
 		return String.join(" ", tokens).replaceAll("[\r\n]", " ");
 	}
 
+
 	/**
 	 * Print POS tags from left to right.
 	 * @param contextSize number of tokens to include on the left of arg1 and on the right of
arg2
@@ -251,25 +275,35 @@ public class ArgContextProvider {
 		List<String> tokens = new ArrayList<>();
 		for(BaseToken baseToken :  JCasUtil.selectPreceding(jCas, BaseToken.class, left, contextSize))
{
 			if(sent.getBegin() <= baseToken.getBegin()) {
-				tokens.add(baseToken.getPartOfSpeech()); 
+				if(!baseToken.getCoveredText().equals(" ")){
+					tokens.add(baseToken.getPartOfSpeech());
+				}
 			}
 		}
 		tokens.add("<" + leftType + ">");
 		for(BaseToken baseToken : JCasUtil.selectCovered(jCas, BaseToken.class, left)) {
-			tokens.add(baseToken.getPartOfSpeech());
+			if(!(baseToken instanceof NewlineToken)){
+				tokens.add(baseToken.getPartOfSpeech());
+			}
 		}
 		tokens.add("</" + leftType + ">");
 		for(BaseToken baseToken : JCasUtil.selectBetween(jCas, BaseToken.class, left, right)) {
-			tokens.add(baseToken.getPartOfSpeech());
+			if(!(baseToken instanceof NewlineToken)){
+				tokens.add(baseToken.getPartOfSpeech());
+			}
 		}
 		tokens.add("<" + rightType + ">");
 		for(BaseToken baseToken : JCasUtil.selectCovered(jCas, BaseToken.class, right)) {
-			tokens.add(baseToken.getPartOfSpeech());
+			if(!(baseToken instanceof NewlineToken)){
+				tokens.add(baseToken.getPartOfSpeech());
+			}
 		}
 		tokens.add("</" + rightType + ">");
 		for(BaseToken baseToken : JCasUtil.selectFollowing(jCas, BaseToken.class, right, contextSize))
{
 			if(baseToken.getEnd() <= sent.getEnd()) {
-				tokens.add(baseToken.getPartOfSpeech());
+				if(!(baseToken instanceof NewlineToken)){
+					tokens.add(baseToken.getPartOfSpeech());
+				}
 			}
 		}
 
@@ -363,4 +397,65 @@ public class ArgContextProvider {
 		return null;
 	}
 
+	/**
+	 * Builds the token context of an argument pair, substituting caller-supplied strings for
+	 * the arguments themselves and placeholders for some of the events lying between them.
+	 * Output order: preceding tokens, tagged left string, in-between tokens, tagged right
+	 * string, following tokens.
+	 *
+	 * @param jCas the CAS to select annotations from
+	 * @param sent sentence whose span bounds the preceding and following context
+	 * @param left the first argument
+	 * @param leftType tag name wrapped around the left string
+	 * @param umlsleft string emitted in place of the left argument's text
+	 * @param right the second argument
+	 * @param rightType tag name wrapped around the right string
+	 * @param umlsright string emitted in place of the right argument's text
+	 * @param contextSize number of tokens requested before left and after right
+	 * @return the pieces joined by single spaces, with every CR/LF character turned into a space
+	 */
+	public static String getTokenContext(JCas jCas, Sentence sent, IdentifiedAnnotation left, String leftType,
+			String umlsleft, IdentifiedAnnotation right, String rightType, String umlsright, int contextSize) {
+		List<String> pieces = new ArrayList<>();
+
+		// tokens before the left argument; drop any that start before the sentence does
+		for(BaseToken before : JCasUtil.selectPreceding(jCas, BaseToken.class, left, contextSize)) {
+			if(before.getBegin() >= sent.getBegin()) {
+				pieces.add(before.getCoveredText());
+			}
+		}
+		pieces.add("<" + leftType + ">");
+		pieces.add(umlsleft);
+		pieces.add("</" + leftType + ">");
+
+		// events between the two arguments that will be collapsed into a placeholder
+		List<EventMention> collapsed = new ArrayList<>();
+		for(EventMention candidate : JCasUtil.selectBetween(jCas, EventMention.class, left, right)) {
+			boolean isPlainEvent = candidate.getClass().equals(EventMention.class);
+			int coveringCount = JCasUtil.selectCovering(jCas, EventMention.class, candidate).size();
+			int wordCount = JCasUtil.selectCovered(jCas, WordToken.class, candidate).size();
+			if(!isPlainEvent && coveringCount <= 1 && wordCount > 1) {
+				collapsed.add(candidate);
+			}
+		}
+
+		// Walk left to right, emitting the tokens in each gap followed by the event's
+		// placeholder. With no collapsed events the loop is skipped and the final gap
+		// is simply everything between left and right.
+		IdentifiedAnnotation gapStart = left;
+		for(EventMention event : collapsed) {
+			for(BaseToken inGap : JCasUtil.selectBetween(jCas, BaseToken.class, gapStart, event)) {
+				pieces.add(inGap.getCoveredText());
+			}
+			pieces.add("umls_" + event.getTypeID());
+			gapStart = event;
+		}
+		for(BaseToken inGap : JCasUtil.selectBetween(jCas, BaseToken.class, gapStart, right)) {
+			pieces.add(inGap.getCoveredText());
+		}
+
+		pieces.add("<" + rightType + ">");
+		pieces.add(umlsright);
+		pieces.add("</" + rightType + ">");
+		// tokens after the right argument; drop any that end past the sentence
+		for(BaseToken after : JCasUtil.selectFollowing(jCas, BaseToken.class, right, contextSize)) {
+			if(after.getEnd() <= sent.getEnd()) {
+				pieces.add(after.getCoveredText());
+			}
+		}
+
+		return String.join(" ", pieces).replaceAll("[\r\n]", " ");
+	}
+
 }

Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TokenPreprocForWord2Vec.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TokenPreprocForWord2Vec.java?rev=1799255&view=auto
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TokenPreprocForWord2Vec.java
(added)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/TokenPreprocForWord2Vec.java
Mon Jun 19 18:35:29 2017
@@ -0,0 +1,40 @@
+package org.apache.ctakes.temporal.utils;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+
+/**
+ * Maps a {@link BaseToken} to the string that represents it in token-sequence output.
+ * All members are static; the class declares no fields.
+ */
+public class TokenPreprocForWord2Vec {
+
+  /**
+   * Determine what to print based on the token's type, which is identified by the
+   * simple name of the token's runtime class.
+   *
+   * @param token the token to convert
+   * @return the lower-cased covered text for contraction, punctuation, symbol and word
+   *         tokens; {@code "number_token"} for a NumToken; {@code null} for a NewlineToken
+   *         (filter it out before String.join, which renders null as "null")
+   * @throws IllegalArgumentException if the class name is none of the six handled types
+   */
+  public static String tokenToString(BaseToken token) {
+    String tokenType = token.getClass().getSimpleName();
+    // Locale.ROOT keeps case folding independent of the JVM default locale
+    // (under a Turkish locale "I" would otherwise become a dotless i).
+    String tokenText = token.getCoveredText().toLowerCase(java.util.Locale.ROOT);
+
+    switch(tokenType) {
+    case "NewlineToken":
+      return null;
+    case "NumToken":
+      return "number_token";
+    case "ContractionToken":
+    case "PunctuationToken":
+    case "SymbolToken":
+    case "WordToken":
+      return tokenText;
+    default:
+      throw new IllegalArgumentException("Invalid token type: " + tokenType);
+    }
+  }
+}
\ No newline at end of file



Mime
View raw message