ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c...@apache.org
Subject svn commit: r1783420 [1/2] - in /ctakes/trunk/ctakes-temporal/src/main: java/org/apache/ctakes/temporal/ae/ java/org/apache/ctakes/temporal/ae/feature/ resources/org/apache/ctakes/temporal/
Date Fri, 17 Feb 2017 16:17:29 GMT
Author: clin
Date: Fri Feb 17 16:17:28 2017
New Revision: 1783420

URL: http://svn.apache.org/viewvc?rev=1783420&view=rev
Log:
updated docTimeRel annotator with:
embedding features
EventPropertyExtractor
TimeXExtractor

Added:
    ctakes/trunk/ctakes-temporal/src/main/resources/org/apache/ctakes/temporal/thyme_word2vec_mapped_50.vec
  (with props)
Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/DocTimeRelAnnotator.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/EventPropertyExtractor.java
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeXExtractor.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/DocTimeRelAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/DocTimeRelAnnotator.java?rev=1783420&r1=1783419&r2=1783420&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/DocTimeRelAnnotator.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/DocTimeRelAnnotator.java Fri Feb 17 16:17:28 2017
@@ -19,6 +19,7 @@
 package org.apache.ctakes.temporal.ae;
 
 import java.io.File;
+import java.io.IOException;
 import java.util.ArrayList;
 //import java.io.IOException;
 import java.util.List;
@@ -27,11 +28,14 @@ import java.util.Map;
 
 import org.apache.ctakes.temporal.ae.feature.ClosestVerbExtractor;
 import org.apache.ctakes.temporal.ae.feature.ContinuousTextExtractor;
+import org.apache.ctakes.temporal.ae.feature.CoveredTextToValuesExtractor;
+import org.apache.ctakes.temporal.ae.feature.DateAndMeasurementExtractor;
 import org.apache.ctakes.temporal.ae.feature.EventPropertyExtractor;
 import org.apache.ctakes.temporal.ae.feature.NearbyVerbTenseXExtractor;
 import org.apache.ctakes.temporal.ae.feature.SectionHeaderExtractor;
 import org.apache.ctakes.temporal.ae.feature.TimeXExtractor;
 import org.apache.ctakes.temporal.ae.feature.UmlsSingleFeatureExtractor;
+import org.apache.ctakes.temporal.ae.feature.duration.DurationExpectationFeatureExtractor;
 import org.apache.ctakes.temporal.utils.SoftMaxUtil;
 import org.apache.ctakes.typesystem.type.refsem.Event;
 import org.apache.ctakes.typesystem.type.refsem.EventProperties;
@@ -53,9 +57,14 @@ import org.cleartk.ml.DataWriter;
 import org.cleartk.ml.Feature;
 import org.cleartk.ml.Instance;
 import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
 import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.FirstCovered;
 import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.LastCovered;
 import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.ml.feature.function.CharacterCategoryPatternFunction;
+import org.cleartk.ml.feature.function.CharacterCategoryPatternFunction.PatternType;
 import org.cleartk.ml.feature.extractor.CleartkExtractorException;
 import org.cleartk.ml.feature.extractor.CombinedExtractor1;
 import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
@@ -64,6 +73,8 @@ import org.cleartk.ml.jar.DefaultDataWri
 import org.cleartk.ml.jar.DirectoryDataWriterFactory;
 import org.cleartk.ml.jar.GenericJarClassifierFactory;
 
+import com.google.common.base.Charsets;
+
 //import com.google.common.base.Charsets;
 
 public class DocTimeRelAnnotator extends CleartkAnnotator<String> {
@@ -109,17 +120,19 @@ public class DocTimeRelAnnotator extends
 
 	private CleartkExtractor<EventMention, BaseToken> contextExtractor;  
 	private CleartkExtractor<EventMention, BaseToken> tokenVectorContext;
+	private CleartkExtractor<EventMention, BaseToken> tokenVectorContext2;
 	private ContinuousTextExtractor continuousText;
+	private ContinuousTextExtractor continuousText2;
 	private SectionHeaderExtractor sectionIDExtractor;
 	private ClosestVerbExtractor closestVerbExtractor;
 	private TimeXExtractor timeXExtractor;
 	private EventPropertyExtractor genericExtractor;
-	private UmlsSingleFeatureExtractor umlsExtractor;
+//	private UmlsSingleFeatureExtractor umlsExtractor;
 	private NearbyVerbTenseXExtractor verbTensePatternExtractor;
 
-	//  private DateAndMeasurementExtractor dateExtractor;  
-	//  private CoveredTextToValuesExtractor disSemExtractor;
-	//  private DurationExpectationFeatureExtractor durationExtractor;
+	private DateAndMeasurementExtractor dateExtractor;  
+//	private CoveredTextToValuesExtractor disSemExtractor;
+//	private DurationExpectationFeatureExtractor durationExtractor;
 
 	public static final String PARAM_PROB_VIEW = "ProbView";
 	@ConfigurationParameter(name=PARAM_PROB_VIEW, mandatory=false)
@@ -130,16 +143,21 @@ public class DocTimeRelAnnotator extends
 		super.initialize(context);
 		CombinedExtractor1<BaseToken> baseExtractor = new CombinedExtractor1<>(
 				new CoveredTextExtractor<BaseToken>(),
+				CharacterCategoryPatternFunction.<BaseToken>createExtractor(PatternType.ONE_PER_CHAR),
 				new TypePathExtractor<>(BaseToken.class, "partOfSpeech"));
 		this.contextExtractor = new CleartkExtractor<>(
 				BaseToken.class,
 				baseExtractor,
 				new Preceding(3),
-				new Covered(),
+				new FirstCovered(1),
+				new LastCovered(1),
+				new Bag(new Covered()),
 				new Following(3));
 		final String vectorFile = "org/apache/ctakes/temporal/mimic_vectors.txt";
+		final String vectorFile2 = "org/apache/ctakes/temporal/thyme_word2vec_mapped_50.vec";
 		try {
 			this.continuousText = new ContinuousTextExtractor(vectorFile);
+			this.continuousText2 = new ContinuousTextExtractor(vectorFile2);
 		} catch (CleartkExtractorException e) {
 			System.err.println("cannot find file: "+ vectorFile);
 			e.printStackTrace();
@@ -147,25 +165,29 @@ public class DocTimeRelAnnotator extends
 		this.tokenVectorContext = new CleartkExtractor<>(
 				BaseToken.class,
 				continuousText,      
-				//new Preceding(5),
+				new Preceding(5),
+				new Covered(),
+				new Following(5));
+		this.tokenVectorContext2 = new CleartkExtractor<>(
+				BaseToken.class,
+				continuousText2,  
 				new Covered());
-		//new Following(5));
 		this.sectionIDExtractor = new SectionHeaderExtractor();
 		this.closestVerbExtractor = new ClosestVerbExtractor();
 		this.timeXExtractor = new TimeXExtractor();
 		this.genericExtractor = new EventPropertyExtractor();
-		this.umlsExtractor = new UmlsSingleFeatureExtractor();
+//		this.umlsExtractor = new UmlsSingleFeatureExtractor();
 		this.verbTensePatternExtractor = new NearbyVerbTenseXExtractor();
 
-		//    this.dateExtractor = new DateAndMeasurementExtractor();
-
-		//    try {
-		//    	Map<String, double[]> word_disSem = CoveredTextToValuesExtractor.parseTextDoublesMap(new File("src/main/resources/embeddings.size25.txt"), Charsets.UTF_8);
-		//    	this.disSemExtractor = new CoveredTextToValuesExtractor("DisSemFeat", word_disSem);
-		//	} catch (IOException e) {
-		//		e.printStackTrace();
-		//	}
-		//    this.durationExtractor = new DurationExpectationFeatureExtractor();
+		this.dateExtractor = new DateAndMeasurementExtractor();
+		
+//		try {
+//			Map<String, double[]> word_disSem = CoveredTextToValuesExtractor.parseTextDoublesMap(new File("src/main/resources/embeddings.size25.txt"), Charsets.UTF_8);
+//			this.disSemExtractor = new CoveredTextToValuesExtractor("DisSemFeat", word_disSem);
+//		} catch (IOException e) {
+//			e.printStackTrace();
+//		}
+//		this.durationExtractor = new DurationExpectationFeatureExtractor();
 	}
 
 	@Override
@@ -176,22 +198,24 @@ public class DocTimeRelAnnotator extends
 			if(sents!=null && sents.size()>0){
 				features.addAll(this.contextExtractor.extractWithin(jCas, eventMention, sents.get(0)));
 				features.addAll(this.tokenVectorContext.extractWithin(jCas, eventMention, sents.get(0)));
+				features.addAll(this.tokenVectorContext2.extractWithin(jCas, eventMention, sents.get(0)));
 			}else{
 				features.addAll(this.contextExtractor.extract(jCas, eventMention));
 				features.addAll(this.tokenVectorContext.extract(jCas, eventMention));
+				features.addAll(this.tokenVectorContext2.extract(jCas, eventMention));
 			}
 
 			features.addAll(this.sectionIDExtractor.extract(jCas, eventMention)); //add section heading
 			features.addAll(this.closestVerbExtractor.extract(jCas, eventMention)); //add closest verb
 			features.addAll(this.timeXExtractor.extract(jCas, eventMention)); //add the closest time expression types
 			features.addAll(this.genericExtractor.extract(jCas, eventMention)); //add the closest time expression types
-			features.addAll(this.umlsExtractor.extract(jCas, eventMention)); //add umls features
+//			features.addAll(this.umlsExtractor.extract(jCas, eventMention)); //add umls features
 			features.addAll(this.verbTensePatternExtractor.extract(jCas, eventMention));//add nearby verb POS pattern feature
 
 			//    
-			//    features.addAll(this.dateExtractor.extract(jCas, eventMention)); //add the closest NE type
-			//    features.addAll(this.durationExtractor.extract(jCas, eventMention)); //add duration feature
-			//    features.addAll(this.disSemExtractor.extract(jCas, eventMention)); //add distributional semantic features
+			features.addAll(this.dateExtractor.extract(jCas, eventMention)); //add the closest NE type
+//			features.addAll(this.durationExtractor.extract(jCas, eventMention)); //add duration feature
+//			features.addAll(this.disSemExtractor.extract(jCas, eventMention)); //add distributional semantic features
 			if (this.isTraining()) {
 				if(eventMention.getEvent() != null){
 					String outcome = eventMention.getEvent().getProperties().getDocTimeRel();

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/EventPropertyExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/EventPropertyExtractor.java?rev=1783420&r1=1783419&r2=1783420&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/EventPropertyExtractor.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/EventPropertyExtractor.java Fri Feb 17 16:17:28 2017
@@ -25,14 +25,20 @@ import java.util.List;
 //import java.util.logging.Logger;
 import java.util.Set;
 
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.cleartk.ml.Feature;
 import org.cleartk.ml.feature.extractor.CleartkExtractorException;
 import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.util.ViewUriUtil;
 
 public class EventPropertyExtractor implements FeatureExtractor1<Annotation> {
 
@@ -46,10 +52,55 @@ public class EventPropertyExtractor impl
 		//		this.name = "EventContextualModality";
 
 	}
+	
+	private static final List<String> genericWords = new ArrayList<>();
+	static{
+		genericWords.add("potential");
+		genericWords.add("possible");
+		genericWords.add("may");
+		genericWords.add("likely");
+		genericWords.add("probable");			
+		genericWords.add("prospective");
+		genericWords.add("instruct");
+		genericWords.add("if");//newly added 4 on July 13 2016
+		genericWords.add("could");
+		genericWords.add("discussed");
+		genericWords.add("discussion");
+		genericWords.add("considered");
+		genericWords.add("monitor");//newly added on Aug 19 2016
+		genericWords.add("plan");//newly added on Aug 19 2016
+		genericWords.add("cxr");
+		genericWords.add("data");
+		//			genericWords.add("change");
+		//			genericWords.add("prescription");
+		//			genericWords.add("prescribe");
+		//			genericWords.add("prescribed");
+		//			genericWords.add("speak");
+		//			genericWords.add("spoke");
+	}
 
 	@Override
 	public List<Feature> extract(JCas view, Annotation annotation) throws CleartkExtractorException {
 		List<Feature> features = new ArrayList<>();
+		
+		//get Document ID:
+		try {
+			String docID = ViewUriUtil.getURI(view).toString();
+			
+			int begin = docID.lastIndexOf("_");
+			String fname = docID.substring(begin+1);
+			features.add(new Feature("docName", fname));
+			
+			if(fname.equals("RAD")||fname.equals("SP")){
+				features.add(new Feature("docName:RAD+SP"));
+			}else{
+				features.add(new Feature("docName:others"));
+			}
+			
+		} catch (AnalysisEngineProcessException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
 
 		//1 get event:
 		EventMention event = (EventMention)annotation;
@@ -85,12 +136,79 @@ public class EventPropertyExtractor impl
 					features.add(new Feature("RightMostEvent"));
 				}
 			}
+			
+			//check if this event is generic:
+			List<WordToken> words = new ArrayList<>(JCasUtil.selectPreceding(view, WordToken.class, event, 15));
+			words.addAll(JCasUtil.selectFollowing(view, WordToken.class, event, 15));
+			for(WordToken word : words){
+				if(outsideScope(word, coveringSent)){//if the word is outside the sentence
+					continue;
+				}
+				if(genericWords.contains(word.getCoveredText().toLowerCase())){
+					features.add(new Feature("GenericEvent"));
+					break;
+				}
+			}
+			
+			//check how many words are in the event mention:
+//			List<WordToken> coveredWords = new ArrayList<>(JCasUtil.selectCovered(view, WordToken.class, event));
+//			int numWords = coveredWords.size();
+//			if(numWords==1){
+//				features.add(new Feature("singleWordEvent"));
+//			}
+//			features.add(new Feature("Event_Word_num", numWords));
+			
+			//check if there is any newLine token in close vicinity:
+			int newlineNum = 0;
+			for (BaseToken btoken: JCasUtil.selectPreceding(view, BaseToken.class, event, 20)){
+				if(btoken instanceof NewlineToken){
+					newlineNum++;
+				}
+			}
+			if(newlineNum > 0){
+				features.add(new Feature("hasPrecedingNewline"));
+				features.add(new Feature("newLineNum_preceding", newlineNum));
+			}
+			newlineNum = 0;
+			for (BaseToken btoken: JCasUtil.selectFollowing(view, BaseToken.class, event, 20)){
+				if(btoken instanceof NewlineToken){
+					newlineNum++;
+				}
+			}
+			if(newlineNum > 0){
+				features.add(new Feature("hasFollowingNewline"));
+				features.add(new Feature("newLineNum_following", newlineNum));
+			}
+			
+			//check if there is any semi-column is close vicinity:
+//			int	semiColumnNum = 0;
+//			for (BaseToken btoken: JCasUtil.selectFollowing(view, BaseToken.class, event, 5)){
+//				if(btoken instanceof PunctuationToken){
+//					if(btoken.getCoveredText().equals(":")){
+//						semiColumnNum++;
+//					}
+//				}
+//			}
+//			if(semiColumnNum > 0){
+//				features.add(new Feature("hasFollowingSemiColumn"));
+//				features.add(new Feature("semiColumn_following", semiColumnNum));
+//			}
 		}
 
 		features.addAll(getEventFeats("mentionProperty", event));
 
 		return features;
 	}
+	
+	private static boolean outsideScope(WordToken word, Sentence eventSent) {
+		if(word.getBegin()< eventSent.getBegin()){
+			return true;
+		}else if(word.getEnd()>eventSent.getEnd()){
+			return true;
+		}
+		return false;
+	}
+
 
 	private static Collection<? extends Feature> getEventFeats(String name, EventMention mention) {
 		List<Feature> feats = new ArrayList<>();

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeXExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeXExtractor.java?rev=1783420&r1=1783419&r2=1783420&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeXExtractor.java (original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/TimeXExtractor.java Fri Feb 17 16:17:28 2017
@@ -25,6 +25,7 @@ import java.util.Map;
 import java.util.TreeMap;
 //import java.util.logging.Logger;
 
+import org.apache.ctakes.typesystem.type.syntax.NumToken;
 //import org.apache.ctakes.temporal.ae.feature.treekernel.TemporalPETExtractor;
 //import org.apache.ctakes.temporal.ae.feature.treekernel.TemporalSingleTreeExtractor;
 import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
@@ -91,6 +92,8 @@ public class TimeXExtractor implements F
 	  if (sentList != null && !sentList.isEmpty()){
 		  timeDistMap = new TreeMap<>();
 		  
+		  //boolean hasNumberToken = false;
+		  
 		  for(Sentence sent : sentList) {
 			  for (TimeMention time : JCasUtil.selectCovered(view, TimeMention.class, sent)) {
 				  timeDistMap.put(Math.abs(time.getBegin() - annotation.getBegin()), time);
@@ -101,7 +104,16 @@ public class TimeXExtractor implements F
 			  for (DateAnnotation time : JCasUtil.selectCovered(view, DateAnnotation.class, sent)) {
 				  timeDistMap.put(Math.abs(time.getBegin() - annotation.getBegin()), time);
 			  }
+			  //for (NumToken number : JCasUtil.selectCovered(view, NumToken.class, sent)){
+			//	  hasNumberToken = true;
+			//	  int numDigit = number.getCoveredText().length();
+			//	  features.add(new Feature("num_digit_numToken", numDigit));
+			 // }
 		  }
+
+		  //if(hasNumberToken){
+			//  features.add(new Feature("has_number_tokens_in_sentence"));
+		  //}
 		  
 		  //get the closest Time Expression feature
 		  for (Map.Entry<Integer, IdentifiedAnnotation> entry : timeDistMap.entrySet()) {



Mime
View raw message