Author: dligach
Date: Wed Sep 21 17:41:44 2016
New Revision: 1761788

URL: http://svn.apache.org/viewvc?rev=1761788&view=rev
Log:
Updated to work with my new version of dataset.py; also using the CTAKES_ROOT environment variable to locate the target directory.

Modified:
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
    ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
    ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
    ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/dataset.py Wed Sep 21 17:41:44 2016
@@ -1,154 +1,77 @@
 #!/usr/bin/env python

 import numpy as np
-
 import sys
 sys.dont_write_bytecode = True
-
 import ConfigParser
-
 import glob, string, collections, operator
-from fnmatch import fnmatch
-
-label2int = {
-    'none':0,
-    'contains':1,
-    'contains-1':2
-    }
-
-# will have to do this eventually
-# label2int = {
-#     'none': 0,
-#     'contains': 1,
-#     'contains-1': 2,
-#     'before': 3,
-#     'before-1': 4,
-#     'begins-on': 5,
-#     'begins-on-1': 6,
-#     'ends-on': 7,
-#     'ends-on-1': 8,
-#     'overlap': 9,
-#     'overlap-1': 10,
-# }
-
 class DatasetProvider:
     """THYME relation data"""

-    def __init__(self, file_names):
-        """Index words by frequency in a list of files"""
-
-        self.alphabet = {} # words indexed by frequency
-
-        unigrams = [] # read entire corpus into a list
-        for file_name in file_names:
-            for line in open(file_name):
-                label, text = line.strip().split('|')
-                unigrams.extend(text.split())
+    def __init__(self, path):
+        """Index words by frequency in a file"""
+        self.word2int = {}  # words indexed by frequency
+        self.label2int = {} # class to int mapping
+
+        unigrams = [] # corpus as list
+        labels = []   # classes as list
+        for line in open(path):
+            label, text = line.strip().split('|')
+            unigrams.extend(text.split())
+            labels.append(label)
+
         index = 1 # zero used to encode unknown words
+        self.word2int['oov_word'] = 0
         unigram_counts = collections.Counter(unigrams)
-        self.alphabet['oov_word'] = 0
         for unigram, count in unigram_counts.most_common():
-            self.alphabet[unigram] = index
+            self.word2int[unigram] = index
+            index = index + 1
+
+        index = 0 # index classes
+        for label in set(labels):
+            self.label2int[label] = index
             index = index + 1

-    def load(self, path):
+    def load(self, path, maxlen=float('inf')):
         """Convert sentences (examples) into lists of indices"""

         examples = []
         labels = []
+
         for line in open(path):
             label, text = line.strip().split('|')
             example = []
             for unigram in text.split():
-                example.append(self.alphabet[unigram])
-            examples.append(example)
-            labels.append(label2int[label])
-
-        return examples, labels
+                if unigram in self.word2int:
+                    example.append(self.word2int[unigram])
+                else:
+                    example.append(self.word2int['oov_word'])

-    def load_if_oov(self, path):
+            # truncate example if it's too long
+            if len(example) > maxlen:
+                example = example[0:maxlen]

-        examples = []
-        labels = []
-        for line in open(path):
-            label,text = line.strip().split('|')
-            example = []
-            for unigram in text.split():
-                if(self.alphabet.has_key(unigram)):
-                    example.append(self.alphabet[unigram])
-                else:
-                    example.append(self.alphabet["none"])
             examples.append(example)
-            labels.append(label2int[label])
+            labels.append(self.label2int[label])

         return examples, labels

-    def load_by_region(self, path):
-        pres = []
-        arg1s = []
-        conts = []
-        arg2s = []
-        posts = []
-        labels = []
-        for line in open(path):
-            label,text = line.strip().split('|')
-            pre,arg1,cont,arg2,post = self.processText(text)
-            pres.append(pre)
-            arg1s.append(arg1)
-            conts.append(cont)
-            arg2s.append(arg2)
-            posts.append(post)
-            labels.append(label2int[label])
-
-        return pres, arg1s, conts, arg2s, posts, labels
-
-    def processText(self, text):
-        pre = []
-        arg1 = []
-        cont = []
-        arg2 = []
-        post = []
-
-        tag = 0
-        for unigram in text.split():
-            idx = self.alphabet[unigram]
-            if(fnmatch(unigram, '<*>')):
-                tag = tag + 1
-                continue
-            if(tag == 0):
-                pre.append(idx)
-            elif(tag == 1):
-                arg1.append(idx)
-            elif(tag == 2):
-                cont.append(idx)
-            elif(tag == 3):
-                arg2.append(idx)
-            elif(tag == 4):
-                post.append(idx)
-
-        return pre, arg1, cont, arg2, post
-
-
 if __name__ == "__main__":

     cfg = ConfigParser.ConfigParser()
-    cfg.read('settings.ini')
-
-    dataset = DatasetProvider([cfg.get('data', 'train'),
-                               cfg.get('data', 'test')])
-    print 'alphabet size:', len(dataset.alphabet)
+    cfg.read(sys.argv[1])

-    x,y = dataset.load(cfg.get('data', 'test'))
+    dataset = DatasetProvider(cfg.get('data', 'train'))
+    print 'alphabet size:', len(dataset.word2int)

-    print 'max seq len:', max([len(s) for s in x])
+    x,y = dataset.load(cfg.get('data', 'train'))
+    print 'train max seq len:', max([len(s) for s in x])
+
+    x,y = dataset.load(cfg.get('data', 'test'), maxlen=10)
     print 'number of examples:', len(x)
-    print 'number of labels:', len(set(y))
+    print 'test max seq len:', max([len(s) for s in x])
+    print 'labels:', dataset.label2int
     print 'label counts:', collections.Counter(y)
     print 'first 10 examples:', x[:10]
-    print 'class proportions:'
-    counter = collections.Counter(y)
-    for label in counter:
-        print label, counter[label] / float(len(y)), float(len(y)) / counter[label]

Modified: ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/dataset.pyc?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
Binary files - no diff available.
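To make the new dataset.py contract concrete, here is a toy run of the revised DatasetProvider. This is a sketch only: the file path and its contents are made up, and it assumes Python 2 (matching the print statements above) with this dataset.py importable.

    import dataset

    # toy file in the 'label|text' format that DatasetProvider expects;
    # path and contents are hypothetical
    with open('/tmp/toy.txt', 'w') as f:
        f.write('contains|tumor was seen on the scan\n')
        f.write('none|patient reports no pain\n')

    provider = dataset.DatasetProvider('/tmp/toy.txt')
    print 'vocabulary size:', len(provider.word2int) # unigrams plus 'oov_word' at index 0
    print 'label mapping:', provider.label2int       # e.g. {'contains': 0, 'none': 1}

    # words become frequency-rank indices; unseen words map to index 0
    # ('oov_word'), and examples longer than maxlen are truncated
    x, y = provider.load('/tmp/toy.txt', maxlen=4)
    print 'first example:', x[0]

Note that label2int is now learned from the data rather than hardcoded, so the set of relation labels can change without touching the code.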
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/predict.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/predict.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/predict.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/predict.py Wed Sep 21 17:41:44 2016
@@ -12,23 +12,20 @@ def main(args):
     if len(args) < 1:
         sys.stderr.write("Error - one required argument: <working_dir>\n")
         sys.exit(-1)
-    working_dir = args[0]
-    int2label = {
-        0:'none',
-        1:'CONTAINS',
-        2:'CONTAINS-1'
-        }
-
-    ctakes_root = '/Users/Dima/Loyola/Workspaces/cTakes/ctakes/'
     target_dir = 'ctakes-temporal/target/eval/thyme/train_and_test/event-time/'
-    model_dir = ctakes_root + target_dir
+    model_dir = os.path.join(os.environ['CTAKES_ROOT'], target_dir)

     maxlen = pickle.load(open(os.path.join(model_dir, "maxlen.p"), "rb"))
-    alphabet = pickle.load(open(os.path.join(model_dir, "alphabet.p"), "rb"))
+    word2int = pickle.load(open(os.path.join(model_dir, "word2int.p"), "rb"))
+    label2int = pickle.load(open(os.path.join(model_dir, "label2int.p"), "rb"))

     model = model_from_json(open(os.path.join(model_dir, "model_0.json")).read())
     model.load_weights(os.path.join(model_dir, "model_0.h5"))

+    int2label = {}
+    for label, integer in label2int.items():
+        int2label[integer] = label
+
     while True:
         try:
             line = sys.stdin.readline().rstrip()
@@ -37,10 +34,10 @@ def main(args):

         feats=[]
         for unigram in line.rstrip().split():
-            if(alphabet.has_key(unigram)):
-                feats.append(alphabet[unigram])
+            if(word2int.has_key(unigram)):
+                feats.append(word2int[unigram])
             else:
-                feats.append(alphabet["none"])
+                feats.append(word2int['oov_word'])

         if(len(feats) > maxlen):
             feats=feats[0:maxlen]
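The predict.py changes replace two pieces of hardcoded state: the checkout path (previously '/Users/Dima/Loyola/...') now comes from the CTAKES_ROOT environment variable, and the int-to-label mapping is rebuilt by inverting the label2int dictionary pickled at training time. A minimal sketch of both mechanisms, with stand-in values rather than the real pickled contents:

    import os

    # normally exported by the caller before running predict.py,
    # e.g. export CTAKES_ROOT=/path/to/ctakes (path is a stand-in)
    os.environ['CTAKES_ROOT'] = '/path/to/ctakes'

    target_dir = 'ctakes-temporal/target/eval/thyme/train_and_test/event-time/'
    model_dir = os.path.join(os.environ['CTAKES_ROOT'], target_dir)

    # invert the label-to-int mapping (stand-in values here) so model
    # output indices can be decoded back into label strings
    label2int = {'none': 0, 'contains': 1, 'contains-1': 2}
    int2label = dict((i, label) for label, i in label2int.items())
    print int2label[1]  # 'contains'

Inverting the pickled mapping keeps predict.py in sync with whatever label indexing train_and_package.py happened to learn, instead of relying on the two scripts agreeing by convention.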
Modified: ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py?rev=1761788&r1=1761787&r2=1761788&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py (original)
+++ ctakes/trunk/ctakes-temporal/scripts/nn/train_and_package.py Wed Sep 21 17:41:44 2016
@@ -23,14 +23,13 @@ def main(args):
     if len(args) < 1:
         sys.stderr.write("Error - one required argument: <working_dir>\n")
         sys.exit(-1)
-
     working_dir = args[0]
     data_file = os.path.join(working_dir, 'training-data.liblinear')

     # learn alphabet from training data
-    data_set = dataset.DatasetProvider([data_file])
+    provider = dataset.DatasetProvider(data_file)
     # now load training examples and labels
-    train_x, train_y = data_set.load(data_file)
+    train_x, train_y = provider.load(data_file)
     # turn x and y into numpy array among other things
     maxlen = max([len(seq) for seq in train_x])
     outcomes = set(train_y)
@@ -40,7 +39,8 @@ def main(args):
     train_y = to_categorical(np.array(train_y), classes)

     pickle.dump(maxlen, open(os.path.join(working_dir, 'maxlen.p'),"wb"))
-    pickle.dump(data_set.alphabet, open(os.path.join(working_dir, 'alphabet.p'),"wb"))
+    pickle.dump(provider.word2int, open(os.path.join(working_dir, 'word2int.p'),"wb"))
+    pickle.dump(provider.label2int, open(os.path.join(working_dir, 'label2int.p'),"wb"))

     print 'train_x shape:', train_x.shape
     print 'train_y shape:', train_y.shape
@@ -51,7 +51,7 @@ def main(args):
     for filter_len in '2,3,4,5'.split(','):

         branch = Sequential()
-        branch.add(Embedding(len(data_set.alphabet),
+        branch.add(Embedding(len(provider.word2int),
                              300,
                              input_length=maxlen,
                              weights=None))
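The net effect of the train_and_package.py changes is that everything predict.py needs to reproduce the training-time encoding now travels with the model. A sketch of the persisted artifacts (the helper function name is hypothetical; the pickle filenames match the ones used above):

    import os
    import pickle

    def package_mappings(provider, maxlen, working_dir):
        # word2int: unigram -> frequency-rank index, 0 reserved for 'oov_word'
        pickle.dump(provider.word2int,
                    open(os.path.join(working_dir, 'word2int.p'), 'wb'))
        # label2int: label string -> integer, inverted by predict.py
        pickle.dump(provider.label2int,
                    open(os.path.join(working_dir, 'label2int.p'), 'wb'))
        # maxlen: the sequence length the Embedding layer was built with
        pickle.dump(maxlen,
                    open(os.path.join(working_dir, 'maxlen.p'), 'wb'))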