lucenenet-commits mailing list archives

From: synhers...@apache.org
Subject: [05/51] [abbrv] [partial] Cleaning up and getting ready to development towards v4.8
Date: Sat, 06 Sep 2014 19:36:16 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs
new file mode 100644
index 0000000..67eaa6e
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsReader.cs
@@ -0,0 +1,860 @@
+package org.apache.lucene.codecs.memory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.memory.FSTTermsReader.TermsReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
+import org.apache.lucene.util.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+
+/** 
+ * FST-based terms dictionary reader.
+ *
+ * The FST index maps each term to its ord, and during a seek
+ * the ord is used to fetch metadata from a single block.
+ * The term dictionary is fully memory resident.
+ *
+ * @lucene.experimental
+ */
+public class FSTOrdTermsReader extends FieldsProducer {
+  static final int INTERVAL = FSTOrdTermsWriter.SKIP_INTERVAL;
+  final TreeMap<String, TermsReader> fields = new TreeMap<>();
+  final PostingsReaderBase postingsReader;
+  int version;
+  //static final bool TEST = false;
+
+  public FSTOrdTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)  {
+    final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTOrdTermsWriter.TERMS_INDEX_EXTENSION);
+    final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTOrdTermsWriter.TERMS_BLOCK_EXTENSION);
+
+    this.postingsReader = postingsReader;
+    ChecksumIndexInput indexIn = null;
+    IndexInput blockIn = null;
+    boolean success = false;
+    try {
+      indexIn = state.directory.openChecksumInput(termsIndexFileName, state.context);
+      blockIn = state.directory.openInput(termsBlockFileName, state.context);
+      version = readHeader(indexIn);
+      readHeader(blockIn);
+      if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM) {
+        CodecUtil.checksumEntireFile(blockIn);
+      }
+      
+      this.postingsReader.init(blockIn);
+      seekDir(blockIn);
+
+      final FieldInfos fieldInfos = state.fieldInfos;
+      final int numFields = blockIn.readVInt();
+      for (int i = 0; i < numFields; i++) {
+        FieldInfo fieldInfo = fieldInfos.fieldInfo(blockIn.readVInt());
+        boolean hasFreq = fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY;
+        long numTerms = blockIn.readVLong();
+        long sumTotalTermFreq = hasFreq ? blockIn.readVLong() : -1;
+        long sumDocFreq = blockIn.readVLong();
+        int docCount = blockIn.readVInt();
+        int longsSize = blockIn.readVInt();
+        FST<Long> index = new FST<>(indexIn, PositiveIntOutputs.getSingleton());
+
+        TermsReader current = new TermsReader(fieldInfo, blockIn, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, index);
+        TermsReader previous = fields.put(fieldInfo.name, current);
+        checkFieldSummary(state.segmentInfo, indexIn, blockIn, current, previous);
+      }
+      if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM) {
+        CodecUtil.checkFooter(indexIn);
+      } else {
+        CodecUtil.checkEOF(indexIn);
+      }
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(indexIn, blockIn);
+      } else {
+        IOUtils.closeWhileHandlingException(indexIn, blockIn);
+      }
+    }
+  }
+
+  private int readHeader(IndexInput in)  {
+    return CodecUtil.checkHeader(in, FSTOrdTermsWriter.TERMS_CODEC_NAME,
+                                     FSTOrdTermsWriter.TERMS_VERSION_START,
+                                     FSTOrdTermsWriter.TERMS_VERSION_CURRENT);
+  }
+  private void seekDir(IndexInput in)  {
+    if (version >= FSTOrdTermsWriter.TERMS_VERSION_CHECKSUM) {
+      in.seek(in.length() - CodecUtil.footerLength() - 8);
+    } else {
+      in.seek(in.length() - 8);
+    }
+    in.seek(in.readLong());
+  }
+  private void checkFieldSummary(SegmentInfo info, IndexInput indexIn, IndexInput blockIn, TermsReader field, TermsReader previous)  {
+    // #docs with field must be <= #docs
+    if (field.docCount < 0 || field.docCount > info.getDocCount()) {
+      throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.getDocCount() + " (resource=" + indexIn + ", " + blockIn + ")");
+    }
+    // #postings must be >= #docs with field
+    if (field.sumDocFreq < field.docCount) {
+      throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount + " (resource=" + indexIn + ", " + blockIn + ")");
+    }
+    // #positions must be >= #postings
+    if (field.sumTotalTermFreq != -1 && field.sumTotalTermFreq < field.sumDocFreq) {
+      throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq + " (resource=" + indexIn + ", " + blockIn + ")");
+    }
+    if (previous != null) {
+      throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name + " (resource=" + indexIn + ", " + blockIn + ")");
+    }
+  }
+
+  @Override
+  public Iterator<String> iterator() {
+    return Collections.unmodifiableSet(fields.keySet()).iterator();
+  }
+
+  @Override
+  public Terms terms(String field)  {
+    assert field != null;
+    return fields.get(field);
+  }
+
+  @Override
+  public int size() {
+    return fields.size();
+  }
+
+  @Override
+  public void close()  {
+    try {
+      IOUtils.close(postingsReader);
+    } finally {
+      fields.clear();
+    }
+  }
+
+  final class TermsReader extends Terms {
+    final FieldInfo fieldInfo;
+    final long numTerms;
+    final long sumTotalTermFreq;
+    final long sumDocFreq;
+    final int docCount;
+    final int longsSize;
+    final FST<Long> index;
+
+    final int numSkipInfo;
+    final long[] skipInfo;
+    final byte[] statsBlock;
+    final byte[] metaLongsBlock;
+    final byte[] metaBytesBlock;
+
+    TermsReader(FieldInfo fieldInfo, IndexInput blockIn, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST<Long> index)  {
+      this.fieldInfo = fieldInfo;
+      this.numTerms = numTerms;
+      this.sumTotalTermFreq = sumTotalTermFreq;
+      this.sumDocFreq = sumDocFreq;
+      this.docCount = docCount;
+      this.longsSize = longsSize;
+      this.index = index;
+
+      assert (numTerms & (~0xffffffffL)) == 0;
+      final int numBlocks = (int)(numTerms + INTERVAL - 1) / INTERVAL;
+      this.numSkipInfo = longsSize + 3;
+      this.skipInfo = new long[numBlocks * numSkipInfo];
+      this.statsBlock = new byte[(int)blockIn.readVLong()];
+      this.metaLongsBlock = new byte[(int)blockIn.readVLong()];
+      this.metaBytesBlock = new byte[(int)blockIn.readVLong()];
+
+      int last = 0, next = 0;
+      for (int i = 1; i < numBlocks; i++) {
+        next = numSkipInfo * i;
+        for (int j = 0; j < numSkipInfo; j++) {
+          skipInfo[next + j] = skipInfo[last + j] + blockIn.readVLong();
+        }
+        last = next;
+      }
+      blockIn.readBytes(statsBlock, 0, statsBlock.length);
+      blockIn.readBytes(metaLongsBlock, 0, metaLongsBlock.length);
+      blockIn.readBytes(metaBytesBlock, 0, metaBytesBlock.length);
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() {
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
+    }
+
+    @Override
+    public boolean hasFreqs() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+    }
+
+    @Override
+    public boolean hasOffsets() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    }
+
+    @Override
+    public boolean hasPositions() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    }
+
+    @Override
+    public boolean hasPayloads() {
+      return fieldInfo.hasPayloads();
+    }
+
+    @Override
+    public long size() {
+      return numTerms;
+    }
+
+    @Override
+    public long getSumTotalTermFreq() {
+      return sumTotalTermFreq;
+    }
+
+    @Override
+    public long getSumDocFreq()  {
+      return sumDocFreq;
+    }
+
+    @Override
+    public int getDocCount()  {
+      return docCount;
+    }
+
+    @Override
+    public TermsEnum iterator(TermsEnum reuse)  {
+      return new SegmentTermsEnum();
+    }
+
+    @Override
+    public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm)  {
+      return new IntersectTermsEnum(compiled, startTerm);
+    }
+
+    // Only wraps the operations common to interaction with the postings base format (PBF)
+    abstract class BaseTermsEnum extends TermsEnum {
+      /* Current term, null when enum ends or unpositioned */
+      BytesRef term;
+
+      /* Current term's ord, starts from 0 */
+      long ord;
+
+      /* Current term stats + decoded metadata (customized by PBF) */
+      final BlockTermState state;
+
+      /* Datainput to load stats & metadata */
+      final ByteArrayDataInput statsReader = new ByteArrayDataInput();
+      final ByteArrayDataInput metaLongsReader = new ByteArrayDataInput();
+      final ByteArrayDataInput metaBytesReader = new ByteArrayDataInput();
+
+      /* Which block is currently buffered */
+      int statsBlockOrd;
+      int metaBlockOrd;
+
+      /* Current buffered metadata (long[] & byte[]) */
+      long[][] longs;
+      int[] bytesStart;
+      int[] bytesLength;
+
+      /* Current buffered stats (df & ttf) */
+      int[] docFreq;
+      long[] totalTermFreq;
+
+      BaseTermsEnum()  {
+        this.state = postingsReader.newTermState();
+        this.term = null;
+        this.statsReader.reset(statsBlock);
+        this.metaLongsReader.reset(metaLongsBlock);
+        this.metaBytesReader.reset(metaBytesBlock);
+
+        this.longs = new long[INTERVAL][longsSize];
+        this.bytesStart = new int[INTERVAL];
+        this.bytesLength = new int[INTERVAL];
+        this.docFreq = new int[INTERVAL];
+        this.totalTermFreq = new long[INTERVAL];
+        this.statsBlockOrd = -1;
+        this.metaBlockOrd = -1;
+        if (!hasFreqs()) {
+          Arrays.fill(totalTermFreq, -1);
+        }
+      }
+
+      @Override
+      public Comparator<BytesRef> getComparator() {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      }
+
+      /** Decodes stats data into term state */
+      void decodeStats()  {
+        final int upto = (int)ord % INTERVAL;
+        final int oldBlockOrd = statsBlockOrd;
+        statsBlockOrd = (int)ord / INTERVAL;
+        if (oldBlockOrd != statsBlockOrd) {
+          refillStats();
+        }
+        state.docFreq = docFreq[upto];
+        state.totalTermFreq = totalTermFreq[upto];
+      }
+
+      /** Let PBF decode metadata */
+      void decodeMetaData()  {
+        final int upto = (int)ord % INTERVAL;
+        final int oldBlockOrd = metaBlockOrd;
+        metaBlockOrd = (int)ord / INTERVAL;
+        if (metaBlockOrd != oldBlockOrd) {
+          refillMetadata();
+        }
+        metaBytesReader.setPosition(bytesStart[upto]);
+        postingsReader.decodeTerm(longs[upto], metaBytesReader, fieldInfo, state, true);
+      }
+
+      /** Load current stats shard */
+      final void refillStats()  {
+        final int offset = statsBlockOrd * numSkipInfo;
+        final int statsFP = (int)skipInfo[offset];
+        statsReader.setPosition(statsFP);
+        for (int i = 0; i < INTERVAL && !statsReader.eof(); i++) {
+          int code = statsReader.readVInt();
+          if (hasFreqs()) {
+            docFreq[i] = (code >>> 1);
+            if ((code & 1) == 1) {
+              totalTermFreq[i] = docFreq[i];
+            } else {
+              totalTermFreq[i] = docFreq[i] + statsReader.readVLong();
+            }
+          } else {
+            docFreq[i] = code;
+          }
+        }
+      }
+
+      /** Load current metadata shard */
+      final void refillMetadata()  {
+        final int offset = metaBlockOrd * numSkipInfo;
+        final int metaLongsFP = (int)skipInfo[offset + 1];
+        final int metaBytesFP = (int)skipInfo[offset + 2];
+        metaLongsReader.setPosition(metaLongsFP);
+        for (int j = 0; j < longsSize; j++) {
+          longs[0][j] = skipInfo[offset + 3 + j] + metaLongsReader.readVLong();
+        }
+        bytesStart[0] = metaBytesFP; 
+        bytesLength[0] = (int)metaLongsReader.readVLong();
+        for (int i = 1; i < INTERVAL && !metaLongsReader.eof(); i++) {
+          for (int j = 0; j < longsSize; j++) {
+            longs[i][j] = longs[i-1][j] + metaLongsReader.readVLong();
+          }
+          bytesStart[i] = bytesStart[i-1] + bytesLength[i-1];
+          bytesLength[i] = (int)metaLongsReader.readVLong();
+        }
+      }
+
+      @Override
+      public TermState termState()  {
+        decodeMetaData();
+        return state.clone();
+      }
+
+      @Override
+      public BytesRef term() {
+        return term;
+      }
+
+      @Override
+      public int docFreq()  {
+        return state.docFreq;
+      }
+
+      @Override
+      public long totalTermFreq()  {
+        return state.totalTermFreq;
+      }
+
+      @Override
+      public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)  {
+        decodeMetaData();
+        return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
+      }
+
+      @Override
+      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags)  {
+        if (!hasPositions()) {
+          return null;
+        }
+        decodeMetaData();
+        return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
+      }
+
+      // TODO: this can be achieved by making use of Util.getByOutput()
+      //           and should have related tests
+      @Override
+      public void seekExact(long ord)  {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public long ord() {
+        throw new UnsupportedOperationException();
+      }
+    }
+
+    // Iterates through all terms in this field
+    private final class SegmentTermsEnum extends BaseTermsEnum {
+      final BytesRefFSTEnum<Long> fstEnum;
+
+      /* True when current term's metadata is decoded */
+      boolean decoded;
+
+      /* True when current enum is 'positioned' by seekExact(TermState) */
+      boolean seekPending;
+
+      SegmentTermsEnum()  {
+        this.fstEnum = new BytesRefFSTEnum<>(index);
+        this.decoded = false;
+        this.seekPending = false;
+      }
+
+      @Override
+      void decodeMetaData()  {
+        if (!decoded && !seekPending) {
+          super.decodeMetaData();
+          decoded = true;
+        }
+      }
+
+      // Update current enum according to FSTEnum
+      void updateEnum(final InputOutput<Long> pair)  {
+        if (pair == null) {
+          term = null;
+        } else {
+          term = pair.input;
+          ord = pair.output;
+          decodeStats();
+        }
+        decoded = false;
+        seekPending = false;
+      }
+
+      @Override
+      public BytesRef next()  {
+        if (seekPending) {  // previously positioned, but termOutputs not fetched
+          seekPending = false;
+          SeekStatus status = seekCeil(term);
+          assert status == SeekStatus.FOUND;  // must be positioned on a valid term
+        }
+        updateEnum(fstEnum.next());
+        return term;
+      }
+
+      @Override
+      public boolean seekExact(BytesRef target)  {
+        updateEnum(fstEnum.seekExact(target));
+        return term != null;
+      }
+
+      @Override
+      public SeekStatus seekCeil(BytesRef target)  {
+        updateEnum(fstEnum.seekCeil(target));
+        if (term == null) {
+          return SeekStatus.END;
+        } else {
+          return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
+        }
+      }
+
+      @Override
+      public void seekExact(BytesRef target, TermState otherState) {
+        if (!target.equals(term)) {
+          state.copyFrom(otherState);
+          term = BytesRef.deepCopyOf(target);
+          seekPending = true;
+        }
+      }
+    }
+
+    // Iterates intersect result with automaton (cannot seek!)
+    private final class IntersectTermsEnum extends BaseTermsEnum {
+      /* True when current term's metadata is decoded */
+      boolean decoded;
+
+      /* True when a term is pending for the next call to next() */
+      boolean pending;
+
+      /* stack to record how current term is constructed, 
+       * used to accumulate metadata or rewind term:
+       *   level == term.length + 1,
+       *         == 0 when term is null */
+      Frame[] stack;
+      int level;
+
+      /* term dict fst */
+      final FST<Long> fst;
+      final FST.BytesReader fstReader;
+      final Outputs<Long> fstOutputs;
+
+      /* query automaton to intersect with */
+      final ByteRunAutomaton fsa;
+
+      private final class Frame {
+        /* fst stats */
+        FST.Arc<Long> arc;
+
+        /* automaton stats */
+        int state;
+
+        Frame() {
+          this.arc = new FST.Arc<>();
+          this.state = -1;
+        }
+
+        public String toString() {
+          return "arc=" + arc + " state=" + state;
+        }
+      }
+
+      IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm)  {
+        //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
+        this.fst = index;
+        this.fstReader = fst.getBytesReader();
+        this.fstOutputs = index.outputs;
+        this.fsa = compiled.runAutomaton;
+        this.level = -1;
+        this.stack = new Frame[16];
+        for (int i = 0 ; i < stack.length; i++) {
+          this.stack[i] = new Frame();
+        }
+
+        Frame frame;
+        frame = loadVirtualFrame(newFrame());
+        this.level++;
+        frame = loadFirstFrame(newFrame());
+        pushFrame(frame);
+
+        this.decoded = false;
+        this.pending = false;
+
+        if (startTerm == null) {
+          pending = isAccept(topFrame());
+        } else {
+          doSeekCeil(startTerm);
+          pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
+        }
+      }
+
+      @Override
+      void decodeMetaData()  {
+        if (!decoded) {
+          super.decodeMetaData();
+          decoded = true;
+        }
+      }
+
+      @Override
+      void decodeStats()  {
+        final FST.Arc<Long> arc = topFrame().arc;
+        assert arc.nextFinalOutput == fstOutputs.getNoOutput();
+        ord = arc.output;
+        super.decodeStats();
+      }
+
+      @Override
+      public SeekStatus seekCeil(BytesRef target)  {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public BytesRef next()  {
+        //if (TEST) System.out.println("Enum next()");
+        if (pending) {
+          pending = false;
+          decodeStats();
+          return term;
+        }
+        decoded = false;
+      DFS:
+        while (level > 0) {
+          Frame frame = newFrame();
+          if (loadExpandFrame(topFrame(), frame) != null) {  // has valid target
+            pushFrame(frame);
+            if (isAccept(frame)) {  // gotcha
+              break;
+            }
+            continue;  // check next target
+          } 
+          frame = popFrame();
+          while(level > 0) {
+            if (loadNextFrame(topFrame(), frame) != null) {  // has valid sibling 
+              pushFrame(frame);
+              if (isAccept(frame)) {  // gotcha
+                break DFS;
+              }
+              continue DFS;   // check next target 
+            }
+            frame = popFrame();
+          }
+          return null;
+        }
+        decodeStats();
+        return term;
+      }
+
+      BytesRef doSeekCeil(BytesRef target)  {
+        //if (TEST) System.out.println("Enum doSeekCeil()");
+        Frame frame= null;
+        int label, upto = 0, limit = target.length;
+        while (upto < limit) {  // to target prefix, or ceil label (rewind prefix)
+          frame = newFrame();
+          label = target.bytes[upto] & 0xff;
+          frame = loadCeilFrame(label, topFrame(), frame);
+          if (frame == null || frame.arc.label != label) {
+            break;
+          }
+          assert isValid(frame);  // target must be fetched from automaton
+          pushFrame(frame);
+          upto++;
+        }
+        if (upto == limit) {  // got target
+          return term;
+        }
+        if (frame != null) {  // got larger term('s prefix)
+          pushFrame(frame);
+          return isAccept(frame) ? term : next();
+        }
+        while (level > 0) {   // got target's prefix, advance to larger term
+          frame = popFrame();
+          while (level > 0 && !canRewind(frame)) {
+            frame = popFrame();
+          }
+          if (loadNextFrame(topFrame(), frame) != null) {
+            pushFrame(frame);
+            return isAccept(frame) ? term : next();
+          }
+        }
+        return null;
+      }
+
+      /** Virtual frame, never pop */
+      Frame loadVirtualFrame(Frame frame)  {
+        frame.arc.output = fstOutputs.getNoOutput();
+        frame.arc.nextFinalOutput = fstOutputs.getNoOutput();
+        frame.state = -1;
+        return frame;
+      }
+
+      /** Load frame for start arc(node) on fst */
+      Frame loadFirstFrame(Frame frame)  {
+        frame.arc = fst.getFirstArc(frame.arc);
+        frame.state = fsa.getInitialState();
+        return frame;
+      }
+
+      /** Load frame for target arc(node) on fst */
+      Frame loadExpandFrame(Frame top, Frame frame)  {
+        if (!canGrow(top)) {
+          return null;
+        }
+        frame.arc = fst.readFirstRealTargetArc(top.arc.target, frame.arc, fstReader);
+        frame.state = fsa.step(top.state, frame.arc.label);
+        //if (TEST) System.out.println(" loadExpand frame="+frame);
+        if (frame.state == -1) {
+          return loadNextFrame(top, frame);
+        }
+        return frame;
+      }
+
+      /** Load frame for sibling arc(node) on fst */
+      Frame loadNextFrame(Frame top, Frame frame)  {
+        if (!canRewind(frame)) {
+          return null;
+        }
+        while (!frame.arc.isLast()) {
+          frame.arc = fst.readNextRealArc(frame.arc, fstReader);
+          frame.state = fsa.step(top.state, frame.arc.label);
+          if (frame.state != -1) {
+            break;
+          }
+        }
+        //if (TEST) System.out.println(" loadNext frame="+frame);
+        if (frame.state == -1) {
+          return null;
+        }
+        return frame;
+      }
+
+      /** Load frame for target arc(node) on fst, so that 
+       *  arc.label >= label and !fsa.reject(arc.label) */
+      Frame loadCeilFrame(int label, Frame top, Frame frame)  {
+        FST.Arc<Long> arc = frame.arc;
+        arc = Util.readCeilArc(label, fst, top.arc, arc, fstReader);
+        if (arc == null) {
+          return null;
+        }
+        frame.state = fsa.step(top.state, arc.label);
+        //if (TEST) System.out.println(" loadCeil frame="+frame);
+        if (frame.state == -1) {
+          return loadNextFrame(top, frame);
+        }
+        return frame;
+      }
+
+      boolean isAccept(Frame frame) {  // reach a term both fst&fsa accepts
+        return fsa.isAccept(frame.state) && frame.arc.isFinal();
+      }
+      boolean isValid(Frame frame) {   // reach a prefix both fst&fsa won't reject
+        return /*frame != null &&*/ frame.state != -1;
+      }
+      boolean canGrow(Frame frame) {   // can walk forward on both fst&fsa
+        return frame.state != -1 && FST.targetHasArcs(frame.arc);
+      }
+      boolean canRewind(Frame frame) { // can jump to sibling
+        return !frame.arc.isLast();
+      }
+
+      void pushFrame(Frame frame) {
+        final FST.Arc<Long> arc = frame.arc;
+        arc.output = fstOutputs.add(topFrame().arc.output, arc.output);
+        term = grow(arc.label);
+        level++;
+        assert frame == stack[level];
+      }
+
+      Frame popFrame() {
+        term = shrink();
+        return stack[level--];
+      }
+
+      Frame newFrame() {
+        if (level+1 == stack.length) {
+          final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+          System.arraycopy(stack, 0, temp, 0, stack.length);
+          for (int i = stack.length; i < temp.length; i++) {
+            temp[i] = new Frame();
+          }
+          stack = temp;
+        }
+        return stack[level+1];
+      }
+
+      Frame topFrame() {
+        return stack[level];
+      }
+
+      BytesRef grow(int label) {
+        if (term == null) {
+          term = new BytesRef(new byte[16], 0, 0);
+        } else {
+          if (term.length == term.bytes.length) {
+            term.grow(term.length+1);
+          }
+          term.bytes[term.length++] = (byte)label;
+        }
+        return term;
+      }
+
+      BytesRef shrink() {
+        if (term.length == 0) {
+          term = null;
+        } else {
+          term.length--;
+        }
+        return term;
+      }
+    }
+  }
+
+  static<T> void walk(FST<T> fst)  {
+    final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
+    final BitSet seen = new BitSet();
+    final FST.BytesReader reader = fst.getBytesReader();
+    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
+    queue.add(startArc);
+    while (!queue.isEmpty()) {
+      final FST.Arc<T> arc = queue.remove(0);
+      final long node = arc.target;
+      //System.out.println(arc);
+      if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
+        seen.set((int) node);
+        fst.readFirstRealTargetArc(node, arc, reader);
+        while (true) {
+          queue.add(new FST.Arc<T>().copyFrom(arc));
+          if (arc.isLast()) {
+            break;
+          } else {
+            fst.readNextRealArc(arc, reader);
+          }
+        }
+      }
+    }
+  }
+  
+  @Override
+  public long ramBytesUsed() {
+    long ramBytesUsed = 0;
+    for (TermsReader r : fields.values()) {
+      if (r.index != null) {
+        ramBytesUsed += r.index.sizeInBytes();
+        ramBytesUsed += RamUsageEstimator.sizeOf(r.metaBytesBlock);
+        ramBytesUsed += RamUsageEstimator.sizeOf(r.metaLongsBlock);
+        ramBytesUsed += RamUsageEstimator.sizeOf(r.skipInfo);
+        ramBytesUsed += RamUsageEstimator.sizeOf(r.statsBlock);
+      }
+    }
+    return ramBytesUsed;
+  }
+  
+  @Override
+  public void checkIntegrity()  {
+    postingsReader.checkIntegrity();
+  }
+}

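For reference, the seek path in FSTOrdTermsReader above never walks the term blocks themselves: decodeStats() and decodeMetaData() reduce the term's ord to a buffered block ordinal plus an offset inside that block, and refill the block buffers only when the block ordinal changes. A standalone sketch of that arithmetic (illustrative only; the class name and main() below are not part of the patch, and the interval mirrors FSTOrdTermsWriter.SKIP_INTERVAL):

    // Illustrative only: how an ord maps to a stats/metadata block and an in-block offset.
    final class OrdBlockMathSketch {
      static final int SKIP_INTERVAL = 8; // mirrors FSTOrdTermsWriter.SKIP_INTERVAL

      // Block ordinal: when this changes, refillStats()/refillMetadata() reload the buffers.
      static int blockOrd(long ord) {
        return (int) (ord / SKIP_INTERVAL);
      }

      // Offset within the buffered block: indexes docFreq[], totalTermFreq[], longs[], bytesStart[].
      static int upto(long ord) {
        return (int) (ord % SKIP_INTERVAL);
      }

      public static void main(String[] args) {
        long ord = 19;
        System.out.println("ord=" + ord + " -> block=" + blockOrd(ord) + ", upto=" + upto(ord)); // block=2, upto=3
      }
    }
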
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs
new file mode 100644
index 0000000..33997fd
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs
@@ -0,0 +1,374 @@
+package org.apache.lucene.codecs.memory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Comparator;
+
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.PositiveIntOutputs;
+import org.apache.lucene.util.fst.Util;
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.TermsConsumer;
+import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.CodecUtil;
+
+/** 
+ * FST-based term dict, using ord as FST output.
+ *
+ * The FST holds the mapping between &lt;term, ord&gt;, and 
+ * each term's metadata is delta-encoded into a single byte block.
+ *
+ * Typically the byte block consists of four parts:
+ * 1. term statistics: docFreq, totalTermFreq;
+ * 2. monotonic long[], e.g. the pointer to the postings list for that term;
+ * 3. generic byte[], e.g. other information customized by postings base;
+ * 4. single-level skip list to speed up metadata decoding by ord.
+ *
+ * <p>
+ * Files:
+ * <ul>
+ *  <li><tt>.tix</tt>: <a href="#Termindex">Term Index</a></li>
+ *  <li><tt>.tbk</tt>: <a href="#Termblock">Term Block</a></li>
+ * </ul>
+ * </p>
+ *
+ * <a name="Termindex" id="Termindex"></a>
+ * <h3>Term Index</h3>
+ * <p>
+ *  The .tix contains a list of FSTs, one for each field.
+ *  The FST maps a term to its corresponding order in the current field.
+ * </p>
+ * 
+ * <ul>
+ *  <li>TermIndex(.tix) --&gt; Header, TermFST<sup>NumFields</sup>, Footer</li>
+ *  <li>TermFST --&gt; {@link FST FST&lt;long&gt;}</li>
+ *  <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *  <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ *
+ * <p>Notes:</p>
+ * <ul>
+ *  <li>
+ *  Since terms are already sorted before writing to <a href="#Termblock">Term Block</a>, 
+ *  their ords can be used directly to seek term metadata in the term block.
+ *  </li>
+ * </ul>
+ *
+ * <a name="Termblock" id="Termblock"></a>
+ * <h3>Term Block</h3>
+ * <p>
+ *  The .tbk contains all the statistics and metadata for terms, along with field summary (e.g. 
+ *  per-field data such as the number of documents in the current field). For each field, there are four blocks:
+ *  <ul>
+ *   <li>statistics bytes block: contains term statistics; </li>
+ *   <li>metadata longs block: delta-encodes monotonic part of metadata; </li>
+ *   <li>metadata bytes block: encodes other parts of metadata; </li>
+ *   <li>skip block: contains skip data, to speed up metadata seeking and decoding</li>
+ *  </ul>
+ * </p>
+ *
+ * <p>File Format:</p>
+ * <ul>
+ *  <li>TermBlock(.tbk) --&gt; Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
+ *  <li>FieldSummary --&gt; NumFields, &lt;FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
+ *                                         DocCount, LongsSize, DataBlock &gt; <sup>NumFields</sup>, Footer</li>
+ *
+ *  <li>DataBlock --&gt; StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength, 
+ *                       SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock </li>
+ *  <li>SkipBlock --&gt; &lt; StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, 
+ *                            MetaLongsSkipDelta<sup>LongsSize</sup> &gt;<sup>NumTerms</sup>
+ *  <li>StatsBlock --&gt; &lt; DocFreq[Same?], (TotalTermFreq-DocFreq) ? &gt; <sup>NumTerms</sup>
+ *  <li>MetaLongsBlock --&gt; &lt; LongDelta<sup>LongsSize</sup>, BytesSize &gt; <sup>NumTerms</sup>
+ *  <li>MetaBytesBlock --&gt; Byte <sup>MetaBytesBlockLength</sup>
+ *  <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ *  <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
+ *  <li>NumFields, FieldNumber, DocCount, DocFreq, LongsSize, 
+ *        FieldNumber, DocCount --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *  <li>NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
+ *        StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, MetaLongsSkipStart, TotalTermFreq, 
+ *        LongDelta,--&gt; {@link DataOutput#writeVLong VLong}</li>
+ *  <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Notes: </p>
+ * <ul>
+ *  <li>
+ *   The formats of PostingsHeader and MetaBytes are customized by the specific postings implementation:
+ *   they contain arbitrary per-file data (such as parameters or versioning information) and per-term data
+ *   (non-monotonic values such as pulsed postings data).
+ *  </li>
+ *  <li>
+ *   During initialization the reader loads all the blocks into memory. SkipBlock is decoded, so that during a seek
+ *   the term dict can look up file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc. are the file offsets
+ *   for every SkipInterval'th term. MetaLongsSkipDelta is the difference from the previous one, which gives
+ *   the value of the preceding metadata longs for every SkipInterval'th term.
+ *  </li>
+ *  <li>
+ *   DocFreq is the count of documents which contain the term. TotalTermFreq is the total number of occurrences of the term. 
+ *   Usually these two values are the same for long-tail terms, therefore one bit is stolen from DocFreq to flag this case,
+ *   so that the encoding of TotalTermFreq may be omitted.
+ *  </li>
+ * </ul>
+ *
+ * @lucene.experimental 
+ */
+
+public class FSTOrdTermsWriter extends FieldsConsumer {
+  static final String TERMS_INDEX_EXTENSION = "tix";
+  static final String TERMS_BLOCK_EXTENSION = "tbk";
+  static final String TERMS_CODEC_NAME = "FST_ORD_TERMS_DICT";
+  public static final int TERMS_VERSION_START = 0;
+  public static final int TERMS_VERSION_CHECKSUM = 1;
+  public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_CHECKSUM;
+  public static final int SKIP_INTERVAL = 8;
+  
+  final PostingsWriterBase postingsWriter;
+  final FieldInfos fieldInfos;
+  final List<FieldMetaData> fields = new ArrayList<>();
+  IndexOutput blockOut = null;
+  IndexOutput indexOut = null;
+
+  public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter)  {
+    final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
+    final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);
+
+    this.postingsWriter = postingsWriter;
+    this.fieldInfos = state.fieldInfos;
+
+    boolean success = false;
+    try {
+      this.indexOut = state.directory.createOutput(termsIndexFileName, state.context);
+      this.blockOut = state.directory.createOutput(termsBlockFileName, state.context);
+      writeHeader(indexOut);
+      writeHeader(blockOut);
+      this.postingsWriter.init(blockOut); 
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(indexOut, blockOut);
+      }
+    }
+  }
+
+  @Override
+  public TermsConsumer addField(FieldInfo field)  {
+    return new TermsWriter(field);
+  }
+
+  @Override
+  public void close()  {
+    if (blockOut != null) {
+      IOException ioe = null;
+      try {
+        final long blockDirStart = blockOut.getFilePointer();
+        
+        // write field summary
+        blockOut.writeVInt(fields.size());
+        for (FieldMetaData field : fields) {
+          blockOut.writeVInt(field.fieldInfo.number);
+          blockOut.writeVLong(field.numTerms);
+          if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) {
+            blockOut.writeVLong(field.sumTotalTermFreq);
+          }
+          blockOut.writeVLong(field.sumDocFreq);
+          blockOut.writeVInt(field.docCount);
+          blockOut.writeVInt(field.longsSize);
+          blockOut.writeVLong(field.statsOut.getFilePointer());
+          blockOut.writeVLong(field.metaLongsOut.getFilePointer());
+          blockOut.writeVLong(field.metaBytesOut.getFilePointer());
+          
+          field.skipOut.writeTo(blockOut);
+          field.statsOut.writeTo(blockOut);
+          field.metaLongsOut.writeTo(blockOut);
+          field.metaBytesOut.writeTo(blockOut);
+          field.dict.save(indexOut);
+        }
+        writeTrailer(blockOut, blockDirStart);
+        CodecUtil.writeFooter(indexOut);
+        CodecUtil.writeFooter(blockOut);
+      } catch (IOException ioe2) {
+        ioe = ioe2;
+      } finally {
+        IOUtils.closeWhileHandlingException(ioe, blockOut, indexOut, postingsWriter);
+        blockOut = null;
+      }
+    }
+  }
+
+  private void writeHeader(IndexOutput out)  {
+    CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);   
+  }
+  private void writeTrailer(IndexOutput out, long dirStart)  {
+    out.writeLong(dirStart);
+  }
+
+  private static class FieldMetaData {
+    public FieldInfo fieldInfo;
+    public long numTerms;
+    public long sumTotalTermFreq;
+    public long sumDocFreq;
+    public int docCount;
+    public int longsSize;
+    public FST<Long> dict;
+
+    // TODO: block encode each part 
+
+    // vint encode next skip point (fully decoded when reading)
+    public RAMOutputStream skipOut;
+    // vint encode df, (ttf-df)
+    public RAMOutputStream statsOut;
+    // vint encode monotonic long[] and length for corresponding byte[]
+    public RAMOutputStream metaLongsOut;
+    // generic byte[]
+    public RAMOutputStream metaBytesOut;
+  }
+
+  final class TermsWriter extends TermsConsumer {
+    private final Builder<Long> builder;
+    private final PositiveIntOutputs outputs;
+    private final FieldInfo fieldInfo;
+    private final int longsSize;
+    private long numTerms;
+
+    private final IntsRef scratchTerm = new IntsRef();
+    private final RAMOutputStream statsOut = new RAMOutputStream();
+    private final RAMOutputStream metaLongsOut = new RAMOutputStream();
+    private final RAMOutputStream metaBytesOut = new RAMOutputStream();
+
+    private final RAMOutputStream skipOut = new RAMOutputStream();
+    private long lastBlockStatsFP;
+    private long lastBlockMetaLongsFP;
+    private long lastBlockMetaBytesFP;
+    private long[] lastBlockLongs;
+
+    private long[] lastLongs;
+    private long lastMetaBytesFP;
+
+    TermsWriter(FieldInfo fieldInfo) {
+      this.numTerms = 0;
+      this.fieldInfo = fieldInfo;
+      this.longsSize = postingsWriter.setField(fieldInfo);
+      this.outputs = PositiveIntOutputs.getSingleton();
+      this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
+
+      this.lastBlockStatsFP = 0;
+      this.lastBlockMetaLongsFP = 0;
+      this.lastBlockMetaBytesFP = 0;
+      this.lastBlockLongs = new long[longsSize];
+
+      this.lastLongs = new long[longsSize];
+      this.lastMetaBytesFP = 0;
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() {
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
+    }
+
+    @Override
+    public PostingsConsumer startTerm(BytesRef text)  {
+      postingsWriter.startTerm();
+      return postingsWriter;
+    }
+
+    @Override
+    public void finishTerm(BytesRef text, TermStats stats)  {
+      if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) {
+        bufferSkip();
+      }
+      // write term meta data into fst
+      final long longs[] = new long[longsSize];
+      final long delta = stats.totalTermFreq - stats.docFreq;
+      if (stats.totalTermFreq > 0) {
+        if (delta == 0) {
+          statsOut.writeVInt(stats.docFreq<<1|1);
+        } else {
+          statsOut.writeVInt(stats.docFreq<<1|0);
+          statsOut.writeVLong(stats.totalTermFreq-stats.docFreq);
+        }
+      } else {
+        statsOut.writeVInt(stats.docFreq);
+      }
+      BlockTermState state = postingsWriter.newTermState();
+      state.docFreq = stats.docFreq;
+      state.totalTermFreq = stats.totalTermFreq;
+      postingsWriter.finishTerm(state);
+      postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
+      for (int i = 0; i < longsSize; i++) {
+        metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
+        lastLongs[i] = longs[i];
+      }
+      metaLongsOut.writeVLong(metaBytesOut.getFilePointer() - lastMetaBytesFP);
+
+      builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
+      numTerms++;
+
+      lastMetaBytesFP = metaBytesOut.getFilePointer();
+    }
+
+    @Override
+    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)  {
+      if (numTerms > 0) {
+        final FieldMetaData metadata = new FieldMetaData();
+        metadata.fieldInfo = fieldInfo;
+        metadata.numTerms = numTerms;
+        metadata.sumTotalTermFreq = sumTotalTermFreq;
+        metadata.sumDocFreq = sumDocFreq;
+        metadata.docCount = docCount;
+        metadata.longsSize = longsSize;
+        metadata.skipOut = skipOut;
+        metadata.statsOut = statsOut;
+        metadata.metaLongsOut = metaLongsOut;
+        metadata.metaBytesOut = metaBytesOut;
+        metadata.dict = builder.finish();
+        fields.add(metadata);
+      }
+    }
+
+    private void bufferSkip()  {
+      skipOut.writeVLong(statsOut.getFilePointer() - lastBlockStatsFP);
+      skipOut.writeVLong(metaLongsOut.getFilePointer() - lastBlockMetaLongsFP);
+      skipOut.writeVLong(metaBytesOut.getFilePointer() - lastBlockMetaBytesFP);
+      for (int i = 0; i < longsSize; i++) {
+        skipOut.writeVLong(lastLongs[i] - lastBlockLongs[i]);
+      }
+      lastBlockStatsFP = statsOut.getFilePointer();
+      lastBlockMetaLongsFP = metaLongsOut.getFilePointer();
+      lastBlockMetaBytesFP = metaBytesOut.getFilePointer();
+      System.arraycopy(lastLongs, 0, lastBlockLongs, 0, longsSize);
+    }
+  }
+}

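The StatsBlock encoding described in the notes above is small enough to show end to end: finishTerm() steals the low bit of DocFreq to flag the common case TotalTermFreq == DocFreq, and refillStats() in the reader reverses it. A standalone sketch of that codec (illustrative only; the class and method names are not part of the patch, and only DataOutput/DataInput from the patch's own imports are assumed):

    import java.io.IOException;
    import org.apache.lucene.store.DataInput;
    import org.apache.lucene.store.DataOutput;

    // Illustrative only: per-term stats encoding used by the stats block.
    final class StatsCodecSketch {
      static void write(DataOutput out, int docFreq, long totalTermFreq) throws IOException {
        if (totalTermFreq > 0) {                  // freqs are indexed
          if (totalTermFreq == docFreq) {
            out.writeVInt(docFreq << 1 | 1);      // low bit set: ttf == df, nothing more written
          } else {
            out.writeVInt(docFreq << 1);          // low bit clear: the delta follows
            out.writeVLong(totalTermFreq - docFreq);
          }
        } else {
          out.writeVInt(docFreq);                 // freqs not indexed: raw docFreq only
        }
      }

      // Returns {docFreq, totalTermFreq}; totalTermFreq is -1 when freqs are not indexed.
      static long[] read(DataInput in, boolean hasFreqs) throws IOException {
        int code = in.readVInt();
        if (!hasFreqs) {
          return new long[] { code, -1 };
        }
        long docFreq = code >>> 1;
        long totalTermFreq = ((code & 1) == 1) ? docFreq : docFreq + in.readVLong();
        return new long[] { docFreq, totalTermFreq };
      }
    }
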
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs b/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs
new file mode 100644
index 0000000..5a54abd
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs
@@ -0,0 +1,83 @@
+package org.apache.lucene.codecs.memory;
+
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * FST term dict + Lucene41PBF
+ */
+
+public final class FSTPostingsFormat extends PostingsFormat {
+  public FSTPostingsFormat() {
+    super("FST41");
+  }
+
+  @Override
+  public String toString() {
+    return getName();
+  }
+
+  @Override
+  public FieldsConsumer fieldsConsumer(SegmentWriteState state)  {
+    PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
+
+    boolean success = false;
+    try {
+      FieldsConsumer ret = new FSTTermsWriter(state, postingsWriter);
+      success = true;
+      return ret;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(postingsWriter);
+      }
+    }
+  }
+
+  @Override
+  public FieldsProducer fieldsProducer(SegmentReadState state)  {
+    PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory,
+                                                                state.fieldInfos,
+                                                                state.segmentInfo,
+                                                                state.context,
+                                                                state.segmentSuffix);
+    boolean success = false;
+    try {
+      FieldsProducer ret = new FSTTermsReader(state, postingsReader);
+      success = true;
+      return ret;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(postingsReader);
+      }
+    }
+  }
+}

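FSTPostingsFormat registers itself under the name "FST41"; wiring it into an index goes through the normal codec hooks rather than anything in this patch. A usage sketch, assuming the stock Lucene 4.x classes Lucene46Codec, IndexWriterConfig, StandardAnalyzer and Version (none of which are touched by this patch; the sketch class name is illustrative):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.codecs.PostingsFormat;
    import org.apache.lucene.codecs.lucene46.Lucene46Codec;
    import org.apache.lucene.codecs.memory.FSTPostingsFormat;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.util.Version;

    // Illustrative only: route every field's postings through the FST-based format.
    final class FstCodecUsageSketch {
      static IndexWriterConfig config() {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48,
            new StandardAnalyzer(Version.LUCENE_48));
        config.setCodec(new Lucene46Codec() {
          @Override
          public PostingsFormat getPostingsFormatForField(String field) {
            return new FSTPostingsFormat();  // the format defined above, name "FST41"
          }
        });
        return config;
      }
    }
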
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Memory/FSTPulsing41PostingsFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTPulsing41PostingsFormat.cs b/src/Lucene.Net.Codecs/Memory/FSTPulsing41PostingsFormat.cs
new file mode 100644
index 0000000..74c4296
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Memory/FSTPulsing41PostingsFormat.cs
@@ -0,0 +1,92 @@
+package org.apache.lucene.codecs.memory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsBaseFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.codecs.PostingsWriterBase;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsBaseFormat;
+import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat;
+import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
+import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
+
+/** FST + Pulsing41, test only, since
+ *  FST does no delta encoding here!
+ *  @lucene.experimental */
+
+public class FSTPulsing41PostingsFormat extends PostingsFormat {
+  private final PostingsBaseFormat wrappedPostingsBaseFormat;
+  private final int freqCutoff;
+
+  public FSTPulsing41PostingsFormat() {
+    this(1);
+  }
+  
+  public FSTPulsing41PostingsFormat(int freqCutoff) {
+    super("FSTPulsing41");
+    this.wrappedPostingsBaseFormat = new Lucene41PostingsBaseFormat();
+    this.freqCutoff = freqCutoff;
+  }
+
+  @Override
+  public FieldsConsumer fieldsConsumer(SegmentWriteState state)  {
+    PostingsWriterBase docsWriter = null;
+    PostingsWriterBase pulsingWriter = null;
+
+    boolean success = false;
+    try {
+      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+      pulsingWriter = new PulsingPostingsWriter(state, freqCutoff, docsWriter);
+      FieldsConsumer ret = new FSTTermsWriter(state, pulsingWriter);
+      success = true;
+      return ret;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
+      }
+    }
+  }
+
+  @Override
+  public FieldsProducer fieldsProducer(SegmentReadState state)  {
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReader = null;
+    boolean success = false;
+    try {
+      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
+      pulsingReader = new PulsingPostingsReader(state, docsReader);
+      FieldsProducer ret = new FSTTermsReader(state, pulsingReader);
+      success = true;
+      return ret;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
+      }
+    }
+  }
+}

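The only knob FSTPulsing41PostingsFormat exposes is freqCutoff, which is passed straight to PulsingPostingsWriter: postings of sufficiently rare terms are inlined into the term metadata rather than written to the postings files. A short illustrative snippet (the cutoff semantics follow the Pulsing postings design and are assumed here, not spelled out in this patch; the sketch class name is illustrative):

    import org.apache.lucene.codecs.PostingsFormat;
    import org.apache.lucene.codecs.memory.FSTPulsing41PostingsFormat;

    // Illustrative only: a larger cutoff inlines postings for more of the rare terms.
    final class PulsingCutoffSketch {
      static final PostingsFormat DEFAULT = new FSTPulsing41PostingsFormat();   // default cutoff = 1
      static final PostingsFormat WIDER   = new FSTPulsing41PostingsFormat(3);  // assumed: terms this rare are inlined
    }
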
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Memory/FSTTermOutputs.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTTermOutputs.cs b/src/Lucene.Net.Codecs/Memory/FSTTermOutputs.cs
new file mode 100644
index 0000000..e0a91e8
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Memory/FSTTermOutputs.cs
@@ -0,0 +1,331 @@
+package org.apache.lucene.codecs.memory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.LongsRef;
+
+/**
+ * An FST {@link Outputs} implementation for 
+ * {@link FSTTermsWriter}.
+ *
+ * @lucene.experimental
+ */
+
+// NOTE: outputs should be per-field, since
+// longsSize is fixed for each field
+class FSTTermOutputs extends Outputs<FSTTermOutputs.TermData> {
+  private final static TermData NO_OUTPUT = new TermData();
+  //private static bool TEST = false;
+  private final boolean hasPos;
+  private final int longsSize;
+
+  /** 
+   * Represents the metadata for one term.
+   * On an FST, only the long[] part is 'shared' and pushed towards the root.
+   * byte[] and term stats will be kept on deeper arcs.
+   */
+  static class TermData {
+    long[] longs;
+    byte[] bytes;
+    int docFreq;
+    long totalTermFreq;
+    TermData() {
+      this.longs = null;
+      this.bytes = null;
+      this.docFreq = 0;
+      this.totalTermFreq = -1;
+    }
+    TermData(long[] longs, byte[] bytes, int docFreq, long totalTermFreq) {
+      this.longs = longs;
+      this.bytes = bytes;
+      this.docFreq = docFreq;
+      this.totalTermFreq = totalTermFreq;
+    }
+
+    // NOTE: actually, FST nodes are seldom 
+    // identical when outputs on their arcs 
+    // aren't NO_OUTPUTs.
+    @Override
+    public int hashCode() {
+      int hash = 0;
+      if (longs != null) {
+        final int end = longs.length;
+        for (int i = 0; i < end; i++) {
+          hash -= longs[i];
+        }
+      }
+      if (bytes != null) {
+        hash = -hash;
+        final int end = bytes.length;
+        for (int i = 0; i < end; i++) {
+          hash += bytes[i];
+        }
+      }
+      hash += docFreq + totalTermFreq;
+      return hash;
+    }
+
+    @Override
+    public boolean equals(Object other_) {
+      if (other_ == this) {
+        return true;
+      } else if (!(other_ instanceof FSTTermOutputs.TermData)) {
+        return false;
+      }
+      TermData other = (TermData) other_;
+      return statsEqual(this, other) && 
+             longsEqual(this, other) && 
+             bytesEqual(this, other);
+
+    }
+  }
+  
+  protected FSTTermOutputs(FieldInfo fieldInfo, int longsSize) {
+    this.hasPos = (fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY);
+    this.longsSize = longsSize;
+  }
+
+  @Override
+  //
+  // The return value will be the smaller one, when these two are 
+  // 'comparable', i.e. 
+  // 1. every value in t1 is not larger than in t2, or
+  // 2. every value in t1 is not smaller than in t2.
+  //
+  public TermData common(TermData t1, TermData t2) {
+    //if (TEST) System.out.print("common("+t1+", "+t2+") = ");
+    if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) {
+      //if (TEST) System.out.println("ret:"+NO_OUTPUT);
+      return NO_OUTPUT;
+    }
+    assert t1.longs.length == t2.longs.length;
+
+    long[] min = t1.longs, max = t2.longs;
+    int pos = 0;
+    TermData ret;
+
+    while (pos < longsSize && min[pos] == max[pos]) {
+      pos++;
+    }
+    if (pos < longsSize) {  // unequal long[]
+      if (min[pos] > max[pos]) {
+        min = t2.longs;
+        max = t1.longs;
+      }
+      // check whether strictly smaller
+      while (pos < longsSize && min[pos] <= max[pos]) {
+        pos++;
+      }
+      if (pos < longsSize || allZero(min)) {  // not comparable or all-zero
+        ret = NO_OUTPUT;
+      } else {
+        ret = new TermData(min, null, 0, -1);
+      }
+    } else {  // equal long[]
+      if (statsEqual(t1, t2) && bytesEqual(t1, t2)) {
+        ret = t1;
+      } else if (allZero(min)) {
+        ret = NO_OUTPUT;
+      } else {
+        ret = new TermData(min, null, 0, -1);
+      }
+    }
+    //if (TEST) System.out.println("ret:"+ret);
+    return ret;
+  }
+
+  @Override
+  public TermData subtract(TermData t1, TermData t2) {
+    //if (TEST) System.out.print("subtract("+t1+", "+t2+") = ");
+    if (t2 == NO_OUTPUT) {
+      //if (TEST) System.out.println("ret:"+t1);
+      return t1;
+    }
+    Debug.Assert(t1.longs.length == t2.longs.length);
+
+    int pos = 0;
+    long diff = 0;
+    long[] share = new long[longsSize];
+
+    while (pos < longsSize) {
+      share[pos] = t1.longs[pos] - t2.longs[pos];
+      diff += share[pos];
+      pos++;
+    }
+
+    TermData ret;
+    if (diff == 0 && statsEqual(t1, t2) && bytesEqual(t1, t2)) {
+      ret = NO_OUTPUT;
+    } else {
+      ret = new TermData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
+    }
+    //if (TEST) System.out.println("ret:"+ret);
+    return ret;
+  }
+
+  // TODO: if we refactor to an 'addSelf(TermData other)' method,
+  // we can gain about 5~7% for fuzzy queries; however, this also
+  // means we are putting too much stress on FST Outputs decoding?
+  @Override
+  public TermData add(TermData t1, TermData t2) {
+    //if (TEST) System.out.print("add("+t1+", "+t2+") = ");
+    if (t1 == NO_OUTPUT) {
+      //if (TEST) System.out.println("ret:"+t2);
+      return t2;
+    } else if (t2 == NO_OUTPUT) {
+      //if (TEST) System.out.println("ret:"+t1);
+      return t1;
+    }
+    Debug.Assert(t1.longs.length == t2.longs.length);
+
+    int pos = 0;
+    long[] accum = new long[longsSize];
+
+    while (pos < longsSize) {
+      accum[pos] = t1.longs[pos] + t2.longs[pos];
+      pos++;
+    }
+
+    TermData ret;
+    if (t2.bytes != null || t2.docFreq > 0) {
+      ret = new TermData(accum, t2.bytes, t2.docFreq, t2.totalTermFreq);
+    } else {
+      ret = new TermData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
+    }
+    //if (TEST) System.out.println("ret:"+ret);
+    return ret;
+  }
+
+  @Override
+  public void write(TermData data, DataOutput out)  {
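+    // Flag byte layout: bit 0 = longs are not all zero, bit 1 = metadata bytes
+    // are present, bit 2 = term stats are present; bits 3..7 carry bytes.length
+    // when it is below 32 (otherwise the length follows as a VInt).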
+    int bit0 = allZero(data.longs) ? 0 : 1;
+    int bit1 = ((data.bytes == null || data.bytes.length == 0) ? 0 : 1) << 1;
+    int bit2 = ((data.docFreq == 0)  ? 0 : 1) << 2;
+    int bits = bit0 | bit1 | bit2;
+    if (bit1 > 0) {  // determine extra length
+      if (data.bytes.length < 32) {
+        bits |= (data.bytes.length << 3);
+        out.writeByte((byte)bits);
+      } else {
+        out.writeByte((byte)bits);
+        out.writeVInt(data.bytes.length);
+      }
+    } else {
+      out.writeByte((byte)bits);
+    }
+    if (bit0 > 0) {  // not all-zero case
+      for (int pos = 0; pos < longsSize; pos++) {
+        out.writeVLong(data.longs[pos]);
+      }
+    }
+    if (bit1 > 0) {  // bytes exists
+      out.writeBytes(data.bytes, 0, data.bytes.length);
+    }
+    if (bit2 > 0) {  // stats exist
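+      // With positions indexed, the low bit of the VInt flags the common case
+      // docFreq == totalTermFreq so only one value is needed; otherwise the
+      // delta (totalTermFreq - docFreq) follows as a VLong.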
+      if (hasPos) {
+        if (data.docFreq == data.totalTermFreq) {
+          out.writeVInt((data.docFreq << 1) | 1);
+        } else {
+          out.writeVInt((data.docFreq << 1));
+          out.writeVLong(data.totalTermFreq - data.docFreq);
+        }
+      } else {
+        out.writeVInt(data.docFreq);
+      }
+    }
+  }
+
+  @Override
+  public TermData read(DataInput in)  {
+    long[] longs = new long[longsSize];
+    byte[] bytes = null;
+    int docFreq = 0;
+    long totalTermFreq = -1;
+    int bits = in.readByte() & 0xff;
+    int bit0 = bits & 1;
+    int bit1 = bits & 2;
+    int bit2 = bits & 4;
+    int bytesSize = (bits >>> 3);
+    if (bit1 > 0 && bytesSize == 0) {  // determine extra length
+      bytesSize = in.readVInt();
+    }
+    if (bit0 > 0) {  // not all-zero case
+      for (int pos = 0; pos < longsSize; pos++) {
+        longs[pos] = in.readVLong();
+      }
+    }
+    if (bit1 > 0) {  // bytes exists
+      bytes = new byte[bytesSize];
+      in.readBytes(bytes, 0, bytesSize);
+    }
+    if (bit2 > 0) {  // stats exist
+      int code = in.readVInt();
+      if (hasPos) {
+        totalTermFreq = docFreq = code >>> 1;
+        if ((code & 1) == 0) {
+          totalTermFreq += in.readVLong();
+        }
+      } else {
+        docFreq = code;
+      }
+    }
+    return new TermData(longs, bytes, docFreq, totalTermFreq);
+  }
+
+  @Override
+  public TermData getNoOutput() {
+    return NO_OUTPUT;
+  }
+
+  @Override
+  public String outputToString(TermData data) {
+    return data.toString();
+  }
+
+  static bool statsEqual(final TermData t1, final TermData t2) {
+    return t1.docFreq == t2.docFreq && t1.totalTermFreq == t2.totalTermFreq;
+  }
+  static bool bytesEqual(final TermData t1, final TermData t2) {
+    if (t1.bytes == null && t2.bytes == null) {
+      return true;
+    }
+    return t1.bytes != null && t2.bytes != null && Arrays.equals(t1.bytes, t2.bytes);
+  }
+  static bool longsEqual(final TermData t1, final TermData t2) {
+    if (t1.longs == null && t2.longs == null) {
+      return true;
+    }
+    return t1.longs != null && t2.longs != null && Arrays.equals(t1.longs, t2.longs);
+  }
+  static bool allZero(final long[] l) {
+    for (int i = 0; i < l.length; i++) {
+      if (l[i] != 0) {
+        return false;
+      }
+    }
+    return true;
+  }
+}
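
For readers following the port: the single flag byte that FSTTermOutputs.write emits
(and read consumes) packs three presence bits plus, for short metadata payloads, the
byte length. Below is a minimal standalone sketch of that layout, assuming the byte is
read back exactly as written; the FlagByteDemo class and its names are illustrative and
not part of the patch.

// Illustrative sketch (not part of the patch) of the FSTTermOutputs flag-byte layout:
// bit 0 = longs not all zero, bit 1 = metadata bytes present, bit 2 = stats present;
// bits 3..7 carry bytes.length when it is below 32, otherwise a VInt would follow.
public class FlagByteDemo {

  static int encodeFlags(boolean hasLongs, int bytesLength, boolean hasStats) {
    int bits = (hasLongs ? 1 : 0) | (bytesLength > 0 ? 2 : 0) | (hasStats ? 4 : 0);
    if (bytesLength > 0 && bytesLength < 32) {
      bits |= bytesLength << 3;  // short lengths ride inside the flag byte
    }
    return bits & 0xff;
  }

  public static void main(String[] args) {
    int bits = encodeFlags(true, 5, true);
    System.out.println("longs present: " + ((bits & 1) != 0));   // true
    System.out.println("bytes present: " + ((bits & 2) != 0));   // true
    System.out.println("stats present: " + ((bits & 4) != 0));   // true
    System.out.println("inline length: " + (bits >>> 3));        // 5
  }
}

Lengths of 32 or more do not fit in the remaining five flag bits, which is why write()
falls back to an explicit VInt for the byte length in that case.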

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs b/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs
new file mode 100644
index 0000000..e38bb59
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Memory/FSTTermsReader.cs
@@ -0,0 +1,762 @@
+package org.apache.lucene.codecs.memory;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.TreeMap;
+
+import org.apache.lucene.codecs.BlockTermState;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsReaderBase;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo.IndexOptions;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.automaton.ByteRunAutomaton;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.fst.BytesRefFSTEnum.InputOutput;
+import org.apache.lucene.util.fst.BytesRefFSTEnum;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.Outputs;
+import org.apache.lucene.util.fst.Util;
+
+/**
+ * FST-based terms dictionary reader.
+ *
+ * The FST directly maps each term to its metadata,
+ * and is entirely memory resident.
+ *
+ * @lucene.experimental
+ */
+
+public class FSTTermsReader extends FieldsProducer {
+  final TreeMap<String, TermsReader> fields = new TreeMap<>();
+  final PostingsReaderBase postingsReader;
+  //static bool TEST = false;
+  final int version;
+
+  public FSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader)  {
+    final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, FSTTermsWriter.TERMS_EXTENSION);
+
+    this.postingsReader = postingsReader;
+    final IndexInput in = state.directory.openInput(termsFileName, state.context);
+
+    bool success = false;
+    try {
+      version = readHeader(in);
+      if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM) {
+        CodecUtil.checksumEntireFile(in);
+      }
+      this.postingsReader.init(in);
+      seekDir(in);
+
+      final FieldInfos fieldInfos = state.fieldInfos;
+      final int numFields = in.readVInt();
+      for (int i = 0; i < numFields; i++) {
+        int fieldNumber = in.readVInt();
+        FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber);
+        long numTerms = in.readVLong();
+        long sumTotalTermFreq = fieldInfo.getIndexOptions() == IndexOptions.DOCS_ONLY ? -1 : in.readVLong();
+        long sumDocFreq = in.readVLong();
+        int docCount = in.readVInt();
+        int longsSize = in.readVInt();
+        TermsReader current = new TermsReader(fieldInfo, in, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize);
+        TermsReader previous = fields.put(fieldInfo.name, current);
+        checkFieldSummary(state.segmentInfo, in, current, previous);
+      }
+      success = true;
+    } finally {
+      if (success) {
+        IOUtils.close(in);
+      } else {
+        IOUtils.closeWhileHandlingException(in);
+      }
+    }
+  }
+
+  private int readHeader(IndexInput in)  {
+    return CodecUtil.checkHeader(in, FSTTermsWriter.TERMS_CODEC_NAME,
+                                     FSTTermsWriter.TERMS_VERSION_START,
+                                     FSTTermsWriter.TERMS_VERSION_CURRENT);
+  }
+  private void seekDir(IndexInput in)  {
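+    // The directory pointer is a fixed 8-byte long stored just before the codec
+    // footer (or at the very end of the file for pre-checksum versions); read it,
+    // then jump to the per-field summary it points at.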
+    if (version >= FSTTermsWriter.TERMS_VERSION_CHECKSUM) {
+      in.seek(in.length() - CodecUtil.footerLength() - 8);
+    } else {
+      in.seek(in.length() - 8);
+    }
+    in.seek(in.readLong());
+  }
+  private void checkFieldSummary(SegmentInfo info, IndexInput in, TermsReader field, TermsReader previous)  {
+    // #docs with field must be <= #docs
+    if (field.docCount < 0 || field.docCount > info.getDocCount()) {
+      throw new CorruptIndexException("invalid docCount: " + field.docCount + " maxDoc: " + info.getDocCount() + " (resource=" + in + ")");
+    }
+    // #postings must be >= #docs with field
+    if (field.sumDocFreq < field.docCount) {
+      throw new CorruptIndexException("invalid sumDocFreq: " + field.sumDocFreq + " docCount: " + field.docCount + " (resource=" + in + ")");
+    }
+    // #positions must be >= #postings
+    if (field.sumTotalTermFreq != -1 && field.sumTotalTermFreq < field.sumDocFreq) {
+      throw new CorruptIndexException("invalid sumTotalTermFreq: " + field.sumTotalTermFreq + " sumDocFreq: " + field.sumDocFreq + " (resource=" + in + ")");
+    }
+    if (previous != null) {
+      throw new CorruptIndexException("duplicate fields: " + field.fieldInfo.name + " (resource=" + in + ")");
+    }
+  }
+
+  @Override
+  public Iterator<String> iterator() {
+    return Collections.unmodifiableSet(fields.keySet()).iterator();
+  }
+
+  @Override
+  public Terms terms(String field)  {
+    Debug.Assert(field != null);
+    return fields.get(field);
+  }
+
+  @Override
+  public int size() {
+    return fields.size();
+  }
+
+  @Override
+  public void close()  {
+    try {
+      IOUtils.close(postingsReader);
+    } finally {
+      fields.clear();
+    }
+  }
+
+  final class TermsReader extends Terms {
+    final FieldInfo fieldInfo;
+    final long numTerms;
+    final long sumTotalTermFreq;
+    final long sumDocFreq;
+    final int docCount;
+    final int longsSize;
+    final FST<FSTTermOutputs.TermData> dict;
+
+    TermsReader(FieldInfo fieldInfo, IndexInput in, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize)  {
+      this.fieldInfo = fieldInfo;
+      this.numTerms = numTerms;
+      this.sumTotalTermFreq = sumTotalTermFreq;
+      this.sumDocFreq = sumDocFreq;
+      this.docCount = docCount;
+      this.longsSize = longsSize;
+      this.dict = new FST<>(in, new FSTTermOutputs(fieldInfo, longsSize));
+    }
+
+    @Override
+    public Comparator<BytesRef> getComparator() {
+      return BytesRef.getUTF8SortedAsUnicodeComparator();
+    }
+
+    @Override
+    public bool hasFreqs() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
+    }
+
+    @Override
+    public bool hasOffsets() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+    }
+
+    @Override
+    public bool hasPositions() {
+      return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+    }
+
+    @Override
+    public bool hasPayloads() {
+      return fieldInfo.hasPayloads();
+    }
+
+    @Override
+    public long size() {
+      return numTerms;
+    }
+
+    @Override
+    public long getSumTotalTermFreq() {
+      return sumTotalTermFreq;
+    }
+
+    @Override
+    public long getSumDocFreq()  {
+      return sumDocFreq;
+    }
+
+    @Override
+    public int getDocCount()  {
+      return docCount;
+    }
+
+    @Override
+    public TermsEnum iterator(TermsEnum reuse)  {
+      return new SegmentTermsEnum();
+    }
+
+    @Override
+    public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm)  {
+      return new IntersectTermsEnum(compiled, startTerm);
+    }
+
+    // Only wraps common operations for PBF interaction
+    abstract class BaseTermsEnum extends TermsEnum {
+      /* Current term, null when enum ends or unpositioned */
+      BytesRef term;
+
+      /* Current term stats + decoded metadata (customized by PBF) */
+      final BlockTermState state;
+
+      /* Current term stats + undecoded metadata (long[] & byte[]) */
+      FSTTermOutputs.TermData meta;
+      ByteArrayDataInput bytesReader;
+
+      /** Decodes metadata into customized term state */
+      abstract void decodeMetaData() ;
+
+      BaseTermsEnum()  {
+        this.state = postingsReader.newTermState();
+        this.bytesReader = new ByteArrayDataInput();
+        this.term = null;
+        // NOTE: metadata will only be initialized in child class
+      }
+
+      @Override
+      public TermState termState()  {
+        decodeMetaData();
+        return state.clone();
+      }
+
+      @Override
+      public BytesRef term() {
+        return term;
+      }
+
+      @Override
+      public int docFreq()  {
+        return state.docFreq;
+      }
+
+      @Override
+      public long totalTermFreq()  {
+        return state.totalTermFreq;
+      }
+
+      @Override
+      public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)  {
+        decodeMetaData();
+        return postingsReader.docs(fieldInfo, state, liveDocs, reuse, flags);
+      }
+
+      @Override
+      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags)  {
+        if (!hasPositions()) {
+          return null;
+        }
+        decodeMetaData();
+        return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
+      }
+
+      @Override
+      public void seekExact(long ord)  {
+        throw new UnsupportedOperationException();
+      }
+
+      @Override
+      public long ord() {
+        throw new UnsupportedOperationException();
+      }
+    }
+
+
+    // Iterates through all terms in this field
+    private final class SegmentTermsEnum extends BaseTermsEnum {
+      final BytesRefFSTEnum<FSTTermOutputs.TermData> fstEnum;
+
+      /* True when current term's metadata is decoded */
+      bool decoded;
+
+      /* True when current enum is 'positioned' by seekExact(TermState) */
+      bool seekPending;
+
+      SegmentTermsEnum()  {
+        super();
+        this.fstEnum = new BytesRefFSTEnum<>(dict);
+        this.decoded = false;
+        this.seekPending = false;
+        this.meta = null;
+      }
+
+      @Override
+      public Comparator<BytesRef> getComparator() {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      }
+
+      // Let PBF decode metadata from long[] and byte[]
+      @Override
+      void decodeMetaData()  {
+        if (!decoded && !seekPending) {
+          if (meta.bytes != null) {
+            bytesReader.reset(meta.bytes, 0, meta.bytes.length);
+          }
+          postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true);
+          decoded = true;
+        }
+      }
+
+      // Update current enum according to FSTEnum
+      void updateEnum(final InputOutput<FSTTermOutputs.TermData> pair) {
+        if (pair == null) {
+          term = null;
+        } else {
+          term = pair.input;
+          meta = pair.output;
+          state.docFreq = meta.docFreq;
+          state.totalTermFreq = meta.totalTermFreq;
+        }
+        decoded = false;
+        seekPending = false;
+      }
+
+      @Override
+      public BytesRef next()  {
+        if (seekPending) {  // previously positioned, but termOutputs not fetched
+          seekPending = false;
+          SeekStatus status = seekCeil(term);
+          Debug.Assert(status == SeekStatus.FOUND);  // must be positioned on a valid term
+        }
+        updateEnum(fstEnum.next());
+        return term;
+      }
+
+      @Override
+      public bool seekExact(BytesRef target)  {
+        updateEnum(fstEnum.seekExact(target));
+        return term != null;
+      }
+
+      @Override
+      public SeekStatus seekCeil(BytesRef target)  {
+        updateEnum(fstEnum.seekCeil(target));
+        if (term == null) {
+          return SeekStatus.END;
+        } else {
+          return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
+        }
+      }
+
+      @Override
+      public void seekExact(BytesRef target, TermState otherState) {
+        if (!target.equals(term)) {
+          state.copyFrom(otherState);
+          term = BytesRef.deepCopyOf(target);
+          seekPending = true;
+        }
+      }
+    }
+
+    // Iterates over the intersection with an automaton (cannot seek!)
+    private final class IntersectTermsEnum extends BaseTermsEnum {
+      /* True when current term's metadata is decoded */
+      bool decoded;
+
+      /* True when there is a pending term to return from next() */
+      bool pending;
+
+      /* stack recording how the current term is constructed,
+       * used to accumulate metadata or to rewind the term:
+       *   level == term.length + 1,
+       *         == 0 when term is null */
+      Frame[] stack;
+      int level;
+
+      /* the level up to which metadata has been accumulated,
+       * so that metadata can be accumulated lazily */
+      int metaUpto;
+
+      /* term dict fst */
+      final FST<FSTTermOutputs.TermData> fst;
+      final FST.BytesReader fstReader;
+      final Outputs<FSTTermOutputs.TermData> fstOutputs;
+
+      /* query automaton to intersect with */
+      final ByteRunAutomaton fsa;
+
+      private final class Frame {
+        /* fst arc for this frame */
+        FST.Arc<FSTTermOutputs.TermData> fstArc;
+
+        /* automaton state for this frame */
+        int fsaState;
+
+        Frame() {
+          this.fstArc = new FST.Arc<>();
+          this.fsaState = -1;
+        }
+
+        public String toString() {
+          return "arc=" + fstArc + " state=" + fsaState;
+        }
+      }
+
+      IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm)  {
+        super();
+        //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
+        this.fst = dict;
+        this.fstReader = fst.getBytesReader();
+        this.fstOutputs = dict.outputs;
+        this.fsa = compiled.runAutomaton;
+        this.level = -1;
+        this.stack = new Frame[16];
+        for (int i = 0 ; i < stack.length; i++) {
+          this.stack[i] = new Frame();
+        }
+
+        Frame frame;
+        frame = loadVirtualFrame(newFrame());
+        this.level++;
+        frame = loadFirstFrame(newFrame());
+        pushFrame(frame);
+
+        this.meta = null;
+        this.metaUpto = 1;
+        this.decoded = false;
+        this.pending = false;
+
+        if (startTerm == null) {
+          pending = isAccept(topFrame());
+        } else {
+          doSeekCeil(startTerm);
+          pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
+        }
+      }
+
+      @Override
+      public Comparator<BytesRef> getComparator() {
+        return BytesRef.getUTF8SortedAsUnicodeComparator();
+      }
+
+      @Override
+      void decodeMetaData()  {
+        Debug.Assert(term != null);
+        if (!decoded) {
+          if (meta.bytes != null) {
+            bytesReader.reset(meta.bytes, 0, meta.bytes.length);
+          }
+          postingsReader.decodeTerm(meta.longs, bytesReader, fieldInfo, state, true);
+          decoded = true;
+        }
+      }
+
+      /** Lazily accumulate metadata when we reach an accepted term */
+      void loadMetaData()  {
+        FST.Arc<FSTTermOutputs.TermData> last, next;
+        last = stack[metaUpto].fstArc;
+        while (metaUpto != level) {
+          metaUpto++;
+          next = stack[metaUpto].fstArc;
+          next.output = fstOutputs.add(next.output, last.output);
+          last = next;
+        }
+        if (last.isFinal()) {
+          meta = fstOutputs.add(last.output, last.nextFinalOutput);
+        } else {
+          meta = last.output;
+        }
+        state.docFreq = meta.docFreq;
+        state.totalTermFreq = meta.totalTermFreq;
+      }
+
+      @Override
+      public SeekStatus seekCeil(BytesRef target)  {
+        decoded = false;
+        term = doSeekCeil(target);
+        loadMetaData();
+        if (term == null) {
+          return SeekStatus.END;
+        } else {
+          return term.equals(target) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
+        }
+      }
+
+      @Override
+      public BytesRef next()  {
+        //if (TEST) System.out.println("Enum next()");
+        if (pending) {
+          pending = false;
+          loadMetaData();
+          return term;
+        }
+        decoded = false;
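+        // Depth-first walk of the FST constrained by the automaton: descend to the
+        // first child arc the automaton accepts; when a frame cannot be expanded,
+        // backtrack and advance to the next accepted sibling, stopping at the next
+        // term that both the FST and the automaton accept.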
+      DFS:
+        while (level > 0) {
+          Frame frame = newFrame();
+          if (loadExpandFrame(topFrame(), frame) != null) {  // has valid target
+            pushFrame(frame);
+            if (isAccept(frame)) {  // gotcha
+              break;
+            }
+            continue;  // check next target
+          } 
+          frame = popFrame();
+          while(level > 0) {
+            if (loadNextFrame(topFrame(), frame) != null) {  // has valid sibling 
+              pushFrame(frame);
+              if (isAccept(frame)) {  // gotcha
+                break DFS;
+              }
+              continue DFS;   // check next target 
+            }
+            frame = popFrame();
+          }
+          return null;
+        }
+        loadMetaData();
+        return term;
+      }
+
+      private BytesRef doSeekCeil(BytesRef target)  {
+        //if (TEST) System.out.println("Enum doSeekCeil()");
+        Frame frame = null;
+        int label, upto = 0, limit = target.length;
+        while (upto < limit) {  // to target prefix, or ceil label (rewind prefix)
+          frame = newFrame();
+          label = target.bytes[upto] & 0xff;
+          frame = loadCeilFrame(label, topFrame(), frame);
+          if (frame == null || frame.fstArc.label != label) {
+            break;
+          }
+          Debug.Assert(isValid(frame));  // target must be fetched from automaton
+          pushFrame(frame);
+          upto++;
+        }
+        if (upto == limit) {  // got target
+          return term;
+        }
+        if (frame != null) {  // got larger term('s prefix)
+          pushFrame(frame);
+          return isAccept(frame) ? term : next();
+        }
+        while (level > 0) {  // got target's prefix, advance to larger term
+          frame = popFrame();
+          while (level > 0 && !canRewind(frame)) {
+            frame = popFrame();
+          }
+          if (loadNextFrame(topFrame(), frame) != null) {
+            pushFrame(frame);
+            return isAccept(frame) ? term : next();
+          }
+        }
+        return null;
+      }
+
+      /** Virtual frame, never popped */
+      Frame loadVirtualFrame(Frame frame)  {
+        frame.fstArc.output = fstOutputs.getNoOutput();
+        frame.fstArc.nextFinalOutput = fstOutputs.getNoOutput();
+        frame.fsaState = -1;
+        return frame;
+      }
+
+      /** Load frame for start arc(node) on fst */
+      Frame loadFirstFrame(Frame frame)  {
+        frame.fstArc = fst.getFirstArc(frame.fstArc);
+        frame.fsaState = fsa.getInitialState();
+        return frame;
+      }
+
+      /** Load frame for target arc(node) on fst */
+      Frame loadExpandFrame(Frame top, Frame frame)  {
+        if (!canGrow(top)) {
+          return null;
+        }
+        frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
+        frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
+        //if (TEST) System.out.println(" loadExpand frame="+frame);
+        if (frame.fsaState == -1) {
+          return loadNextFrame(top, frame);
+        }
+        return frame;
+      }
+
+      /** Load frame for sibling arc(node) on fst */
+      Frame loadNextFrame(Frame top, Frame frame)  {
+        if (!canRewind(frame)) {
+          return null;
+        }
+        while (!frame.fstArc.isLast()) {
+          frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader);
+          frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
+          if (frame.fsaState != -1) {
+            break;
+          }
+        }
+        //if (TEST) System.out.println(" loadNext frame="+frame);
+        if (frame.fsaState == -1) {
+          return null;
+        }
+        return frame;
+      }
+
+      /** Load frame for target arc(node) on fst, so that 
+       *  arc.label >= label and !fsa.reject(arc.label) */
+      Frame loadCeilFrame(int label, Frame top, Frame frame)  {
+        FST.Arc<FSTTermOutputs.TermData> arc = frame.fstArc;
+        arc = Util.readCeilArc(label, fst, top.fstArc, arc, fstReader);
+        if (arc == null) {
+          return null;
+        }
+        frame.fsaState = fsa.step(top.fsaState, arc.label);
+        //if (TEST) System.out.println(" loadCeil frame="+frame);
+        if (frame.fsaState == -1) {
+          return loadNextFrame(top, frame);
+        }
+        return frame;
+      }
+
+      bool isAccept(Frame frame) {  // reach a term both fst&fsa accepts
+        return fsa.isAccept(frame.fsaState) && frame.fstArc.isFinal();
+      }
+      bool isValid(Frame frame) {   // reach a prefix both fst&fsa won't reject
+        return /*frame != null &&*/ frame.fsaState != -1;
+      }
+      bool canGrow(Frame frame) {   // can walk forward on both fst&fsa
+        return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc);
+      }
+      bool canRewind(Frame frame) { // can jump to sibling
+        return !frame.fstArc.isLast();
+      }
+
+      void pushFrame(Frame frame) {
+        term = grow(frame.fstArc.label);
+        level++;
+        //if (TEST) System.out.println("  term=" + term + " level=" + level);
+      }
+
+      Frame popFrame() {
+        term = shrink();
+        level--;
+        metaUpto = metaUpto > level ? level : metaUpto;
+        //if (TEST) System.out.println("  term=" + term + " level=" + level);
+        return stack[level+1];
+      }
+
+      Frame newFrame() {
+        if (level+1 == stack.length) {
+          final Frame[] temp = new Frame[ArrayUtil.oversize(level+2, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
+          System.arraycopy(stack, 0, temp, 0, stack.length);
+          for (int i = stack.length; i < temp.length; i++) {
+            temp[i] = new Frame();
+          }
+          stack = temp;
+        }
+        return stack[level+1];
+      }
+
+      Frame topFrame() {
+        return stack[level];
+      }
+
+      BytesRef grow(int label) {
+        if (term == null) {
+          term = new BytesRef(new byte[16], 0, 0);
+        } else {
+          if (term.length == term.bytes.length) {
+            term.grow(term.length+1);
+          }
+          term.bytes[term.length++] = (byte)label;
+        }
+        return term;
+      }
+
+      BytesRef shrink() {
+        if (term.length == 0) {
+          term = null;
+        } else {
+          term.length--;
+        }
+        return term;
+      }
+    }
+  }
+
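+  // Debugging helper: breadth-first traversal over every arc of the FST;
+  // not referenced elsewhere in this file.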
+  static<T> void walk(FST<T> fst)  {
+    final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
+    final BitSet seen = new BitSet();
+    final FST.BytesReader reader = fst.getBytesReader();
+    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
+    queue.add(startArc);
+    while (!queue.isEmpty()) {
+      final FST.Arc<T> arc = queue.remove(0);
+      final long node = arc.target;
+      //System.out.println(arc);
+      if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
+        seen.set((int) node);
+        fst.readFirstRealTargetArc(node, arc, reader);
+        while (true) {
+          queue.add(new FST.Arc<T>().copyFrom(arc));
+          if (arc.isLast()) {
+            break;
+          } else {
+            fst.readNextRealArc(arc, reader);
+          }
+        }
+      }
+    }
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    long ramBytesUsed = 0;
+    for (TermsReader r : fields.values()) {
+      ramBytesUsed += r.dict == null ? 0 : r.dict.sizeInBytes();
+    }
+    return ramBytesUsed;
+  }
+  
+  @Override
+  public void checkIntegrity()  {
+    postingsReader.checkIntegrity();
+  }
+}
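
Seen from the consumer side, FSTTermsReader behaves like any other FieldsProducer:
field names are iterated, each field yields a Terms instance, and a TermsEnum walks the
terms of that field. The following hedged sketch shows that usage against the Lucene 4.x
Java API referenced in this patch; the TermDumper class is illustrative and obtaining
the producer instance is left out.

import java.io.IOException;

import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Illustrative sketch (not part of the patch): walking every term exposed by a
// FieldsProducer such as FSTTermsReader.
public class TermDumper {
  public static void dumpAllTerms(FieldsProducer producer) throws IOException {
    for (String field : producer) {                // Fields iterates over field names
      Terms terms = producer.terms(field);
      if (terms == null) {
        continue;                                  // field has no inverted data
      }
      TermsEnum termsEnum = terms.iterator(null);  // no previous enum to reuse
      BytesRef term;
      while ((term = termsEnum.next()) != null) {  // null signals the end of the field
        System.out.println(field + " " + term.utf8ToString()
            + " docFreq=" + termsEnum.docFreq()
            + " totalTermFreq=" + termsEnum.totalTermFreq());
      }
    }
  }
}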

