lucenenet-commits mailing list archives

From synhers...@apache.org
Subject [08/51] [abbrv] [partial] Cleaning up and getting ready to development towards v4.8
Date Sat, 06 Sep 2014 19:36:19 GMT
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs
new file mode 100644
index 0000000..1a8f9d4
--- /dev/null
+++ b/src/Lucene.Net.Codecs/BlockTerms/VariableGapTermsIndexWriter.cs
@@ -0,0 +1,366 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using Lucene.Net.Codecs;
+using Lucene.Net.Codecs.BlockTerms;
+using Lucene.Net.Index;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Codecs.BlockTerms
+{
+
+/// <summary>
+/// Selects index terms according to the provided pluggable
+/// {@link IndexTermSelector}, and stores them in a prefix trie that is
+/// loaded entirely into RAM, stored as an FST.  This terms
+/// index only supports unsigned byte term sort order
+/// (unicode codepoint order when the bytes are UTF8).
+///
+/// @lucene.experimental
+/// </summary>
+public class VariableGapTermsIndexWriter : TermsIndexWriterBase
+{
+    protected IndexOutput output;
+
+    /// <summary>Extension of terms index file</summary>
+    public const String TERMS_INDEX_EXTENSION = "tiv";
+
+    public const String CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
+    public const int VERSION_START = 0;
+    public const int VERSION_APPEND_ONLY = 1;
+    public const int VERSION_CHECKSUM = 2;
+    public const int VERSION_CURRENT = VERSION_CHECKSUM;
+
+    private readonly List<FSTFieldWriter> fields = new List<FSTFieldWriter>();
+
+    private readonly FieldInfos fieldInfos; // unread
+    private readonly IndexTermSelector policy;
+
+    /// <summary>
+    /// Hook for selecting which terms should be placed in the terms index.
+    /// <p>
+    /// {@link #newField} is called at the start of each new field, and
+    /// {@link #isIndexTerm} for each term in that field.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public abstract class IndexTermSelector
+    {
+        /// <summary>
+        /// Called sequentially on every term being written
+        /// returning true if this term should be indexed
+        /// </summary>
+        public abstract bool IsIndexTerm(BytesRef term, TermStats stats);
+        
+        /// <summary>Called when a new field is started</summary>
+        public abstract void NewField(FieldInfo fieldInfo);
+    }
+
+    /// <remarks>
+    /// Same policy as {@link FixedGapTermsIndexWriter}
+    /// </remarks>
+    public sealed class EveryNTermSelector : IndexTermSelector
+    {
+        private int count;
+        private readonly int interval;
+
+        public EveryNTermSelector(int interval)
+        {
+            this.interval = interval;
+            // First term is first indexed term:
+            count = interval;
+        }
+
+        public override bool IsIndexTerm(BytesRef term, TermStats stats)
+        {
+            if (count >= interval)
+            {
+                count = 1;
+                return true;
+            }
+            else
+            {
+                count++;
+                return false;
+            }
+        }
+
+        public override void NewField(FieldInfo fieldInfo)
+        {
+            count = interval;
+        }
+    }
+
+    /// <summary>
+    /// Sets an index term when docFreq >= docFreqThresh, or
+    /// every interval terms.  This should reduce seek time
+    /// to high docFreq terms. 
+    /// </summary>
+    public class EveryNOrDocFreqTermSelector : IndexTermSelector
+    {
+        private int count;
+        private readonly int docFreqThresh;
+        private readonly int interval;
+
+        public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval)
+        {
+            this.interval = interval;
+            this.docFreqThresh = docFreqThresh;
+
+            // First term is first indexed term:
+            count = interval;
+        }
+
+        public override bool IsIndexTerm(BytesRef term, TermStats stats)
+        {
+            if (stats.DocFreq >= docFreqThresh || count >= interval)
+            {
+                count = 1;
+                return true;
+            }
+            else
+            {
+                count++;
+                return false;
+            }
+        }
+
+        public override void NewField(FieldInfo fieldInfo)
+        {
+            count = interval;
+        }
+    }
+
+    // TODO: it'd be nice to let the FST builder prune based
+    // on term count of each node (the prune1/prune2 that it
+    // accepts), and build the index based on that.  This
+    // should result in a more compact terms index, more like
+    // a prefix trie than the other selectors, because it
+    // only stores enough leading bytes to get down to N
+    // terms that may complete that prefix.  It becomes
+    // "deeper" when terms are dense, and "shallow" when they
+    // are less dense.
+    //
+    // However, it's not easy to make that work with this
+    // API, because that pruning doesn't immediately know on
+    // seeing each term whether that term will be a seek point
+    // or not.  It requires some non-causality in the API, ie
+    // only on seeing some number of future terms will the
+    // builder decide which past terms are seek points.
+    // Somehow the API'd need to be able to return an "I don't
+    // know" value, eg like a Future, which only later on is
+    // flipped (frozen) to true or false.
+    //
+    // We could solve this with a 2-pass approach, where the
+    // first pass would build an FSA (no outputs) solely to
+    // determine which prefixes are the 'leaves' in the
+    // pruning.  The 2nd pass would then look at this prefix
+    // trie to mark the seek points and build the FST mapping
+    // to the true output.
+    //
+    // But, one downside to this approach is that it'd result
+    // in uneven index term selection.  EG with prune1=10, the
+    // resulting index terms could be as frequent as every 10
+    // terms or as rare as every <maxArcCount> * 10 (eg 2560),
+    // in the extremes.
+
+    public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
+    {
+        string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix,
+            TERMS_INDEX_EXTENSION);
+        output = state.Directory.CreateOutput(indexFileName, state.Context);
+        bool success = false;
+        try
+        {
+            fieldInfos = state.FieldInfos;
+            this.policy = policy;
+            WriteHeader(output);
+            success = true;
+        }
+        finally
+        {
+            if (!success)
+            {
+                IOUtils.CloseWhileHandlingException(output);
+            }
+        }
+    }
+
+    private void WriteHeader(IndexOutput output)
+    {
+        CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
+    }
+
+    public override FieldWriter AddField(FieldInfo field, long termsFilePointer)
+    {
+        //Console.WriteLine("VGW: field=" + field.Name);
+        policy.NewField(field);
+        FSTFieldWriter writer = new FSTFieldWriter(this, field, termsFilePointer);
+        fields.Add(writer);
+        return writer;
+    }
+
+    /// <summary>
+    /// NOTE: if your codec does not sort in unicode code
+    /// point order, you must override this method to simply
+    /// return indexedTerm.Length.
+    /// </summary>
+    protected virtual int IndexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm)
+    {
+        // As long as codec sorts terms in unicode codepoint
+        // order, we can safely strip off the non-distinguishing
+        // suffix to save RAM in the loaded terms index.
+        int idxTermOffset = indexedTerm.Offset;
+        int priorTermOffset = priorTerm.Offset;
+        int limit = Math.Min(priorTerm.Length, indexedTerm.Length);
+        for (int byteIdx = 0; byteIdx < limit; byteIdx++)
+        {
+            if (priorTerm.Bytes[priorTermOffset + byteIdx] != indexedTerm.Bytes[idxTermOffset + byteIdx])
+            {
+                return byteIdx + 1;
+            }
+        }
+
+        return Math.Min(1 + priorTerm.Length, indexedTerm.Length);
+    }
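+    // Worked example: if the prior indexed term is "app" and the new indexed
+    // term is "apple", all 3 compared bytes agree, so this method returns
+    // Min(1 + 3, 5) = 4 and only the prefix "appl" enters the FST - just
+    // enough bytes to distinguish it from "app", saving RAM on the rest.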
+
+    private class FSTFieldWriter : FieldWriter
+    {
+        private readonly VariableGapTermsIndexWriter outerInstance;
+
+        private readonly Builder<long> fstBuilder;
+        private readonly PositiveIntOutputs fstOutputs;
+        private readonly long startTermsFilePointer;
+
+        public FieldInfo fieldInfo;
+        private FST<long> fst;
+        private long indexStart;
+
+        private readonly BytesRef lastTerm = new BytesRef();
+        private bool first = true;
+
+        public FSTFieldWriter(VariableGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer)
+        {
+            this.outerInstance = outerInstance;
+            this.fieldInfo = fieldInfo;
+            fstOutputs = PositiveIntOutputs.Singleton;
+            fstBuilder = new Builder<long>(FST.INPUT_TYPE.BYTE1, fstOutputs);
+            indexStart = outerInstance.output.FilePointer;
+            //Console.WriteLine("VGW: field=" + fieldInfo.Name);
+
+            // Always put empty string in
+            fstBuilder.Add(new IntsRef(), termsFilePointer);
+            startTermsFilePointer = termsFilePointer;
+        }
+
+        public override bool CheckIndexTerm(BytesRef text, TermStats stats)
+        {
+            //Console.WriteLine("VGW: index term=" + text.Utf8ToString());
+            // NOTE: we must force the first term per field to be
+            // indexed, in case policy doesn't:
+            if (outerInstance.policy.IsIndexTerm(text, stats) || first)
+            {
+                first = false;
+                //Console.WriteLine("  YES");
+                return true;
+            }
+            else
+            {
+                lastTerm.CopyBytes(text);
+                return false;
+            }
+        }
+
+        private readonly IntsRef scratchIntsRef = new IntsRef();
+
+        public override void Add(BytesRef text, TermStats stats, long termsFilePointer)
+        {
+            if (text.Length == 0)
+            {
+                // We already added empty string in ctor
+                Debug.Assert(termsFilePointer == startTermsFilePointer);
+                return;
+            }
+            int lengthSave = text.Length;
+            text.Length = outerInstance.IndexedTermPrefixLength(lastTerm, text);
+            try
+            {
+                fstBuilder.Add(Util.ToIntsRef(text, scratchIntsRef), termsFilePointer);
+            }
+            finally
+            {
+                text.Length = lengthSave;
+            }
+            lastTerm.CopyBytes(text);
+        }
+
+        public override void Finish(long termsFilePointer)
+        {
+            fst = fstBuilder.Finish();
+            if (fst != null)
+            {
+                fst.Save(outerInstance.output);
+            }
+        }
+    }
+
+    public void Dispose()
+    {
+        if (output != null)
+        {
+            try
+            {
+                long dirStart = output.FilePointer;
+                int fieldCount = fields.Count;
+
+                int nonNullFieldCount = 0;
+                for (int i = 0; i < fieldCount; i++)
+                {
+                    FSTFieldWriter field = fields[i];
+                    if (field.fst != null)
+                    {
+                        nonNullFieldCount++;
+                    }
+                }
+
+                output.WriteVInt(nonNullFieldCount);
+                for (int i = 0; i < fieldCount; i++)
+                {
+                    FSTFieldWriter field = fields[i];
+                    if (field.fst != null)
+                    {
+                        output.WriteVInt(field.fieldInfo.Number);
+                        output.WriteVLong(field.indexStart);
+                    }
+                }
+                WriteTrailer(dirStart);
+                CodecUtil.WriteFooter(output);
+            }
+            finally
+            {
+                output.Dispose();
+                output = null;
+            }
+        }
+    }
+
+    private void WriteTrailer(long dirStart)
+    {
+        output.WriteLong(dirStart);
+    }
+}
+}
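
For readers following the port: the selectors above are handed to the writer by a
postings format. A minimal sketch (hypothetical variable names; assumes a
SegmentWriteState named "state" is in scope):

    // Index one in every 32 terms, but always index terms with
    // docFreq >= 100 so high-docFreq terms can be seeked quickly.
    var selector = new VariableGapTermsIndexWriter.EveryNOrDocFreqTermSelector(100, 32);
    TermsIndexWriterBase indexWriter = new VariableGapTermsIndexWriter(state, selector);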

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Bloom/BloomFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Bloom/BloomFilterFactory.cs b/src/Lucene.Net.Codecs/Bloom/BloomFilterFactory.cs
new file mode 100644
index 0000000..6bac454
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Bloom/BloomFilterFactory.cs
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Bloom
+{
+
+    using Lucene.Net.Index;
+
+    /// <summary>
+    /// Class used to create index-time {@link FuzzySet} appropriately configured for
+    /// each field. Also called to right-size bitsets for serialization.
+    ///
+    ///  @lucene.experimental
+    /// </summary>
+    public abstract class BloomFilterFactory
+    {
+
+        /// <summary>
+        /// Creates an appropriately-sized set for the given field at index time.
+        /// </summary>
+        /// <param name="state">The content to be indexed</param>
+        /// <param name="info">The field requiring a BloomFilter</param>
+        /// <returns>An appropriately sized set or null if no BloomFiltering required</returns>
+        public abstract FuzzySet GetSetForField(SegmentWriteState state, FieldInfo info);
+
+        /// <summary>
+        /// Called when downsizing bitsets for serialization
+        /// </summary>
+        /// <param name="fieldInfo">The field with sparse set bits</param>
+        /// <param name="initialSet">The bits accumulated</param>
+        /// <returns> null or a hopefully more densely packed, smaller bitset</returns>
+        public FuzzySet Downsize(FieldInfo fieldInfo, FuzzySet initialSet)
+        {
+            // Aim for a bitset size that would have 10% of bits set (so 90% of searches
+            // would fail-fast)
+            const float targetMaxSaturation = 0.1f;
+            return initialSet.Downsize(targetMaxSaturation);
+        }
+
+        /// <summary>
+        /// Used to determine if the given filter has reached saturation and should be retired, i.e. not saved any more
+        /// </summary>
+        /// <param name="bloomFilter">The bloomFilter being tested</param>
+        /// <param name="fieldInfo">The field with which this filter is associated</param>
+        /// <returns>true if the set has reached saturation and should be retired</returns>
+        public abstract bool IsSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo);
+
+    }
+}
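
A custom factory can opt individual fields in or out of bloom filtering. A hedged
sketch (class and field names are illustrative, not part of this commit):

    // Only bloom-filter the "id" field; return null to skip all others.
    public class IdOnlyBloomFilterFactory : BloomFilterFactory
    {
        public override FuzzySet GetSetForField(SegmentWriteState state, FieldInfo info)
        {
            if (!info.Name.Equals("id")) return null; // null => no bloom filter
            return FuzzySet.CreateSetBasedOnQuality(state.SegmentInfo.DocCount, 0.10f);
        }

        public override bool IsSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo)
        {
            return bloomFilter.GetSaturation() > 0.9f; // retire near-full filters
        }
    }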

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs b/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs
new file mode 100644
index 0000000..eb710b8
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Bloom/BloomFilteringPostingsFormat.cs
@@ -0,0 +1,547 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Bloom
+{
+
+    using System;
+    using System.Collections.Generic;
+    using System.Diagnostics;
+    using Lucene.Net.Index;
+    using Lucene.Net.Search;
+    using Lucene.Net.Store;
+    using Lucene.Net.Support;
+    using Lucene.Net.Util;
+    using Lucene.Net.Util.Automaton;
+
+    /// <summary>
+    /// 
+    /// A {@link PostingsFormat} useful for low doc-frequency fields such as primary
+    /// keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
+    /// for reads in segments known to have no record of the key. A choice of
+    /// delegate PostingsFormat is used to record all other Postings data.
+    /// 
+    /// A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter
+    /// settings on a per-field basis. The default configuration is
+    /// {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes
+    /// values using {@link MurmurHash2}. This should be suitable for most purposes.
+    ///
+    /// The format of the blm file is as follows:
+    ///
+    /// <ul>
+    /// <li>BloomFilter (.blm) --&gt; Header, DelegatePostingsFormatName,
+    /// NumFilteredFields, Filter<sup>NumFilteredFields</sup>, Footer</li>
+    /// <li>Filter --&gt; FieldNumber, FuzzySet</li>
+    /// <li>FuzzySet --&gt; See {@link FuzzySet#serialize(DataOutput)}</li>
+    /// <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+    /// <li>DelegatePostingsFormatName --&gt; {@link DataOutput#writeString(String)
+    /// String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
+    /// <li>NumFilteredFields --&gt; {@link DataOutput#writeInt Uint32}</li>
+    /// <li>FieldNumber --&gt; {@link DataOutput#writeInt Uint32} The number of the
+    /// field in this segment</li>
+    /// <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+    /// </ul>
+    ///
+    ///  @lucene.experimental
+    /// </summary>
+    public sealed class BloomFilteringPostingsFormat : PostingsFormat
+    {
+        public static readonly String BLOOM_CODEC_NAME = "BloomFilter";
+        public static readonly int VERSION_START = 1;
+        public static readonly int VERSION_CHECKSUM = 2;
+        public static readonly int VERSION_CURRENT = VERSION_CHECKSUM;
+
+        /** Extension of Bloom Filters file */
+        private static readonly String BLOOM_EXTENSION = "blm";
+
+        private BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
+        private PostingsFormat delegatePostingsFormat;
+        
+        /// <summary>
+        ///  Creates Bloom filters for a selection of fields created in the index. This
+        /// is recorded as a set of Bitsets held as a segment summary in an additional
+        /// "blm" file. This PostingsFormat delegates to a choice of delegate
+        /// PostingsFormat for encoding all other postings data.
+        /// </summary>
+        /// <param name="delegatePostingsFormat">The PostingsFormat that records all the non-bloom filter data i.e. postings info.</param>
+        /// <param name="bloomFilterFactory">The {@link BloomFilterFactory} responsible for sizing BloomFilters appropriately</param>
+        public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat,
+            BloomFilterFactory bloomFilterFactory) : base(BLOOM_CODEC_NAME)
+        {
+            this.delegatePostingsFormat = delegatePostingsFormat;
+            this.bloomFilterFactory = bloomFilterFactory;
+        }
+
+        /// <summary>
+        /// Creates Bloom filters for a selection of fields created in the index. This
+        /// is recorded as a set of Bitsets held as a segment summary in an additional
+        /// "blm" file. This PostingsFormat delegates to a choice of delegate
+        /// PostingsFormat for encoding all other postings data. This choice of
+        /// constructor defaults to the {@link DefaultBloomFilterFactory} for
+        /// configuring per-field BloomFilters.
+        /// </summary>
+        /// <param name="delegatePostingsFormat">The PostingsFormat that records all the non-bloom filter data i.e. postings info.</param>
+        public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat)
+            : this(delegatePostingsFormat, new DefaultBloomFilterFactory())
+        {
+        }
+
+        /// <summary>
+        /// Used only by core Lucene at read-time via Service Provider instantiation -
+        /// do not use at Write-time in application code.
+        /// </summary>
+        public BloomFilteringPostingsFormat() : base(BLOOM_CODEC_NAME)
+        {
+        }
+
+        public override FieldsConsumer FieldsConsumer(SegmentWriteState state)
+        {
+            if (delegatePostingsFormat == null)
+            {
+                throw new InvalidOperationException("Error - constructed without a choice of PostingsFormat");
+            }
+            return new BloomFilteredFieldsConsumer(this,
+                delegatePostingsFormat.FieldsConsumer(state), state);
+        }
+
+        public override FieldsProducer FieldsProducer(SegmentReadState state)
+        {
+            return new BloomFilteredFieldsProducer(state);
+        }
+
+        internal class BloomFilteredFieldsProducer : FieldsProducer
+        {
+            private FieldsProducer delegateFieldsProducer;
+            private HashMap<String, FuzzySet> bloomsByFieldName = new HashMap<String, FuzzySet>();
+
+            public BloomFilteredFieldsProducer(SegmentReadState state)
+            {
+
+                String bloomFileName = IndexFileNames.SegmentFileName(
+                    state.SegmentInfo.Name, state.SegmentSuffix, BLOOM_EXTENSION);
+                ChecksumIndexInput bloomIn = null;
+                bool success = false;
+                try
+                {
+                    bloomIn = state.Directory.OpenChecksumInput(bloomFileName, state.Context);
+                    int version = CodecUtil.CheckHeader(bloomIn, BLOOM_CODEC_NAME, VERSION_START, VERSION_CURRENT);
+                    // // Load the hash function used in the BloomFilter
+                    // hashFunction = HashFunction.forName(bloomIn.readString());
+                    // Load the delegate postings format
+                    PostingsFormat delegatePostingsFormat = PostingsFormat.ForName(bloomIn
+                        .ReadString());
+
+                    this.delegateFieldsProducer = delegatePostingsFormat
+                        .FieldsProducer(state);
+                    int numBlooms = bloomIn.ReadInt();
+                    for (int i = 0; i < numBlooms; i++)
+                    {
+                        int fieldNum = bloomIn.ReadInt();
+                        FuzzySet bloom = FuzzySet.Deserialize(bloomIn);
+                        FieldInfo fieldInfo = state.FieldInfos.FieldInfo(fieldNum);
+                        bloomsByFieldName.Add(fieldInfo.Name, bloom);
+                    }
+                    if (version >= VERSION_CHECKSUM)
+                    {
+                        CodecUtil.CheckFooter(bloomIn);
+                    }
+                    else
+                    {
+                        CodecUtil.CheckEOF(bloomIn);
+                    }
+                    IOUtils.Close(bloomIn);
+                    success = true;
+                }
+                finally
+                {
+                    if (!success)
+                    {
+                        IOUtils.CloseWhileHandlingException(bloomIn, delegateFieldsProducer);
+                    }
+                }
+            }
+
+            public override IEnumerator<string> GetEnumerator()
+            {
+                return delegateFieldsProducer.GetEnumerator();
+            }
+
+            public override Terms Terms(String field)
+            {
+                FuzzySet filter = bloomsByFieldName[field];
+                if (filter == null)
+                {
+                    return delegateFieldsProducer.Terms(field);
+                }
+                else
+                {
+                    Terms result = delegateFieldsProducer.Terms(field);
+                    if (result == null)
+                    {
+                        return null;
+                    }
+                    return new BloomFilteredTerms(result, filter);
+                }
+            }
+
+            public override int Size()
+            {
+                return delegateFieldsProducer.Size();
+            }
+
+            public override long UniqueTermCount
+            {
+                get { return delegateFieldsProducer.UniqueTermCount; }
+            }
+
+            public override void Dispose()
+            {
+                delegateFieldsProducer.Dispose();
+            }
+
+            public override long RamBytesUsed()
+            {
+                long sizeInBytes = ((delegateFieldsProducer != null) ? delegateFieldsProducer.RamBytesUsed() : 0);
+                foreach (var entry in bloomsByFieldName)
+                {
+                    sizeInBytes += entry.Key.Length*RamUsageEstimator.NUM_BYTES_CHAR;
+                    sizeInBytes += entry.Value.RamBytesUsed();
+                }
+                return sizeInBytes;
+            }
+
+            public override void CheckIntegrity()
+            {
+                delegateFieldsProducer.CheckIntegrity();
+            }
+
+            internal class BloomFilteredTerms : Terms
+            {
+                private Terms delegateTerms;
+                private FuzzySet filter;
+
+                public BloomFilteredTerms(Terms terms, FuzzySet filter)
+                {
+                    this.delegateTerms = terms;
+                    this.filter = filter;
+                }
+
+                public override TermsEnum Intersect(CompiledAutomaton compiled,
+                    BytesRef startTerm)
+                {
+                    return delegateTerms.Intersect(compiled, startTerm);
+                }
+
+                public override TermsEnum Iterator(TermsEnum reuse)
+                {
+                    if ((reuse != null) && (reuse is BloomFilteredTermsEnum))
+                    {
+                        // recycle the existing BloomFilteredTermsEnum by asking the delegate
+                        // to recycle its contained TermsEnum
+                        BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
+                        if (bfte.filter == filter)
+                        {
+                            bfte.Reset(delegateTerms, bfte.delegateTermsEnum);
+                            return bfte;
+                        }
+                    }
+                    // We have been handed something we cannot reuse (either null, wrong
+                    // class or wrong filter) so allocate a new object
+                    return new BloomFilteredTermsEnum(delegateTerms, reuse, filter);
+                }
+
+                public override IComparer<BytesRef> Comparator
+                {
+                    get { return delegateTerms.Comparator; }
+                }
+
+                public override long Size()
+                {
+                    return delegateTerms.Size();
+                }
+
+                public override long SumTotalTermFreq
+                {
+                    get { return delegateTerms.SumTotalTermFreq; }
+                }
+
+                public override long SumDocFreq
+                {
+                    get { return delegateTerms.SumDocFreq; }
+                }
+
+                public override int DocCount
+                {
+                    get { return delegateTerms.DocCount; }
+                }
+
+                public override bool HasFreqs()
+                {
+                    return delegateTerms.HasFreqs();
+                }
+
+                public override bool HasOffsets()
+                {
+                    return delegateTerms.HasOffsets();
+                }
+
+                public override bool HasPositions()
+                {
+                    return delegateTerms.HasPositions();
+                }
+
+                public override bool HasPayloads()
+                {
+                    return delegateTerms.HasPayloads();
+                }
+            }
+
+            internal sealed class BloomFilteredTermsEnum : TermsEnum
+            {
+                private Terms delegateTerms;
+                internal TermsEnum delegateTermsEnum;
+                private TermsEnum reuseDelegate;
+                internal readonly FuzzySet filter;
+
+                public BloomFilteredTermsEnum(Terms delegateTerms, TermsEnum reuseDelegate, FuzzySet filter)
+                {
+                    this.delegateTerms = delegateTerms;
+                    this.reuseDelegate = reuseDelegate;
+                    this.filter = filter;
+                }
+
+                internal void Reset(Terms delegateTerms, TermsEnum reuseDelegate)
+                {
+                    this.delegateTerms = delegateTerms;
+                    this.reuseDelegate = reuseDelegate;
+                    this.delegateTermsEnum = null;
+                }
+
+                private TermsEnum Delegate()
+                {
+                    if (delegateTermsEnum == null)
+                    {
+                        /* Pull the iterator only if we really need it -
+                         * this can be a relatively heavy operation depending on the
+                         * delegate postings format and the underlying directory
+                         * (clone IndexInput) */
+                        delegateTermsEnum = delegateTerms.Iterator(reuseDelegate);
+                    }
+
+                    return delegateTermsEnum;
+                }
+
+                public override BytesRef Next()
+                {
+                    return Delegate().Next();
+                }
+
+                public override IComparer<BytesRef> Comparator
+                {
+                    get { return delegateTerms.Comparator; }
+                }
+
+                public override bool SeekExact(BytesRef text)
+                {
+                    // The magical fail-fast speed up that is the entire point of all of
+                    // this code - save a disk seek if there is a match on an in-memory
+                    // structure that may occasionally give a false positive, but is
+                    // guaranteed to give no false negatives.
+                    if (filter.Contains(text) == FuzzySet.ContainsResult.No)
+                    {
+                        return false;
+                    }
+                    return Delegate().SeekExact(text);
+                }
+
+                public override SeekStatus SeekCeil(BytesRef text)
+                {
+                    return Delegate().SeekCeil(text);
+                }
+
+                public override void SeekExact(long ord)
+                {
+                    Delegate().SeekExact(ord);
+                }
+
+                public override BytesRef Term()
+                {
+                    return Delegate().Term();
+                }
+
+                public override long Ord()
+                {
+                    return Delegate().Ord();
+                }
+
+                public override int DocFreq()
+                {
+                    return Delegate().DocFreq();
+                }
+
+                public override long TotalTermFreq()
+                {
+                    return Delegate().TotalTermFreq();
+                }
+
+                public override DocsAndPositionsEnum DocsAndPositions(Bits liveDocs,
+                    DocsAndPositionsEnum reuse, int flags)
+                {
+                    return Delegate().DocsAndPositions(liveDocs, reuse, flags);
+                }
+
+                public override DocsEnum Docs(Bits liveDocs, DocsEnum reuse, int flags)
+                {
+                    return Delegate().Docs(liveDocs, reuse, flags);
+                }
+            }
+
+        }
+
+        internal class BloomFilteredFieldsConsumer : FieldsConsumer
+        {
+            private readonly BloomFilteringPostingsFormat outerInstance;
+            private readonly FieldsConsumer delegateFieldsConsumer;
+            private readonly Dictionary<FieldInfo, FuzzySet> bloomFilters = new Dictionary<FieldInfo, FuzzySet>();
+            private readonly SegmentWriteState state;
+
+            public BloomFilteredFieldsConsumer(BloomFilteringPostingsFormat outerInstance,
+                FieldsConsumer fieldsConsumer, SegmentWriteState state)
+            {
+                this.outerInstance = outerInstance;
+                this.delegateFieldsConsumer = fieldsConsumer;
+                this.state = state;
+            }
+
+            public override TermsConsumer AddField(FieldInfo field)
+            {
+                FuzzySet bloomFilter = outerInstance.bloomFilterFactory.GetSetForField(state, field);
+                if (bloomFilter != null)
+                {
+                    Debug.Assert(!bloomFilters.ContainsKey(field));
+                    bloomFilters.Add(field, bloomFilter);
+                    return new WrappedTermsConsumer(delegateFieldsConsumer.AddField(field), bloomFilter);
+                }
+                else
+                {
+                    // No, use the unfiltered fieldsConsumer - we are not interested in
+                    // recording any term Bitsets.
+                    return delegateFieldsConsumer.AddField(field);
+                }
+            }
+
+            public override void Dispose()
+            {
+                delegateFieldsConsumer.Dispose();
+                // Now we are done accumulating values for these fields
+                var nonSaturatedBlooms = new List<KeyValuePair<FieldInfo, FuzzySet>>();
+
+                foreach (var entry in bloomFilters)
+                {
+                    FuzzySet bloomFilter = entry.Value;
+                    if (!outerInstance.bloomFilterFactory.IsSaturated(bloomFilter, entry.Key))
+                    {
+                        nonSaturatedBlooms.Add(entry);
+                    }
+                }
+
+                String bloomFileName = IndexFileNames.SegmentFileName(
+                    state.SegmentInfo.Name, state.SegmentSuffix, BLOOM_EXTENSION);
+                IndexOutput bloomOutput = null;
+
+                try
+                {
+                    bloomOutput = state.Directory.CreateOutput(bloomFileName, state.Context);
+                    CodecUtil.WriteHeader(bloomOutput, BLOOM_CODEC_NAME, VERSION_CURRENT);
+                    // remember the name of the postings format we will delegate to
+                    bloomOutput.WriteString(outerInstance.delegatePostingsFormat.Name);
+
+                    // First field in the output file is the number of fields+blooms saved
+                    bloomOutput.WriteInt(nonSaturatedBlooms.Count);
+                    foreach (var entry in nonSaturatedBlooms)
+                    {
+                        FieldInfo fieldInfo = entry.Key;
+                        FuzzySet bloomFilter = entry.Value;
+                        bloomOutput.WriteInt(fieldInfo.Number);
+                        SaveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
+                    }
+
+                    CodecUtil.WriteFooter(bloomOutput);
+                }
+                finally
+                {
+                    IOUtils.Close(bloomOutput);
+                }
+                //We are done with large bitsets so no need to keep them hanging around
+                bloomFilters.Clear();
+            }
+
+            private void SaveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
+                FuzzySet bloomFilter, FieldInfo fieldInfo)
+            {
+
+                FuzzySet rightSizedSet = outerInstance.bloomFilterFactory.Downsize(fieldInfo,
+                    bloomFilter);
+                if (rightSizedSet == null)
+                {
+                    rightSizedSet = bloomFilter;
+                }
+                rightSizedSet.Serialize(bloomOutput);
+            }
+
+        }
+
+        internal class WrappedTermsConsumer : TermsConsumer
+        {
+            private TermsConsumer delegateTermsConsumer;
+            private FuzzySet bloomFilter;
+
+            public WrappedTermsConsumer(TermsConsumer termsConsumer, FuzzySet bloomFilter)
+            {
+                this.delegateTermsConsumer = termsConsumer;
+                this.bloomFilter = bloomFilter;
+            }
+
+            public override PostingsConsumer StartTerm(BytesRef text)
+            {
+                return delegateTermsConsumer.StartTerm(text);
+            }
+
+            public override void FinishTerm(BytesRef text, TermStats stats)
+            {
+                // Record this term in our BloomFilter
+                if (stats.DocFreq > 0)
+                {
+                    bloomFilter.AddValue(text);
+                }
+                delegateTermsConsumer.FinishTerm(text, stats);
+            }
+
+            public override void Finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
+            {
+                delegateTermsConsumer.Finish(sumTotalTermFreq, sumDocFreq, docCount);
+            }
+
+            public override IComparer<BytesRef> Comparator
+            {
+                get { return delegateTermsConsumer.Comparator; }
+            }
+
+        }
+
+    }
+}
\ No newline at end of file
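
At write time the format wraps a concrete delegate. A sketch assuming
Lucene41PostingsFormat as the delegate (any registered PostingsFormat works):

    // Fail-fast primary-key lookups: bloom-filter each opted-in field's terms.
    PostingsFormat pf = new BloomFilteringPostingsFormat(
        new Lucene41PostingsFormat(),        // records the actual postings data
        new DefaultBloomFilterFactory());    // ~10% target bitset saturation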

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Bloom/DefaultBloomFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Bloom/DefaultBloomFilterFactory.cs b/src/Lucene.Net.Codecs/Bloom/DefaultBloomFilterFactory.cs
new file mode 100644
index 0000000..6d1bb54
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Bloom/DefaultBloomFilterFactory.cs
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Bloom
+{
+    using Lucene.Net.Index;
+
+    /// <summary>
+    /// Default policy is to allocate a bitset with 10% saturation given a unique term per document.
+    /// Bits are set via MurmurHash2 hashing function.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public class DefaultBloomFilterFactory : BloomFilterFactory
+    {
+
+        public override FuzzySet GetSetForField(SegmentWriteState state, FieldInfo info)
+        {
+            //Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with 10% of bits set
+            return FuzzySet.CreateSetBasedOnQuality(state.SegmentInfo.DocCount, 0.10f);
+        }
+
+        public override bool IsSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo)
+        {
+            // Don't bother saving bitsets if >90% of bits are set - we don't want to
+            // throw any more memory at this problem.
+            return bloomFilter.GetSaturation() > 0.9f;
+        }
+
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Bloom/FuzzySet.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Bloom/FuzzySet.cs b/src/Lucene.Net.Codecs/Bloom/FuzzySet.cs
new file mode 100644
index 0000000..5a97564
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Bloom/FuzzySet.cs
@@ -0,0 +1,347 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Bloom
+{
+
+    using System;
+    using System.Diagnostics;
+    using Lucene.Net.Store;
+    using Lucene.Net.Util;
+
+    /// <summary>
+    /// A class used to represent a set of many, potentially large, values (e.g. many
+    /// long strings such as URLs), using a significantly smaller amount of memory.
+    ///
+    /// The set is "lossy" in that it cannot definitively state that it does contain
+    /// a value, but it <em>can</em> definitively say if a value is <em>not</em> in
+    /// the set. It can therefore be used as a Bloom Filter.
+    /// 
+    /// Another application of the set is that it can be used to perform fuzzy counting because
+    /// it can estimate reasonably accurately how many unique values are contained in the set. 
+    ///
+    /// This class is NOT threadsafe.
+    ///
+    /// Internally a Bitset is used to record values and once a client has finished recording
+    /// a stream of values the {@link #downsize(float)} method can be used to create a suitably smaller set that
+    /// is sized appropriately for the number of values recorded and desired saturation levels. 
+    /// 
+    /// @lucene.experimental
+    /// </summary>
+    public class FuzzySet
+    {
+
+        public static readonly int VERSION_SPI = 1; // HashFunction used to be loaded through a SPI
+        public static readonly int VERSION_START = VERSION_SPI;
+        public static readonly int VERSION_CURRENT = 2;
+
+        public static HashFunction HashFunctionForVersion(int version)
+        {
+            if (version < VERSION_START)
+            {
+                throw new ArgumentException("Version " + version + " is too old, expected at least " +
+                                                   VERSION_START);
+            }
+            else if (version > VERSION_CURRENT)
+            {
+                throw new ArgumentException("Version " + version + " is too new, expected at most " +
+                                                   VERSION_CURRENT);
+            }
+            return MurmurHash2.INSTANCE;
+        }
+
+        /// <remarks>
+        /// Result from {@link FuzzySet#contains(BytesRef)}:
+        /// can never return definitively YES (always MAYBE), 
+        /// but can sometimes definitely return NO.
+        /// </remarks>
+        public enum ContainsResult
+        {
+            Maybe,
+            No
+        };
+
+        private readonly HashFunction hashFunction;
+        private readonly FixedBitSet filter;
+        private readonly int bloomSize;
+
+        //The sizes of BitSet used are all numbers that, when expressed in binary form,
+        //are all ones. This is to enable fast downsizing from one bitset to another
+        //by simply ANDing each set index in one bitset with the size of the target bitset
+        // - this provides a fast modulo of the number. Values previously accumulated in
+        // a large bitset and then mapped to a smaller set can be looked up using a single
+        // AND operation of the query term's hash rather than needing to perform a 2-step
+        // translation of the query term that mirrors the stored content's reprojections.
+        private static readonly int[] usableBitSetSizes;
+
+        static FuzzySet()
+        {
+            usableBitSetSizes = new int[30];
+            int mask = 1;
+            int size = mask;
+            for (int i = 0; i < usableBitSetSizes.Length; i++)
+            {
+                size = (size << 1) | mask;
+                usableBitSetSizes[i] = size;
+            }
+        }
+
+        /// <summary>
+        /// Rounds down required maxNumberOfBits to the nearest number that is made up
+        /// of all ones as a binary number.
+        /// Use this method where controlling memory use is paramount.
+        /// </summary>
+        public static int GetNearestSetSize(int maxNumberOfBits)
+        {
+            int result = usableBitSetSizes[0];
+            for (int i = 0; i < usableBitSetSizes.Length; i++)
+            {
+                if (usableBitSetSizes[i] <= maxNumberOfBits)
+                {
+                    result = usableBitSetSizes[i];
+                }
+            }
+            return result;
+        }
+
+        /// <summary>
+        /// Use this method to choose a set size where accuracy (low content saturation) is more important
+        /// than deciding how much memory to throw at the problem.
+        /// </summary>
+        /// <param name="desiredSaturation">A number between 0 and 1 expressing the % of bits set once all values have been recorded</param>
+        /// <returns>The size of the set nearest to the required size</returns>
+        public static int GetNearestSetSize(int maxNumberOfValuesExpected,
+            float desiredSaturation)
+        {
+            // Iterate around the various scales of bitset from smallest to largest looking for the first that
+            // satisfies value volumes at the chosen saturation level
+            for (int i = 0; i < usableBitSetSizes.Length; i++)
+            {
+                int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i]*desiredSaturation);
+                int estimatedNumUniqueValues = GetEstimatedNumberUniqueValuesAllowingForCollisions(
+                    usableBitSetSizes[i], numSetBitsAtDesiredSaturation);
+                if (estimatedNumUniqueValues > maxNumberOfValuesExpected)
+                {
+                    return usableBitSetSizes[i];
+                }
+            }
+            return -1;
+        }
+
+        public static FuzzySet CreateSetBasedOnMaxMemory(int maxNumBytes)
+        {
+            int setSize = GetNearestSetSize(maxNumBytes);
+            return new FuzzySet(new FixedBitSet(setSize + 1), setSize, HashFunctionForVersion(VERSION_CURRENT));
+        }
+
+        public static FuzzySet CreateSetBasedOnQuality(int maxNumUniqueValues, float desiredMaxSaturation)
+        {
+            int setSize = GetNearestSetSize(maxNumUniqueValues, desiredMaxSaturation);
+            return new FuzzySet(new FixedBitSet(setSize + 1), setSize, HashFunctionForVersion(VERSION_CURRENT));
+        }
+
+        private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction)
+        {
+            this.filter = filter;
+            this.bloomSize = bloomSize;
+            this.hashFunction = hashFunction;
+        }
+
+        /// <summary>
+        /// The main method required for a Bloom filter which, given a value, determines set membership.
+        /// Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
+        /// </summary>
+        /// <returns>NO or MAYBE</returns>
+        public ContainsResult Contains(BytesRef value)
+        {
+            int hash = hashFunction.Hash(value);
+            if (hash < 0)
+            {
+                hash = hash*-1;
+            }
+            return MayContainValue(hash);
+        }
+
+        /// <summary>
+        /// Serializes the data set to file using the following format:
+        /// <ul>
+        ///  <li>FuzzySet --&gt; FuzzySetVersion,HashFunctionName,BloomSize,
+        /// NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup></li>
+        /// <li>HashFunctionName --&gt; {@link DataOutput#writeString(String) String} The
+        /// name of a ServiceProvider registered {@link HashFunction}</li>
+        /// <li>FuzzySetVersion --&gt; {@link DataOutput#writeInt Uint32} The version number of the {@link FuzzySet} class</li>
+        /// <li>BloomSize --&gt; {@link DataOutput#writeInt Uint32} The modulo value used
+        /// to project hashes into the field's Bitset</li>
+        /// <li>NumBitSetWords --&gt; {@link DataOutput#writeInt Uint32} The number of
+        /// longs (as returned from {@link FixedBitSet#getBits})</li>
+        /// <li>BitSetWord --&gt; {@link DataOutput#writeLong Long} A long from the array
+        /// returned by {@link FixedBitSet#getBits}</li>
+        /// </ul>
+        /// </summary>
+        /// <param name="output">Data output stream</param>
+        /// <exception cref="System.IO.IOException">If there is a low-level I/O error</exception>
+        public void Serialize(DataOutput output)
+        {
+            output.WriteInt(VERSION_CURRENT);
+            output.WriteInt(bloomSize);
+            long[] bits = filter.GetBits();
+            output.WriteInt(bits.Length);
+            for (int i = 0; i < bits.Length; i++)
+            {
+                // Can't use VLong encoding because it can't cope with negative
+                // numbers output by FixedBitSet
+                output.WriteLong(bits[i]);
+            }
+        }
+
+        public static FuzzySet Deserialize(DataInput input)
+        {
+            int version = input.ReadInt();
+            if (version == VERSION_SPI)
+            {
+                input.ReadString();
+            }
+            HashFunction hashFunction = HashFunctionForVersion(version);
+            int bloomSize = input.ReadInt();
+            int numLongs = input.ReadInt();
+            long[] longs = new long[numLongs];
+            for (int i = 0; i < numLongs; i++)
+            {
+                longs[i] = input.ReadLong();
+            }
+            FixedBitSet bits = new FixedBitSet(longs, bloomSize + 1);
+            return new FuzzySet(bits, bloomSize, hashFunction);
+        }
+
+        private ContainsResult MayContainValue(int positiveHash)
+        {
+            Debug.Assert(positiveHash >= 0);
+
+            // Bloom sizes are always base 2 and so can be ANDed for a fast modulo
+            int pos = positiveHash & bloomSize;
+            if (filter.Get(pos))
+            {
+                // This term may be recorded in this index (but could be a collision)
+                return ContainsResult.Maybe;
+            }
+            // definitely NOT in this segment
+            return ContainsResult.No;
+        }
+
+        /// <summary>
+        /// Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
+        /// chosen size of the internal bitset.
+        /// </summary>
+        /// <param name="value">The key value to be hashed</param>
+        /// <exception cref="System.IO.IOException">If there is a low-level I/O error</exception>
+        public void AddValue(BytesRef value)
+        {
+            int hash = hashFunction.Hash(value);
+            if (hash < 0)
+            {
+                hash = hash*-1;
+            }
+            // Bitmasking using bloomSize is effectively a modulo operation.
+            int bloomPos = hash & bloomSize;
+            filter.Set(bloomPos);
+        }
+
+
+        /// <summary>
+        /// Attempts to pack the accumulated bits into a smaller bitset.
+        /// </summary>
+        /// <param name="targetMaxSaturation">A number between 0 and 1 describing the % of bits that would ideally be set in the
+        /// result. Lower values have better accuracy but require more space.</param>
+        /// <returns>A smaller FuzzySet, or null if the current set is already over-saturated</returns>
+        public FuzzySet Downsize(float targetMaxSaturation)
+        {
+            int numBitsSet = filter.Cardinality();
+            FixedBitSet rightSizedBitSet = filter;
+            int rightSizedBitSetSize = bloomSize;
+            //Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level
+            for (int i = 0; i < usableBitSetSizes.Length; i++)
+            {
+                int candidateBitsetSize = usableBitSetSizes[i];
+                float candidateSaturation = (float) numBitsSet
+                                            /(float) candidateBitsetSize;
+                if (candidateSaturation <= targetMaxSaturation)
+                {
+                    rightSizedBitSetSize = candidateBitsetSize;
+                    break;
+                }
+            }
+            // Re-project the numbers to a smaller space if necessary
+            if (rightSizedBitSetSize < bloomSize)
+            {
+                // Reset the choice of bitset to the smaller version
+                rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1);
+                // Map across the bits from the large set to the smaller one
+                int bitIndex = 0;
+                do
+                {
+                    bitIndex = filter.NextSetBit(bitIndex);
+                    if (bitIndex >= 0)
+                    {
+                        // Project the larger number into a smaller one effectively
+                        // modulo-ing by using the target bitset size as a mask
+                        int downSizedBitIndex = bitIndex & rightSizedBitSetSize;
+                        rightSizedBitSet.Set(downSizedBitIndex);
+                        bitIndex++;
+                    }
+                } while ((bitIndex >= 0) && (bitIndex <= bloomSize));
+            }
+            else
+            {
+                return null;
+            }
+            return new FuzzySet(rightSizedBitSet, rightSizedBitSetSize, hashFunction);
+        }
+
+        public int GetEstimatedUniqueValues()
+        {
+            return GetEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.Cardinality());
+        }
+
+        // Given a set size and the number of set bits, produces an estimate of the number of unique values recorded
+        public static int GetEstimatedNumberUniqueValuesAllowingForCollisions(
+            int setSize, int numRecordedBits)
+        {
+            double setSizeAsDouble = setSize;
+            double numRecordedBitsAsDouble = numRecordedBits;
+            double saturation = numRecordedBitsAsDouble/setSizeAsDouble;
+            double logInverseSaturation = Math.Log(1 - saturation)*-1;
+            return (int) (setSizeAsDouble*logInverseSaturation);
+        }
+
+        public float GetSaturation()
+        {
+            int numBitsSet = filter.Cardinality();
+            return (float) numBitsSet/(float) bloomSize;
+        }
+
+        public long RamBytesUsed()
+        {
+            return RamUsageEstimator.SizeOf(filter.GetBits());
+        }
+    }
+}
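
Two of the tricks above are easy to sanity-check by hand: the all-ones set sizes
turn modulo into a single AND, and the unique-value estimate corrects for hash
collisions. Illustrative numbers only:

    int setSize = (1 << 20) - 1;       // 1048575, an "all ones" usable size
    int pos = 123456789 & setSize;     // AND against an all-ones size == fast modulo
    int bitsSet = 100000;
    double saturation = (double) bitsSet / setSize;          // ~0.0954
    double estUnique = setSize * -Math.Log(1 - saturation);  // ~105,100
    // The estimate exceeds the raw bit count because collisions map some
    // distinct values onto bits that were already set.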

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Bloom/HashFunction.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Bloom/HashFunction.cs b/src/Lucene.Net.Codecs/Bloom/HashFunction.cs
new file mode 100644
index 0000000..9431e1b
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Bloom/HashFunction.cs
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Bloom
+{
+    using Lucene.Net.Util;
+
+    /// <summary>
+    /// Base class for hashing functions that can be referred to by name.
+    /// Subclasses are expected to provide threadsafe implementations of the hash function
+    /// on the range of bytes referenced in the provided <see cref="BytesRef"/>.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public abstract class HashFunction
+    {
+
+        /// <summary>
+        /// Hashes the contents of the referenced bytes.
+        /// </summary>
+        /// <param name="bytes">The data to be hashed</param>
+        /// <returns>The hash of the bytes referenced by bytes.Offset and of length bytes.Length</returns>
+        public abstract int Hash(BytesRef bytes);
+
+    }
+}
\ No newline at end of file
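
To illustrate the pluggability this base class is for, a hypothetical
subclass might wrap any fast 32-bit hash; the sketch below uses FNV-1a
(illustrative only; the class name is invented, and MurmurHash2 below is
what this patch actually ships). It assumes BytesRef exposes Bytes,
Offset and Length as in the rest of this patch:

    using Lucene.Net.Util;

    namespace Lucene.Net.Codecs.Bloom
    {
        // Hypothetical example subclass: 32-bit FNV-1a over the referenced byte range
        public sealed class Fnv1aHash : HashFunction
        {
            public override int Hash(BytesRef bytes)
            {
                unchecked
                {
                    uint h = 2166136261;
                    for (int i = bytes.Offset; i < bytes.Offset + bytes.Length; i++)
                    {
                        h ^= (byte)bytes.Bytes[i];
                        h *= 16777619;
                    }
                    return (int)h;
                }
            }
        }
    }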

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Bloom/MurmurHash2.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Bloom/MurmurHash2.cs b/src/Lucene.Net.Codecs/Bloom/MurmurHash2.cs
new file mode 100644
index 0000000..cb70d5d
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Bloom/MurmurHash2.cs
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Codecs.Bloom
+{
+
+    using Lucene.Net.Util;
+
+    /// <summary>
+    /// This is a very fast, non-cryptographic hash suitable for general hash-based
+    /// lookup. See http://murmurhash.googlepages.com/ for more details.
+    ///
+    /// The C version of MurmurHash 2.0 found at that site was ported to Java by
+    /// Andrzej Bialecki (ab at getopt org).
+    ///
+    /// The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable
+    /// choice of hashing functions, as the core function had to be adapted to work with BytesRefs
+    /// with offsets and lengths rather than raw byte arrays.
+    /// 
+    /// @lucene.experimental
+    /// </summary>
+    public sealed class MurmurHash2 : HashFunction
+    {
+
+        public static readonly MurmurHash2 INSTANCE = new MurmurHash2();
+
+        private MurmurHash2()
+        {
+        }
+
+        public static int Hash(byte[] data, uint seed, int offset, int len)
+        {
+            int m = 0x5bd1e995;
+            int r = 24;
+            int h = (int)(seed ^ (uint)len);
+            int len_4 = len >> 2;
+            for (int i = 0; i < len_4; i++)
+            {
+                int i_4 = offset + (i << 2);
+                int k = data[i_4 + 3];
+                k = k << 8;
+                k = k | (data[i_4 + 2] & 0xff);
+                k = k << 8;
+                k = k | (data[i_4 + 1] & 0xff);
+                k = k << 8;
+                k = k | (data[i_4 + 0] & 0xff);
+                k *= m;
+                k ^= (int)((uint)k >> r); // logical shift, matching Java's >>>
+                k *= m;
+                h *= m;
+                h ^= k;
+            }
+            int len_m = len_4 << 2;
+            int left = len - len_m;
+            if (left != 0)
+            {
+                if (left >= 3)
+                {
+                    h ^= data[offset + len - 3] << 16;
+                }
+                if (left >= 2)
+                {
+                    h ^= data[offset + len - 2] << 8;
+                }
+                if (left >= 1)
+                {
+                    h ^= data[offset + len - 1];
+                }
+                h *= m;
+            }
+            h ^= (int)((uint)h >> 13); // logical shift, matching Java's >>>
+            h *= m;
+            h ^= (int)((uint)h >> 15);
+            return h;
+        }
+
+        /// <summary>
+        /// Generates 32 bit hash from byte array with default seed value.
+        /// </summary>
+        /// <param name="data">byte array to hash</param>
+        /// <param name="offset">the start position in the array to hash</param>
+        /// <param name="len">length of the array elements to hash</param>
+        /// <returns>32 bit hash of the given array</returns>
+        public static int Hash32(byte[] data, int offset, int len)
+        {
+            return Hash(data, 0x9747b28c, offset, len);
+        }
+
+        public override int Hash(BytesRef br)
+        {
+            return Hash32((byte[])(Array)br.Bytes, br.Offset, br.Length);
+        }
+
+    }
+}
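
A quick usage sketch of the two entry points (assuming the BytesRef(string)
convenience constructor from the Java original is available in the port):

    using System;
    using System.Text;
    using Lucene.Net.Codecs.Bloom;
    using Lucene.Net.Util;

    class MurmurDemo
    {
        static void Main()
        {
            byte[] utf8 = Encoding.UTF8.GetBytes("lucene");
            // Static entry point over a raw array slice...
            int h1 = MurmurHash2.Hash32(utf8, 0, utf8.Length);
            // ...and the pluggable HashFunction entry point over a BytesRef
            int h2 = MurmurHash2.INSTANCE.Hash(new BytesRef("lucene"));
            Console.WriteLine(h1 == h2); // True: both hash the same six bytes
        }
    }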

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesFormat.cs b/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesFormat.cs
new file mode 100644
index 0000000..c71295c
--- /dev/null
+++ b/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesFormat.cs
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+namespace Lucene.Net.Codecs.DiskDV
+{
+    using Lucene.Net.Codecs;
+    using Lucene.Net.Codecs.Lucene45;
+    using Lucene.Net.Index;
+    using System;
+
+    /// <summary>
+    /// DocValues format that keeps most things on disk.
+    /// Only things like disk offsets are loaded into RAM.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public sealed class DiskDocValuesFormat : DocValuesFormat
+    {
+
+        public const String DATA_CODEC = "DiskDocValuesData";
+        public const String DATA_EXTENSION = "dvdd";
+        public const String META_CODEC = "DiskDocValuesMetadata";
+        public const String META_EXTENSION = "dvdm";
+
+        public DiskDocValuesFormat() : base("Disk")
+        {
+        }
+
+        public override DocValuesConsumer FieldsConsumer(SegmentWriteState state)
+        {
+            return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+        }
+
+        public override DocValuesProducer FieldsProducer(SegmentReadState state)
+        {
+            return new DiskDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+        }
+
+    }
+}
\ No newline at end of file
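
A format like this is typically selected per field through a codec. A
hedged sketch, assuming the Lucene46Codec port exposes the same
GetDocValuesFormatForField hook as the Java original ("big_sorted_field"
is a made-up field name):

    using Lucene.Net.Codecs;
    using Lucene.Net.Codecs.DiskDV;
    using Lucene.Net.Codecs.Lucene46;

    // Route doc values for one heavyweight field to the on-disk format,
    // leaving every other field on the default format
    public class DiskDVCodec : Lucene46Codec
    {
        private readonly DocValuesFormat disk = new DiskDocValuesFormat();

        public override DocValuesFormat GetDocValuesFormatForField(string field)
        {
            return field == "big_sorted_field" ? disk : base.GetDocValuesFormatForField(field);
        }
    }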

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesProducer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesProducer.cs b/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesProducer.cs
new file mode 100644
index 0000000..a5241be
--- /dev/null
+++ b/src/Lucene.Net.Codecs/DiskDV/DiskDocValuesProducer.cs
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.DiskDV
+{
+    using System;
+    using Lucene.Net.Codecs.Lucene45;
+    using Lucene.Net.Index;
+    using Lucene.Net.Store;
+    using Lucene.Net.Util.Packed;
+
+    public class DiskDocValuesProducer : Lucene45DocValuesProducer
+    {
+
+        public DiskDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec,
+            String metaExtension) :
+                base(state, dataCodec, dataExtension, metaCodec, metaExtension)
+        {
+        }
+
+        protected override MonotonicBlockPackedReader GetAddressInstance(IndexInput data, FieldInfo field,
+            BinaryEntry bytes)
+        {
+            data.Seek(bytes.AddressesOffset);
+            // direct = true: leave the packed addresses on disk rather than loading them into RAM
+            return new MonotonicBlockPackedReader((IndexInput)data.Clone(), bytes.PackedIntsVersion, bytes.BlockSize, bytes.Count,
+                true);
+        }
+
+        protected override MonotonicBlockPackedReader GetIntervalInstance(IndexInput data, FieldInfo field,
+            BinaryEntry bytes)
+        {
+            // Interval-based addressing is never used by this producer
+            throw new InvalidOperationException();
+        }
+
+        protected override MonotonicBlockPackedReader GetOrdIndexInstance(IndexInput data, FieldInfo field,
+            NumericEntry entry)
+        {
+            data.Seek(entry.Offset);
+            // direct = true: leave the packed ordinal index on disk rather than loading it into RAM
+            return new MonotonicBlockPackedReader((IndexInput)data.Clone(), entry.PackedIntsVersion, entry.BlockSize, entry.Count,
+                true);
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/DiskDV/DiskNormsFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/DiskDV/DiskNormsFormat.cs b/src/Lucene.Net.Codecs/DiskDV/DiskNormsFormat.cs
new file mode 100644
index 0000000..91dd77b
--- /dev/null
+++ b/src/Lucene.Net.Codecs/DiskDV/DiskNormsFormat.cs
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.DiskDV
+{
+
+    using System;
+    using Lucene.Net.Codecs;
+    using Lucene.Net.Codecs.Lucene45;
+    using Lucene.Net.Index;
+
+    /// <summary>
+    /// Norms format that keeps all norms on disk
+    /// </summary>
+    public sealed class DiskNormsFormat : NormsFormat
+    {
+        private const String DATA_CODEC = "DiskNormsData";
+        private const String DATA_EXTENSION = "dnvd";
+        private const String META_CODEC = "DiskNormsMetadata";
+        private const String META_EXTENSION = "dnvm";
+
+        public override DocValuesConsumer NormsConsumer(SegmentWriteState state)
+        {
+            return new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+        }
+
+        public override DocValuesProducer NormsProducer(SegmentReadState state)
+        {
+            return new DiskDocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
+        }
+
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexInput.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexInput.cs b/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexInput.cs
new file mode 100644
index 0000000..3594005
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexInput.cs
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Codecs.Sep; // IntIndexInput is in the Sep namespace in the Java original
+using Lucene.Net.Store;
+
+namespace Lucene.Net.Codecs.Intblock
+{
+
+    /// <summary>
+    /// Naive int block API that writes vInts.  This is
+    /// expected to give poor performance; it's really only for
+    /// testing the pluggability.  One should typically use pfor instead.
+    ///
+    /// Abstract base class that reads fixed-size blocks of ints
+    /// from an IndexInput.  While this is a simple approach, a
+    /// more performant approach would directly create an impl
+    /// of IntIndexInput inside Directory.  Wrapping a generic
+    /// IndexInput will likely cost performance.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public abstract class FixedIntBlockIndexInput : IntIndexInput
+    {
+        private readonly IndexInput input;
+        protected readonly int BlockSize;
+
+        protected FixedIntBlockIndexInput(IndexInput input)
+        {
+            this.input = input;
+            BlockSize = input.ReadVInt();
+        }
+
+        public override IntIndexInput.Reader Reader()
+        {
+            var buffer = new int[BlockSize];
+            var clone = (IndexInput) input.Clone();
+
+            // TODO: can this be simplified?
+            return new Reader(clone, buffer, GetBlockReader(clone, buffer));
+        }
+
+        public override void Close()
+        {
+            input.Close();
+        }
+
+        public override IntIndexInput.Index Index()
+        {
+            return new Index();
+        }
+
+        protected abstract IBlockReader GetBlockReader(IndexInput input, int[] buffer);
+    }
+}
\ No newline at end of file
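
A concrete subclass only has to supply the block decoder. A minimal
sketch modeled on the vInt test codec from the Java original (the class
names here are invented for illustration):

    using Lucene.Net.Store;

    namespace Lucene.Net.Codecs.Intblock
    {
        // Hypothetical subclass: decodes each fixed-size block as plain vInts
        public class VIntFixedIntBlockIndexInput : FixedIntBlockIndexInput
        {
            public VIntFixedIntBlockIndexInput(IndexInput input) : base(input)
            {
            }

            protected override IBlockReader GetBlockReader(IndexInput input, int[] buffer)
            {
                return new VIntBlockReader(input, buffer);
            }

            private class VIntBlockReader : IBlockReader
            {
                private readonly IndexInput input;
                private readonly int[] buffer;

                public VIntBlockReader(IndexInput input, int[] buffer)
                {
                    this.input = input;
                    this.buffer = buffer;
                }

                public void ReadBlock()
                {
                    // Refill the shared buffer with one block of vInt-encoded values
                    for (int i = 0; i < buffer.Length; i++)
                    {
                        buffer[i] = input.ReadVInt();
                    }
                }
            }
        }
    }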

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexOutput.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexOutput.cs b/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexOutput.cs
new file mode 100644
index 0000000..a83fd38
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Intblock/FixedIntBlockIndexOutput.cs
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Diagnostics;
+using Lucene.Net.Codecs.Sep; // IntIndexOutput is in the Sep namespace in the Java original
+using Lucene.Net.Store;
+
+namespace Lucene.Net.Codecs.Intblock
+{
+    /// <summary>
+    /// Naive int block API that writes vInts.  This is
+    /// expected to give poor performance; it's really only for
+    /// testing the pluggability.  One should typically use pfor instead.
+    ///
+    /// Abstract base class that writes fixed-size blocks of ints
+    /// to an IndexOutput.  While this is a simple approach, a
+    /// more performant approach would directly create an impl
+    /// of IntIndexOutput inside Directory.  Wrapping a generic
+    /// IndexOutput will likely cost performance.
+    ///
+    /// @lucene.experimental
+    /// </summary>
+    public abstract class FixedIntBlockIndexOutput : IntIndexOutput
+    {
+        protected readonly IndexOutput output;
+        private readonly int blockSize;
+        protected readonly int[] buffer;
+        private int upto;
+
+        protected FixedIntBlockIndexOutput(IndexOutput output, int fixedBlockSize)
+        {
+            blockSize = fixedBlockSize;
+            this.output = output;
+            output.WriteVInt(blockSize);
+            buffer = new int[blockSize];
+        }
+
+        protected abstract void FlushBlock();
+
+        public override IntIndexOutput.Index Index()
+        {
+            return new OutputIndex(this);
+        }
+
+        // Renamed from the Java inner class "Index" to avoid clashing with the Index() method
+        private class OutputIndex : IntIndexOutput.Index
+        {
+            private readonly FixedIntBlockIndexOutput outer;
+            private long fp;
+            private int upto;
+            private long lastFP;
+            private int lastUpto;
+
+            public OutputIndex(FixedIntBlockIndexOutput outer)
+            {
+                this.outer = outer;
+            }
+
+            public override void Mark()
+            {
+                fp = outer.output.FilePointer;
+                upto = outer.upto;
+            }
+
+            public override void CopyFrom(IntIndexOutput.Index other, bool copyLast)
+            {
+                OutputIndex idx = (OutputIndex) other;
+                fp = idx.fp;
+                upto = idx.upto;
+                if (copyLast)
+                {
+                    lastFP = fp;
+                    lastUpto = upto;
+                }
+            }
+
+            public override void Write(DataOutput indexOut, bool absolute)
+            {
+                if (absolute)
+                {
+                    indexOut.WriteVInt(upto);
+                    indexOut.WriteVLong(fp);
+                }
+                else if (fp == lastFP)
+                {
+                    // same block
+                    Debug.Assert(upto >= lastUpto);
+                    int uptoDelta = upto - lastUpto;
+                    indexOut.WriteVInt(uptoDelta << 1 | 1);
+                }
+                else
+                {
+                    // new block
+                    indexOut.WriteVInt(upto << 1);
+                    indexOut.WriteVLong(fp - lastFP);
+                }
+                lastUpto = upto;
+                lastFP = fp;
+            }
+
+            public override String ToString()
+            {
+                return "fp=" + fp + " upto=" + upto;
+            }
+        }
+
+        public override void Write(int v)
+        {
+            buffer[upto++] = v;
+            if (upto == blockSize)
+            {
+                FlushBlock();
+                upto = 0;
+            }
+        }
+
+        public override void Close()
+        {
+            try
+            {
+                if (upto > 0)
+                {
+                    // NOTE: entries in the block after the current upto are invalid
+                    FlushBlock();
+                }
+            }
+            finally
+            {
+                output.Close();
+            }
+        }
+    }
+}
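
The matching writer side, as a sketch: a subclass that flushes each
block as plain vInts (mirroring the hypothetical reader above; the class
name is invented for illustration):

    using Lucene.Net.Store;

    namespace Lucene.Net.Codecs.Intblock
    {
        // Hypothetical subclass: encodes each fixed-size block as plain vInts
        public class VIntFixedIntBlockIndexOutput : FixedIntBlockIndexOutput
        {
            public VIntFixedIntBlockIndexOutput(IndexOutput output, int blockSize)
                : base(output, blockSize)
            {
            }

            protected override void FlushBlock()
            {
                // Write the entire buffer; on Close() any entries past the last
                // written position are garbage the reader never consumes
                for (int i = 0; i < buffer.Length; i++)
                {
                    output.WriteVInt(buffer[i]);
                }
            }
        }
    }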

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Intblock/IBlockReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Intblock/IBlockReader.cs b/src/Lucene.Net.Codecs/Intblock/IBlockReader.cs
new file mode 100644
index 0000000..adf14ec
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Intblock/IBlockReader.cs
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Intblock
+{
+    /// <summary>
+    /// Interface for fixed-size block decoders
+    /// 
+    /// Implementations should decode into the buffer in <see cref="ReadBlock"/>.
+    /// </summary>
+    public interface IBlockReader
+    {
+         void ReadBlock();
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Intblock/Index.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Intblock/Index.cs b/src/Lucene.Net.Codecs/Intblock/Index.cs
new file mode 100644
index 0000000..67710a6
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Intblock/Index.cs
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Intblock
+{
+    using System;
+    using Lucene.Net.Codecs.Sep; // IntIndexInput is in the Sep namespace in the Java original
+    using Lucene.Net.Store;
+
+    internal class Index : IntIndexInput.Index
+    {
+        private long fp;
+        private int upto;
+
+        public override void Read(DataInput indexIn, bool absolute)
+        {
+            if (absolute)
+            {
+                upto = indexIn.ReadVInt();
+                fp = indexIn.ReadVLong();
+            }
+            else
+            {
+                int uptoDelta = indexIn.ReadVInt();
+                if ((uptoDelta & 1) == 1)
+                {
+                    // same block
+                    upto += (int)((uint)uptoDelta >> 1);
+                }
+                else
+                {
+                    // new block
+                    upto = (int)((uint)uptoDelta >> 1);
+                    fp += indexIn.ReadVLong();
+                }
+            }
+            // The Java original asserts upto < blockSize here; blockSize lives on
+            // the owning FixedIntBlockIndexInput and is not visible from this class.
+        }
+
+        public override void Seek(IntIndexInput.Reader other)
+        {
+            ((Reader) other).Seek(fp, upto);
+        }
+
+        public override void CopyFrom(IntIndexInput.Index other)
+        {
+            Index idx = (Index) other;
+            fp = idx.fp;
+            upto = idx.upto;
+        }
+
+        public override IntIndexInput.Index Clone()
+        {
+            Index other = new Index();
+            other.fp = fp;
+            other.upto = upto;
+            return other;
+        }
+
+        public override String ToString()
+        {
+            return "fp=" + fp + " upto=" + upto;
+        }
+    }
+}
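
For reference, the encoding that Read undoes: the writer shifts upto left
one bit and uses the low bit as a "same block" flag, with a vLong file
pointer delta following only for new blocks. A tiny worked sketch of the
upto decode:

    static class BlockIndexEncodingDemo
    {
        // Same-block entry: writer emitted (uptoDelta << 1) | 1 (low bit set).
        // New-block entry: writer emitted upto << 1 (low bit clear), then a vLong fp delta.
        static int DecodeUpto(int encoded, int currentUpto)
        {
            return (encoded & 1) == 1
                ? currentUpto + (int)((uint)encoded >> 1)  // same block: add delta
                : (int)((uint)encoded >> 1);               // new block: absolute position
        }

        static void Main()
        {
            // Writer marked upto=5, then upto=8 within the same block: emits (3 << 1) | 1 = 7
            System.Console.WriteLine(DecodeUpto(7, 5)); // 8
            // Writer started a new block at upto=2: emits 2 << 1 = 4
            System.Console.WriteLine(DecodeUpto(4, 5)); // 2
        }
    }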

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1da1cb5b/src/Lucene.Net.Codecs/Intblock/Reader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Intblock/Reader.cs b/src/Lucene.Net.Codecs/Intblock/Reader.cs
new file mode 100644
index 0000000..8e0eb1d
--- /dev/null
+++ b/src/Lucene.Net.Codecs/Intblock/Reader.cs
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Codecs.Intblock
+{
+    using System.Diagnostics;
+    using Lucene.Net.Codecs.Sep; // IntIndexInput is in the Sep namespace in the Java original
+    using Lucene.Net.Store;
+
+    internal class Reader : IntIndexInput.Reader
+    {
+        private readonly IndexInput input;
+        private readonly IBlockReader blockReader;
+        private readonly int blockSize;
+        private readonly int[] pending;
+
+        private int upto;
+        private bool seekPending;
+        private long pendingFP;
+        private long lastBlockFP = -1;
+
+        public Reader(IndexInput input, int[] pending, IBlockReader blockReader)
+        {
+            this.input = input;
+            this.pending = pending;
+            this.blockSize = pending.Length;
+            this.blockReader = blockReader;
+            upto = blockSize;
+        }
+
+        internal void Seek(long fp, int upto)
+        {
+            Debug.Assert(upto < blockSize);
+            if (seekPending || fp != lastBlockFP)
+            {
+                pendingFP = fp;
+                seekPending = true;
+            }
+            this.upto = upto;
+        }
+
+        public override int Next()
+        {
+            if (seekPending)
+            {
+                // Seek & load new block
+                input.Seek(pendingFP);
+                lastBlockFP = pendingFP;
+                blockReader.ReadBlock();
+                seekPending = false;
+            }
+            else if (upto == blockSize)
+            {
+                // Load new block
+                lastBlockFP = input.FilePointer;
+                blockReader.ReadBlock();
+                upto = 0;
+            }
+            return pending[upto++];
+        }
+    }
+}


Mime
View raw message