http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs b/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs
deleted file mode 100644
index 26fd586..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-
-namespace Lucene.Net.Index
-{
-
- /// <summary>
- /// Compares <see cref="TermVectorEntry"/> instances first by frequency (higher frequency
- /// sorts first), then by the term (ordinal, case-sensitive) and finally by the field name.
- /// </summary>
- public class TermVectorEntryFreqSortedComparator : System.Collections.Generic.IComparer<TermVectorEntry>
- {
-     /// <summary>Orders two entries for a frequency-sorted listing.</summary>
-     /// <param name="entry">The first entry to compare.</param>
-     /// <param name="entry1">The second entry to compare.</param>
-     /// <returns>
-     /// A negative value when <paramref name="entry"/> sorts before <paramref name="entry1"/>,
-     /// a positive value when it sorts after, and zero when frequency, term and field all match.
-     /// </returns>
-     public virtual int Compare(TermVectorEntry entry, TermVectorEntry entry1)
-     {
-         // Operands are deliberately reversed: the larger frequency must come first.
-         int result = entry1.Frequency - entry.Frequency;
-         if (result != 0)
-         {
-             return result;
-         }
-
-         // Same frequency: fall back to an ordinal comparison of the term text.
-         result = String.CompareOrdinal(entry.Term, entry1.Term);
-         if (result != 0)
-         {
-             return result;
-         }
-
-         // Same frequency and term: the field name is the last tie-breaker.
-         return String.CompareOrdinal(entry.Field, entry1.Field);
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorMapper.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorMapper.cs b/src/Lucene.Net.Core/Index/TermVectorMapper.cs
deleted file mode 100644
index e006385..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorMapper.cs
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-
-namespace Lucene.Net.Index
-{
-
- /// <summary>
- /// A TermVectorMapper receives term vector data one term at a time, so a caller can build
- /// its own structure instead of the default parallel array structure.
- /// <para>
- /// It is up to the implementation to make sure it is thread-safe.
- /// </para>
- /// </summary>
- public abstract class TermVectorMapper
- {
-     private bool ignoringPositions;
-     private bool ignoringOffsets;
-
-     /// <summary>Creates a mapper that ignores neither positions nor offsets.</summary>
-     protected internal TermVectorMapper()
-         : this(false, false)
-     {
-     }
-
-     /// <summary>Creates a mapper with explicit ignore settings.</summary>
-     /// <param name="ignoringPositions">true if this mapper should tell Lucene to ignore positions even if they are stored</param>
-     /// <param name="ignoringOffsets">similar to <paramref name="ignoringPositions"/>, but for offsets</param>
-     protected internal TermVectorMapper(bool ignoringPositions, bool ignoringOffsets)
-     {
-         this.ignoringOffsets = ignoringOffsets;
-         this.ignoringPositions = ignoringPositions;
-     }
-
-     /// <summary>
-     /// Tell the mapper what to expect in regards to field, number of terms, offset and position storage.
-     /// This method will be called once before retrieving the vector for a field.
-     /// It is called before <see cref="Map(string,int,TermVectorOffsetInfo[],int[])"/>.
-     /// </summary>
-     /// <param name="field">The field the vector is for</param>
-     /// <param name="numTerms">The number of terms that need to be mapped</param>
-     /// <param name="storeOffsets">true if the mapper should expect offset information</param>
-     /// <param name="storePositions">true if the mapper should expect positions info</param>
-     public abstract void SetExpectations(string field, int numTerms, bool storeOffsets, bool storePositions);
-
-     /// <summary>Map the Term Vector information into your own structure</summary>
-     /// <param name="term">The term to add to the vector</param>
-     /// <param name="frequency">The frequency of the term in the document</param>
-     /// <param name="offsets">null if the offset is not specified, otherwise the offset into the field of the term</param>
-     /// <param name="positions">null if the position is not specified, otherwise the position in the field of the term</param>
-     public abstract void Map(string term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions);
-
-     /// <summary>
-     /// Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they
-     /// can be skipped over. Derived classes should set this to true if they want to ignore positions. The default
-     /// is false, meaning positions will be loaded if they are stored.
-     /// </summary>
-     /// <value>false unless set through the two-argument constructor or overridden</value>
-     public virtual bool IsIgnoringPositions
-     {
-         get
-         {
-             return this.ignoringPositions;
-         }
-     }
-
-     /// <summary>
-     /// Same principle as <see cref="IsIgnoringPositions"/>, but applied to offsets. false by default.
-     /// </summary>
-     /// <value>false unless set through the two-argument constructor or overridden</value>
-     public virtual bool IsIgnoringOffsets
-     {
-         get
-         {
-             return this.ignoringOffsets;
-         }
-     }
-
-     /// <summary>
-     /// Passes down the index of the document whose term vector is currently being mapped,
-     /// once for each top level call to a term vector reader.
-     /// <para>
-     /// Default implementation IGNORES the document number. Override if your implementation needs the document number.
-     /// </para>
-     /// <para>
-     /// NOTE: Document numbers are internal to Lucene and subject to change depending on indexing operations.
-     /// </para>
-     /// </summary>
-     /// <param name="documentNumber">index of document currently being mapped</param>
-     public virtual void SetDocumentNumber(int documentNumber)
-     {
-         // Intentionally empty: the base mapper does not track the document number.
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs b/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs
deleted file mode 100644
index 1f9d7d2..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-
-using System.Runtime.InteropServices;
-
-namespace Lucene.Net.Index
-{
-
- /// <summary>
- /// Holds the start and end offset of one occurrence of a term in a term vector.
- /// This offset information is the character offset as set during the Analysis phase
- /// (and thus may not be the actual offset in the original content).
- /// </summary>
- [Serializable]
- public struct TermVectorOffsetInfo : IEquatable<TermVectorOffsetInfo>
- {
-     /// <summary>Convenience declaration when creating a term vector that stores only position information.</summary>
-     [NonSerialized]
-     public static readonly TermVectorOffsetInfo[] EMPTY_OFFSET_INFO = new TermVectorOffsetInfo[0];
-
-     /// <summary>
-     /// Sentinel value with both offsets set to <see cref="int.MinValue"/>. It is the only
-     /// value that <see cref="Equals(object)"/> treats as equal to a null reference.
-     /// </summary>
-     [NonSerialized]
-     public static readonly TermVectorOffsetInfo Null = new TermVectorOffsetInfo(int.MinValue, int.MinValue);
-
-     private int startOffset;
-     private int endOffset;
-
-     /// <summary>Creates an offset pair.</summary>
-     /// <param name="startOffset">The starting offset of the term.</param>
-     /// <param name="endOffset">The ending offset of the term.</param>
-     public TermVectorOffsetInfo(int startOffset, int endOffset)
-     {
-         this.endOffset = endOffset;
-         this.startOffset = startOffset;
-     }
-
-     /// <summary>The accessor for the ending offset for the term</summary>
-     /// <value>The offset</value>
-     public int EndOffset
-     {
-         get { return endOffset; }
-         set { this.endOffset = value; }
-     }
-
-     /// <summary>The accessor for the starting offset of the term.</summary>
-     /// <value>The offset</value>
-     public int StartOffset
-     {
-         get { return startOffset; }
-         set { this.startOffset = value; }
-     }
-
-     /// <summary>Two TermVectorOffsetInfos are equal if both the start and end offsets are the same.</summary>
-     /// <param name="other">The value to compare with.</param>
-     /// <returns>true if both offsets match.</returns>
-     public bool Equals(TermVectorOffsetInfo other)
-     {
-         return startOffset == other.startOffset && endOffset == other.endOffset;
-     }
-
-     /// <summary>
-     /// Compares against an arbitrary object. A null reference is considered equal only to a
-     /// value whose offsets are both <see cref="int.MinValue"/> (see <see cref="Null"/>).
-     /// </summary>
-     /// <param name="obj">The comparison object; may be null.</param>
-     /// <returns>true if <paramref name="obj"/> is an equal TermVectorOffsetInfo, or is null and this is the sentinel.</returns>
-     public override bool Equals(object obj)
-     {
-         if (ReferenceEquals(null, obj))
-         {
-             return EndOffset == int.MinValue && StartOffset == int.MinValue;
-         }
-         if (obj.GetType() != typeof (TermVectorOffsetInfo))
-         {
-             return false;
-         }
-         return Equals((TermVectorOffsetInfo) obj);
-     }
-
-     /// <summary>Hash code derived from both offsets, consistent with <see cref="Equals(TermVectorOffsetInfo)"/>.</summary>
-     public override int GetHashCode()
-     {
-         unchecked
-         {
-             return (startOffset*397) ^ endOffset;
-         }
-     }
-
-     /// <summary>Equality operator; delegates to <see cref="Equals(object)"/>, so a null right-hand side matches only the sentinel.</summary>
-     public static bool operator ==(TermVectorOffsetInfo left, object right)
-     {
-         return left.Equals(right);
-     }
-
-     /// <summary>Inequality operator; the negation of <c>==</c>.</summary>
-     public static bool operator !=(TermVectorOffsetInfo left, object right)
-     {
-         return !left.Equals(right);
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorsReader.cs b/src/Lucene.Net.Core/Index/TermVectorsReader.cs
deleted file mode 100644
index 23677a9..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorsReader.cs
+++ /dev/null
@@ -1,731 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-
-using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
-using Directory = Lucene.Net.Store.Directory;
-using IndexInput = Lucene.Net.Store.IndexInput;
-
-namespace Lucene.Net.Index
-{
- class TermVectorsReader : System.ICloneable, IDisposable
- {
-
- // NOTE: if you make a new format, it must be larger than the current format:
- // CheckValidFormat rejects any on-disk version greater than FORMAT_CURRENT.
- internal const int FORMAT_VERSION = 2;
-
- // Changes to speed up bulk merging of term vectors: from this version on a tvx entry is 16 bytes instead of 8 (see SeekTvx)
- internal const int FORMAT_VERSION2 = 3;
-
- // Changed strings to UTF8 with length-in-bytes not length-in-chars
- internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4;
-
- // NOTE: always change this if you switch to a new format!
- internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;
-
- // The size in bytes that the format version int takes up at the beginning of each file
- internal const int FORMAT_SIZE = 4;
-
- internal const byte STORE_POSITIONS_WITH_TERMVECTOR = (byte) (0x1);
- internal const byte STORE_OFFSET_WITH_TERMVECTOR = (byte) (0x2);
-
- private FieldInfos fieldInfos;
-
- private IndexInput tvx;
- private IndexInput tvd;
- private IndexInput tvf;
- private int size;
- private int numTotalDocs;
-
- // The docID offset where our docs begin in the index
- // file. This will be 0 if we have our own private file.
- private int docStoreOffset;
-
- private int format;
- private bool isDisposed;
-
- /// <summary>
- /// Opens the term vector files of <paramref name="segment"/> with the default
- /// <c>BufferedIndexInput.BUFFER_SIZE</c> read buffer.
- /// </summary>
- internal TermVectorsReader(Directory d, string segment, FieldInfos fieldInfos)
-     : this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE)
- {
- }
-
- /// <summary>
- /// Opens the term vector files of <paramref name="segment"/> with an explicit read buffer size.
- /// Passes -1 as the doc store offset, which the full constructor treats as "start at
- /// document 0 and cover every document in the index file".
- /// </summary>
- internal TermVectorsReader(Directory d, string segment, FieldInfos fieldInfos, int readBufferSize)
-     : this(d, segment, fieldInfos, readBufferSize, -1, 0)
- {
- }
-
- /// <summary>
- /// Opens the tvx, tvd and tvf files of a segment, validates their format headers and
- /// works out how many documents the index file covers. If the tvx file does not exist
- /// the reader is left without streams and with format 0.
- /// </summary>
- /// <param name="d">Directory holding the segment files.</param>
- /// <param name="segment">Segment name; the file names are this name plus "." plus the extension.</param>
- /// <param name="fieldInfos">Field metadata used to translate between field names and numbers.</param>
- /// <param name="readBufferSize">Buffer size handed to every <c>OpenInput</c> call.</param>
- /// <param name="docStoreOffset">Doc ID offset at which this reader's docs begin, or -1 to start at 0 and cover the whole file.</param>
- /// <param name="size">Number of documents; only used when <paramref name="docStoreOffset"/> is not -1.</param>
- internal TermVectorsReader(Directory d, string segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size)
- {
-     bool initialized = false;
-
-     try
-     {
-         string indexFileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
-         if (!d.FileExists(indexFileName))
-         {
-             // If all documents flushed in a segment had hit
-             // non-aborting exceptions, it's possible that
-             // FieldInfos.hasVectors returns true yet the term
-             // vector files don't exist.
-             format = 0;
-         }
-         else
-         {
-             tvx = d.OpenInput(indexFileName, readBufferSize);
-             format = CheckValidFormat(tvx);
-
-             tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize);
-             int tvdFormat = CheckValidFormat(tvd);
-
-             tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize);
-             int tvfFormat = CheckValidFormat(tvf);
-
-             System.Diagnostics.Debug.Assert(format == tvdFormat);
-             System.Diagnostics.Debug.Assert(format == tvfFormat);
-
-             // A tvx entry is 16 bytes (shift 4) from FORMAT_VERSION2 on, 8 bytes (shift 3) before.
-             int entryShift = format >= FORMAT_VERSION2 ? 4 : 3;
-             System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % (1 << entryShift) == 0);
-             numTotalDocs = (int) (tvx.Length() >> entryShift);
-
-             if (docStoreOffset == -1)
-             {
-                 this.docStoreOffset = 0;
-                 this.size = numTotalDocs;
-                 System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size);
-             }
-             else
-             {
-                 this.docStoreOffset = docStoreOffset;
-                 this.size = size;
-                 // Verify the file is long enough to hold all of our
-                 // docs
-                 System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset);
-             }
-         }
-
-         this.fieldInfos = fieldInfos;
-         initialized = true;
-     }
-     finally
-     {
-         // With lock-less commits, it's entirely possible (and
-         // fine) to hit a FileNotFound exception above. In
-         // this case, we want to explicitly close any subset
-         // of things that were opened so that we don't have to
-         // wait for a GC to do so.
-         if (!initialized)
-         {
-             Dispose();
-         }
-     }
- }
-
- /// <summary>Exposes the tvd input; used for bulk copy when merging. May be null when no term vector files were opened.</summary>
- internal virtual IndexInput GetTvdStream()
- {
-     return this.tvd;
- }
-
- /// <summary>Exposes the tvf input; used for bulk copy when merging. May be null when no term vector files were opened.</summary>
- internal virtual IndexInput GetTvfStream()
- {
-     return this.tvf;
- }
-
- /// <summary>
- /// Positions the tvx input on the entry of <paramref name="docNum"/>, skipping the
- /// format header and allowing for this reader's doc store offset.
- /// </summary>
- private void SeekTvx(int docNum)
- {
-     // 8 bytes per entry before FORMAT_VERSION2, 16 bytes from then on.
-     long bytesPerEntry = format < FORMAT_VERSION2 ? 8L : 16L;
-     tvx.Seek((docNum + docStoreOffset) * bytesPerEntry + FORMAT_SIZE);
- }
-
- /// <summary>Reports whether the on-disk format is at least <c>FORMAT_UTF8_LENGTH_IN_BYTES</c>.</summary>
- internal virtual bool CanReadRawDocs()
- {
-     bool olderThanUtf8Format = format < FORMAT_UTF8_LENGTH_IN_BYTES;
-     return !olderThanUtf8Format;
- }
-
- /// <summary>
- /// Retrieve the length (in bytes) of the tvd and tvf
- /// entries for the next numDocs starting with
- /// startDocID. This is used for bulk copying when
- /// merging segments, if the field numbers are
- /// congruent. Once this returns, the tvf and tvd streams
- /// are seeked to the startDocID.
- /// </summary>
- /// <param name="tvdLengths">Receives the tvd byte length of each document; zero-filled when there are no term vector files.</param>
- /// <param name="tvfLengths">Receives the tvf byte length of each document; zero-filled when there are no term vector files.</param>
- /// <param name="startDocID">First document, relative to this reader's doc store offset.</param>
- /// <param name="numDocs">Number of consecutive documents to measure.</param>
- /// <exception cref="InvalidOperationException">The on-disk format is older than <c>FORMAT_VERSION2</c>.</exception>
- internal void RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs)
- {
-     if (tvx == null)
-     {
-         // No term vector files: every document has zero-length entries.
-         Array.Clear(tvdLengths, 0, tvdLengths.Length);
-         Array.Clear(tvfLengths, 0, tvfLengths.Length);
-         return;
-     }
-
-     // SegmentMerger calls canReadRawDocs() first and should
-     // not call us if that returns false.
-     if (format < FORMAT_VERSION2)
-     {
-         // InvalidOperationException derives from SystemException, so existing
-         // catch (SystemException) handlers keep working.
-         throw new InvalidOperationException("cannot read raw docs with older term vector formats");
-     }
-
-     SeekTvx(startDocID);
-
-     long tvdPosition = tvx.ReadLong();
-     tvd.Seek(tvdPosition);
-
-     long tvfPosition = tvx.ReadLong();
-     tvf.Seek(tvfPosition);
-
-     long lastTvdPosition = tvdPosition;
-     long lastTvfPosition = tvfPosition;
-
-     for (int count = 0; count < numDocs; count++)
-     {
-         // Index of the NEXT document: its start pointers mark where the current one ends.
-         int docID = docStoreOffset + startDocID + count + 1;
-         System.Diagnostics.Debug.Assert(docID <= numTotalDocs);
-         if (docID < numTotalDocs)
-         {
-             tvdPosition = tvx.ReadLong();
-             tvfPosition = tvx.ReadLong();
-         }
-         else
-         {
-             // The last document in the file ends where the data files end.
-             tvdPosition = tvd.Length();
-             tvfPosition = tvf.Length();
-             System.Diagnostics.Debug.Assert(count == numDocs - 1);
-         }
-         tvdLengths[count] = (int) (tvdPosition - lastTvdPosition);
-         tvfLengths[count] = (int) (tvfPosition - lastTvfPosition);
-         lastTvdPosition = tvdPosition;
-         lastTvfPosition = tvfPosition;
-     }
- }
-
- /// <summary>Reads the format version int from <paramref name="in_Renamed"/> and rejects versions newer than <c>FORMAT_CURRENT</c>.</summary>
- /// <returns>The version that was read.</returns>
- /// <exception cref="CorruptIndexException">The version is greater than <c>FORMAT_CURRENT</c>.</exception>
- private int CheckValidFormat(IndexInput in_Renamed)
- {
-     int version = in_Renamed.ReadInt();
-     if (version <= FORMAT_CURRENT)
-     {
-         return version;
-     }
-     throw new CorruptIndexException("Incompatible format version: " + version + " expected " + FORMAT_CURRENT + " or less");
- }
-
- /// <summary>Closes the tvx, tvd and tvf inputs that were opened.</summary>
- public void Dispose()
- {
-     Dispose(true);
-     // Standard dispose pattern: this type has no finalizer, but Dispose(bool) is virtual,
-     // so a subclass that adds one should not be finalized after an explicit Dispose.
-     GC.SuppressFinalize(this);
- }
-
- /// <summary>
- /// Closes every input that was opened. All three closes are attempted even if one fails;
- /// the first failure is then rethrown wrapped in a new <see cref="System.IO.IOException"/>.
- /// </summary>
- /// <param name="disposing">true when called from <see cref="Dispose()"/>; the inputs are only closed in that case.</param>
- /// <exception cref="System.IO.IOException">At least one input failed to close; the first failure is the inner exception.</exception>
- protected virtual void Dispose(bool disposing)
- {
-     if (isDisposed)
-     {
-         return;
-     }
-
-     // Mark as disposed up front so a failed close does not make a later Dispose call
-     // close the same inputs a second time.
-     isDisposed = true;
-
-     if (!disposing)
-     {
-         return;
-     }
-
-     // make all effort to close up. Keep the first exception
-     // and throw it as a new one.
-     System.IO.IOException keep = null;
-     keep = CloseKeepingFirstFailure(tvx, keep);
-     keep = CloseKeepingFirstFailure(tvd, keep);
-     keep = CloseKeepingFirstFailure(tvf, keep);
-
-     if (keep != null)
-     {
-         // Preserve the original message and chain the original exception instead of
-         // using its stack trace text as the message.
-         throw new System.IO.IOException(keep.Message, keep);
-     }
- }
-
- /// <summary>Closes <paramref name="input"/> if it is not null and returns the first failure seen so far.</summary>
- private static System.IO.IOException CloseKeepingFirstFailure(IndexInput input, System.IO.IOException firstFailure)
- {
-     if (input == null)
-     {
-         return firstFailure;
-     }
-     try
-     {
-         input.Close();
-     }
-     catch (System.IO.IOException e)
-     {
-         if (firstFailure == null)
-         {
-             firstFailure = e;
-         }
-     }
-     return firstFailure;
- }
-
- /// <summary>Returns the document count recorded when this reader was constructed.</summary>
- /// <returns>The number of documents in the reader</returns>
- internal virtual int Size()
- {
-     return this.size;
- }
-
- /// <summary>
- /// Feeds the term vector of one field of one document to <paramref name="mapper"/>.
- /// Does nothing when there are no term vector files or the document has no vector for the field.
- /// </summary>
- /// <param name="docNum">The document number to retrieve the vector for</param>
- /// <param name="field">The field within the document to retrieve</param>
- /// <param name="mapper">Receives the document number and then the field's terms.</param>
- public virtual void Get(int docNum, string field, TermVectorMapper mapper)
- {
-     if (tvx == null)
-     {
-         // No tvx file: nothing to map.
-         return;
-     }
-
-     int wantedFieldNumber = fieldInfos.FieldNumber(field);
-
-     // SeekTvx accounts for FORMAT_SIZE; later seeks use pointers that were
-     // written into another file and already include it.
-     SeekTvx(docNum);
-     long tvdPointer = tvx.ReadLong();
-
-     tvd.Seek(tvdPointer);
-     int numFields = tvd.ReadVInt();
-
-     // There are only a few fields per document. We opt for a full scan
-     // rather then requiring that they be ordered. We need to read through
-     // all of the fields anyway to get to the tvf pointers.
-     int currentNumber = 0;
-     int matchIndex = -1;
-     for (int i = 0; i < numFields; i++)
-     {
-         int stored = tvd.ReadVInt();
-         // From FORMAT_VERSION on the number is stored as is; before that it is a delta.
-         currentNumber = format >= FORMAT_VERSION ? stored : currentNumber + stored;
-
-         if (currentNumber == wantedFieldNumber)
-         {
-             matchIndex = i;
-         }
-     }
-
-     if (matchIndex == -1)
-     {
-         // This field, although valid in the segment, was not found in this document.
-         return;
-     }
-
-     // Compute position in the tvf file: the first pointer, plus one delta per field before the match.
-     long tvfPointer = format >= FORMAT_VERSION2 ? tvx.ReadLong() : tvd.ReadVLong();
-     for (int skipped = 0; skipped < matchIndex; skipped++)
-     {
-         tvfPointer += tvd.ReadVLong();
-     }
-
-     mapper.SetDocumentNumber(docNum);
-     ReadTermVector(field, tvfPointer, mapper);
- }
-
-
-
- /// <summary>Retrieve the term vector for the given document and field</summary>
- /// <param name="docNum">The document number to retrieve the vector for</param>
- /// <param name="field">The field within the document to retrieve</param>
- /// <returns>The TermFreqVector for the document and field or null if there is no termVector for this field.</returns>
- /// <exception cref="System.IO.IOException">if there is an error reading the term vector files</exception>
- public /*internal*/ virtual ITermFreqVector Get(int docNum, string field)
- {
-     // A mapper that was never fed any data materializes to null, which covers
-     // segments without term vectors and documents without this field.
-     var collector = new ParallelArrayTermVectorMapper();
-     Get(docNum, field, collector);
-     ITermFreqVector vector = collector.MaterializeVector();
-     return vector;
- }
-
- /// <summary>
- /// Reads <paramref name="fieldCount"/> field numbers from tvd and resolves them to field names.
- /// The caller has to pre-seek tvd to the right point.
- /// </summary>
- private string[] ReadFields(int fieldCount)
- {
-     var names = new string[fieldCount];
-     int fieldNumber = 0;
-
-     int slot = 0;
-     while (slot < fieldCount)
-     {
-         int stored = tvd.ReadVInt();
-         // From FORMAT_VERSION on the number is stored as is; before that it is a delta.
-         fieldNumber = format >= FORMAT_VERSION ? stored : fieldNumber + stored;
-         names[slot] = fieldInfos.FieldName(fieldNumber);
-         slot++;
-     }
-
-     return names;
- }
-
- /// <summary>
- /// Reads the tvf start pointer of each of the document's fields.
- /// The caller has to pre-seek tvx/tvd to the right point.
- /// </summary>
- private long[] ReadTvfPointers(int fieldCount)
- {
-     // The first pointer is absolute; where it lives depends on the format.
-     long running = format >= FORMAT_VERSION2 ? tvx.ReadLong() : tvd.ReadVLong();
-
-     var pointers = new long[fieldCount];
-     pointers[0] = running;
-
-     // Every following pointer is stored in tvd as a delta from the previous one.
-     int slot = 1;
-     while (slot < fieldCount)
-     {
-         running += tvd.ReadVLong();
-         pointers[slot] = running;
-         slot++;
-     }
-
-     return pointers;
- }
-
- /// <summary>Return all term vectors stored for this document or null if they could not be read in.</summary>
- /// <param name="docNum">The document number to retrieve the vector for</param>
- /// <returns>All term frequency vectors, or null when there are no term vector files or no vectorized fields.</returns>
- /// <exception cref="System.IO.IOException">if there is an error reading the term vector files</exception>
- public /*internal*/ virtual ITermFreqVector[] Get(int docNum)
- {
-     if (tvx == null)
-     {
-         // No tvx file.
-         return null;
-     }
-
-     SeekTvx(docNum);
-     long tvdPointer = tvx.ReadLong();
-
-     tvd.Seek(tvdPointer);
-     int numFields = tvd.ReadVInt();
-     if (numFields == 0)
-     {
-         // No fields are vectorized for this document
-         return null;
-     }
-
-     string[] fieldNames = ReadFields(numFields);
-     long[] pointers = ReadTvfPointers(numFields);
-     return ReadTermVectors(docNum, fieldNames, pointers);
- }
-
- /// <summary>
- /// Feeds the term vectors of every vectorized field of a document to <paramref name="mapper"/>.
- /// Does nothing when there are no term vector files or the document has no vectorized fields.
- /// </summary>
- /// <param name="docNumber">The document number to retrieve the vectors for</param>
- /// <param name="mapper">Receives the document number once, then each field's terms in turn.</param>
- public virtual void Get(int docNumber, TermVectorMapper mapper)
- {
-     // Check if no term vectors are available for this segment at all
-     if (tvx == null)
-     {
-         return;
-     }
-
-     SeekTvx(docNumber);
-     long tvdPointer = tvx.ReadLong();
-
-     tvd.Seek(tvdPointer);
-     int numFields = tvd.ReadVInt();
-     if (numFields == 0)
-     {
-         // No fields are vectorized for this document
-         return;
-     }
-
-     string[] fieldNames = ReadFields(numFields);
-     long[] pointers = ReadTvfPointers(numFields);
-     mapper.SetDocumentNumber(docNumber);
-     ReadTermVectors(fieldNames, pointers, mapper);
- }
-
-
- /// <summary>
- /// Materializes one vector per field, using a fresh <c>ParallelArrayTermVectorMapper</c> for each.
- /// </summary>
- /// <param name="docNum">Document number passed on to every mapper.</param>
- /// <param name="fields">Field names, parallel to <paramref name="tvfPointers"/>.</param>
- /// <param name="tvfPointers">Start pointer of each field's data in the tvf file.</param>
- private SegmentTermVector[] ReadTermVectors(int docNum, string[] fields, long[] tvfPointers)
- {
-     int fieldTotal = fields.Length;
-     var vectors = new SegmentTermVector[fieldTotal];
-
-     int index = 0;
-     while (index < fieldTotal)
-     {
-         var collector = new ParallelArrayTermVectorMapper();
-         collector.SetDocumentNumber(docNum);
-         ReadTermVector(fields[index], tvfPointers[index], collector);
-         vectors[index] = (SegmentTermVector) collector.MaterializeVector();
-         index++;
-     }
-
-     return vectors;
- }
-
- /// <summary>Feeds every field's term vector, in order, to the same <paramref name="mapper"/>.</summary>
- /// <param name="fields">Field names, parallel to <paramref name="tvfPointers"/>.</param>
- /// <param name="tvfPointers">Start pointer of each field's data in the tvf file.</param>
- /// <param name="mapper">The mapper that receives all fields.</param>
- private void ReadTermVectors(string[] fields, long[] tvfPointers, TermVectorMapper mapper)
- {
-     int index = 0;
-     while (index < fields.Length)
-     {
-         ReadTermVector(fields[index], tvfPointers[index], mapper);
-         index++;
-     }
- }
-
-
- /// <summary>
- /// Reads one field's term vector from the tvf file and hands each term to <paramref name="mapper"/>.
- /// </summary>
- /// <param name="field">The field to read in</param>
- /// <param name="tvfPointer">The pointer within the tvf file where we should start reading</param>
- /// <param name="mapper">The mapper used to map the TermVector</param>
- /// <remarks>Returns without calling the mapper at all when the stored term count is zero.</remarks>
- /// <exception cref="System.IO.IOException">presumably raised by the underlying tvf reads on I/O failure (not visible here)</exception>
- private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper)
- {
-
- // Now read the data from specified position
- // We don't need to offset by the FORMAT here since the pointer already includes the offset
- tvf.Seek(tvfPointer);
-
- int numTerms = tvf.ReadVInt();
- // numTerms is the number of term entries that follow for this field
- // If no terms - return without calling the mapper. However, this should never occur!
- if (numTerms == 0)
- return ;
-
- bool storePositions;
- bool storeOffsets;
-
- if (format >= FORMAT_VERSION)
- {
- byte bits = tvf.ReadByte();
- storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
- storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
- }
- else
- {
- tvf.ReadVInt();
- storePositions = false;
- storeOffsets = false;
- }
- mapper.SetExpectations(field, numTerms, storeOffsets, storePositions);
- int start = 0;
- int deltaLength = 0;
- int totalLength = 0;
- byte[] byteBuffer;
- char[] charBuffer;
- bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES;
-
- // init the buffers: formats before FORMAT_UTF8_LENGTH_IN_BYTES use the char buffer, newer ones the byte buffer
- if (preUTF8)
- {
- charBuffer = new char[10];
- byteBuffer = null;
- }
- else
- {
- charBuffer = null;
- byteBuffer = new byte[20];
- }
-
- for (int i = 0; i < numTerms; i++)
- {
- start = tvf.ReadVInt();
- deltaLength = tvf.ReadVInt();
- totalLength = start + deltaLength;
-
- System.String term;
-
- if (preUTF8)
- {
- // Term stored as java chars; the first 'start' chars are reused from the previous term left in the buffer
- if (charBuffer.Length < totalLength)
- {
- char[] newCharBuffer = new char[(int) (1.5 * totalLength)];
- Array.Copy(charBuffer, 0, newCharBuffer, 0, start);
- charBuffer = newCharBuffer;
- }
- tvf.ReadChars(charBuffer, start, deltaLength);
- term = new System.String(charBuffer, 0, totalLength);
- }
- else
- {
- // Term stored as utf8 bytes; the first 'start' bytes are reused from the previous term left in the buffer
- if (byteBuffer.Length < totalLength)
- {
- byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)];
- Array.Copy(byteBuffer, 0, newByteBuffer, 0, start);
- byteBuffer = newByteBuffer;
- }
- tvf.ReadBytes(byteBuffer, start, deltaLength);
- term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength);
- }
- int freq = tvf.ReadVInt();
- int[] positions = null;
- if (storePositions)
- {
- // read in the positions; each stored value is a delta from the previous position
- // does the mapper even care about positions?
- if (mapper.IsIgnoringPositions == false)
- {
- positions = new int[freq];
- int prevPosition = 0;
- for (int j = 0; j < freq; j++)
- {
- positions[j] = prevPosition + tvf.ReadVInt();
- prevPosition = positions[j];
- }
- }
- else
- {
- // The mapper ignores positions, but they are variable-width VInts, so the only way
- // to get past them is to read and discard each one.
- for (int j = 0; j < freq; j++)
- {
- tvf.ReadVInt();
- }
- }
- }
- TermVectorOffsetInfo[] offsets = null;
- if (storeOffsets)
- {
- // does the mapper even care about offsets? If not, each start/end VInt pair is read and discarded
- if (mapper.IsIgnoringOffsets == false)
- {
- offsets = new TermVectorOffsetInfo[freq];
- int prevOffset = 0;
- for (int j = 0; j < freq; j++)
- {
- int startOffset = prevOffset + tvf.ReadVInt();
- int endOffset = startOffset + tvf.ReadVInt();
- offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset);
- prevOffset = endOffset;
- }
- }
- else
- {
- for (int j = 0; j < freq; j++)
- {
- tvf.ReadVInt();
- tvf.ReadVInt();
- }
- }
- }
- mapper.Map(term, freq, offsets, positions);
- }
- }
-
- /// <summary>
- /// Creates a memberwise copy of this reader. When all three inputs are present the copy
- /// gets its own clones of them; otherwise it keeps the copied references unchanged.
- /// </summary>
- public virtual object Clone()
- {
-     var copy = (TermVectorsReader) base.MemberwiseClone();
-
-     // These are null when a TermVectorsReader was created
-     // on a segment that did not have term vectors saved
-     bool hasAllInputs = tvx != null && tvd != null && tvf != null;
-     if (hasAllInputs)
-     {
-         copy.tvx = (IndexInput) tvx.Clone();
-         copy.tvd = (IndexInput) tvd.Clone();
-         copy.tvf = (IndexInput) tvf.Clone();
-     }
-
-     return copy;
- }
- }
-
-
- /// <summary>
- /// Models the existing parallel array structure: collects the terms of one field into
- /// parallel arrays and materializes them as a <c>SegmentTermVector</c>.
- /// </summary>
- class ParallelArrayTermVectorMapper : TermVectorMapper
- {
-     private string[] terms;
-     private int[] termFreqs;
-     private int[][] positions;
-     private TermVectorOffsetInfo[][] offsets;
-     private int currentPosition;
-     private bool storingOffsets;
-     private bool storingPositions;
-     private string field;
-
-     /// <summary>
-     /// Allocates the arrays for a new field. Anything collected for a previous field is
-     /// discarded, so a reused mapper ends up holding only the field announced last.
-     /// </summary>
-     /// <param name="field">The field the vector is for</param>
-     /// <param name="numTerms">The number of terms that will be mapped</param>
-     /// <param name="storeOffsets">true if offset information will be supplied</param>
-     /// <param name="storePositions">true if position information will be supplied</param>
-     public override void SetExpectations(string field, int numTerms, bool storeOffsets, bool storePositions)
-     {
-         this.field = field;
-         terms = new string[numTerms];
-         termFreqs = new int[numTerms];
-         storingOffsets = storeOffsets;
-         storingPositions = storePositions;
-
-         // Restart at slot 0: the arrays above are new and sized for this field only, so
-         // carrying the write index over from a previous field would index the wrong
-         // slots or run past the end.
-         currentPosition = 0;
-
-         // Drop arrays left over from a previous field when this one does not store them.
-         positions = storePositions ? new int[numTerms][] : null;
-         offsets = storeOffsets ? new TermVectorOffsetInfo[numTerms][] : null;
-     }
-
-     /// <summary>Stores one term in the next free slot of the parallel arrays.</summary>
-     /// <param name="term">The term to add to the vector</param>
-     /// <param name="frequency">The frequency of the term in the document</param>
-     /// <param name="offsets">The term's offsets; only stored when offsets were announced</param>
-     /// <param name="positions">The term's positions; only stored when positions were announced</param>
-     public override void Map(string term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
-     {
-         terms[currentPosition] = term;
-         termFreqs[currentPosition] = frequency;
-         if (storingOffsets)
-         {
-             this.offsets[currentPosition] = offsets;
-         }
-         if (storingPositions)
-         {
-             this.positions[currentPosition] = positions;
-         }
-         currentPosition++;
-     }
-
-     /// <summary>Construct the vector</summary>
-     /// <returns>
-     /// The vector based on the mappings, or null when <see cref="SetExpectations"/> was never
-     /// called (or was called with a null field).
-     /// </returns>
-     public virtual ITermFreqVector MaterializeVector()
-     {
-         if (field == null || terms == null)
-         {
-             return null;
-         }
-
-         if (storingPositions || storingOffsets)
-         {
-             return new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
-         }
-         return new SegmentTermVector(field, terms, termFreqs);
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs b/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs
deleted file mode 100644
index 8d07924..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-using System.Collections.Generic;
-using IndexOutput = Lucene.Net.Store.IndexOutput;
-using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
-using ArrayUtil = Lucene.Net.Util.ArrayUtil;
-
-namespace Lucene.Net.Index
-{
- /// <summary> Terms-hash consumer that appends per-document term vectors to the three
- /// term vector outputs (index, documents and fields) of the current doc store
- /// segment, and recycles the <see cref="PerDoc" /> buffers used to stage them.</summary>
- sealed class TermVectorsTermsWriter : TermsHashConsumer
- {
-     private void InitBlock()
-     {
-         docFreeList = new PerDoc[1];
-     }
-
-     internal DocumentsWriter docWriter;
-     internal TermVectorsWriter termVectorsWriter;
-     // Recycled PerDoc instances; the first freeCount entries are valid.
-     internal PerDoc[] docFreeList;
-     internal int freeCount;
-     // Outputs created with the VECTORS_INDEX / VECTORS_DOCUMENTS / VECTORS_FIELDS extensions.
-     internal IndexOutput tvx;
-     internal IndexOutput tvd;
-     internal IndexOutput tvf;
-     // Next document slot to be written to tvx.
-     internal int lastDocID;
-
-     public TermVectorsTermsWriter(DocumentsWriter docWriter)
-     {
-         InitBlock();
-         this.docWriter = docWriter;
-     }
-
-     /// <summary> Creates the per-thread consumer bound to this writer.</summary>
-     public override TermsHashConsumerPerThread AddThread(TermsHashPerThread termsHashPerThread)
-     {
-         return new TermVectorsTermsWriterPerThread(termsHashPerThread, this);
-     }
-
-     /// <summary> Fills <c>postings[start .. start + count)</c> with fresh
-     /// <see cref="PostingList" /> instances.</summary>
-     internal override void CreatePostings(RawPostingList[] postings, int start, int count)
-     {
-         int end = start + count;
-         for (int i = start; i < end; i++)
-         {
-             postings[i] = new PostingList();
-         }
-     }
-
-     /// <summary> Flushes the three outputs (when they exist) and resets the per-field and
-     /// per-thread hash state of every consumer in <paramref name="threadsAndFields" />.</summary>
-     public override void Flush(IDictionary<TermsHashConsumerPerThread, ICollection<TermsHashConsumerPerField>> threadsAndFields, SegmentWriteState state)
-     {
-         lock (this)
-         {
-             // NOTE: it's possible that all documents seen in this segment
-             // hit non-aborting exceptions, in which case we will
-             // not have yet init'd the TermVectorsWriter. This is
-             // actually OK (unlike in the stored fields case)
-             // because, although FieldInfos.hasVectors() will return
-             // true, the TermVectorsReader gracefully handles
-             // non-existence of the term vectors files.
-             if (tvx != null)
-             {
-
-                 if (state.numDocsInStore > 0)
-                 {
-                     // In case there are some final documents that we
-                     // didn't see (because they hit a non-aborting exception):
-                     Fill(state.numDocsInStore - docWriter.DocStoreOffset);
-                 }
-
-                 tvx.Flush();
-                 tvd.Flush();
-                 tvf.Flush();
-             }
-
-             foreach (var entry in threadsAndFields)
-             {
-                 foreach (var field in entry.Value)
-                 {
-                     TermVectorsTermsWriterPerField perField = (TermVectorsTermsWriterPerField) field;
-                     perField.termsHashPerField.Reset();
-                     perField.ShrinkHash();
-                 }
-
-                 TermVectorsTermsWriterPerThread perThread = (TermVectorsTermsWriterPerThread) entry.Key;
-                 perThread.termsHashPerThread.Reset(true);
-             }
-         }
-     }
-
-     /// <summary> Closes the three outputs, verifies the size of the index file and
-     /// registers the files as flushed. Does nothing when no output was opened.</summary>
-     internal override void CloseDocStore(SegmentWriteState state)
-     {
-         lock (this)
-         {
-             if (tvx != null)
-             {
-                 // At least one doc in this run had term vectors
-                 // enabled
-                 Fill(state.numDocsInStore - docWriter.DocStoreOffset);
-                 tvx.Close();
-                 tvf.Close();
-                 tvd.Close();
-                 tvx = null;
-                 System.Diagnostics.Debug.Assert(state.docStoreSegmentName != null);
-
-                 System.String indexFileName = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
-                 System.String fieldsFileName = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION;
-                 System.String documentsFileName = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION;
-
-                 // Expected size: a 4 byte format header plus two longs (16 bytes) per document.
-                 if (4 + ((long) state.numDocsInStore) * 16 != state.directory.FileLength(indexFileName))
-                 {
-                     throw new System.SystemException("after flush: tvx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.FileLength(indexFileName) + " length in bytes of " + indexFileName + " file exists?=" + state.directory.FileExists(indexFileName));
-                 }
-
-                 state.flushedFiles.Add(indexFileName);
-                 state.flushedFiles.Add(fieldsFileName);
-                 state.flushedFiles.Add(documentsFileName);
-
-                 docWriter.RemoveOpenFile(indexFileName);
-                 docWriter.RemoveOpenFile(fieldsFileName);
-                 docWriter.RemoveOpenFile(documentsFileName);
-
-                 lastDocID = 0;
-             }
-         }
-     }
-
-     // Number of PerDoc instances created so far.
-     internal int allocCount;
-
-     /// <summary> Returns a recycled <see cref="PerDoc" /> when one is available, otherwise
-     /// allocates a new one (growing the free list first so it can later hold it).</summary>
-     internal PerDoc GetPerDoc()
-     {
-         lock (this)
-         {
-             if (freeCount == 0)
-             {
-                 allocCount++;
-                 if (allocCount > docFreeList.Length)
-                 {
-                     // Grow our free list up front to make sure we have
-                     // enough space to recycle all outstanding PerDoc
-                     // instances
-                     System.Diagnostics.Debug.Assert(allocCount == 1 + docFreeList.Length);
-                     docFreeList = new PerDoc[ArrayUtil.GetNextSize(allocCount)];
-                 }
-                 return new PerDoc(this);
-             }
-             else
-             {
-                 return docFreeList[--freeCount];
-             }
-         }
-     }
-
-     /// <summary> Fills in no-term-vectors for all docs we haven't seen
-     /// since the last doc that had term vectors.
-     /// </summary>
-     internal void Fill(int docID)
-     {
-         int docStoreOffset = docWriter.DocStoreOffset;
-         int end = docID + docStoreOffset;
-         if (lastDocID < end)
-         {
-             long tvfPosition = tvf.FilePointer;
-             while (lastDocID < end)
-             {
-                 // One index entry per skipped doc: tvd pointer, an empty
-                 // (zero fields) tvd record, then the unchanged tvf pointer.
-                 tvx.WriteLong(tvd.FilePointer);
-                 tvd.WriteVInt(0);
-                 tvx.WriteLong(tvfPosition);
-                 lastDocID++;
-             }
-         }
-     }
-
-     /// <summary> Creates the three outputs and writes their format headers, unless they
-     /// already exist or the doc writer has no doc store segment.</summary>
-     internal void InitTermVectorsWriter()
-     {
-         lock (this)
-         {
-             if (tvx == null)
-             {
-
-                 System.String docStoreSegment = docWriter.DocStoreSegment;
-
-                 if (docStoreSegment == null)
-                 {
-                     return;
-                 }
-
-                 System.Diagnostics.Debug.Assert(docStoreSegment != null);
-
-                 // If we hit an exception while init'ing the term
-                 // vector output files, we must abort this segment
-                 // because those files will be in an unknown
-                 // state:
-                 tvx = docWriter.directory.CreateOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-                 tvd = docWriter.directory.CreateOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-                 tvf = docWriter.directory.CreateOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-
-                 tvx.WriteInt(TermVectorsReader.FORMAT_CURRENT);
-                 tvd.WriteInt(TermVectorsReader.FORMAT_CURRENT);
-                 tvf.WriteInt(TermVectorsReader.FORMAT_CURRENT);
-
-                 docWriter.AddOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-                 docWriter.AddOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-                 docWriter.AddOpenFile(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-
-                 lastDocID = 0;
-             }
-         }
-     }
-
-     /// <summary> Appends the staged vectors of <paramref name="perDoc" /> to the real
-     /// outputs, then resets the instance and returns it to the free list.</summary>
-     internal void FinishDocument(PerDoc perDoc)
-     {
-         lock (this)
-         {
-
-             System.Diagnostics.Debug.Assert(docWriter.writer.TestPoint("TermVectorsTermsWriter.finishDocument start"));
-
-             InitTermVectorsWriter();
-
-             Fill(perDoc.docID);
-
-             // Append term vectors to the real outputs:
-             tvx.WriteLong(tvd.FilePointer);
-             tvx.WriteLong(tvf.FilePointer);
-             tvd.WriteVInt(perDoc.numVectorFields);
-             if (perDoc.numVectorFields > 0)
-             {
-                 for (int i = 0; i < perDoc.numVectorFields; i++)
-                 {
-                     tvd.WriteVInt(perDoc.fieldNumbers[i]);
-                 }
-                 System.Diagnostics.Debug.Assert(0 == perDoc.fieldPointers[0]);
-                 // Field pointers are written as deltas from the previous field.
-                 long lastPos = perDoc.fieldPointers[0];
-                 for (int i = 1; i < perDoc.numVectorFields; i++)
-                 {
-                     long pos = perDoc.fieldPointers[i];
-                     tvd.WriteVLong(pos - lastPos);
-                     lastPos = pos;
-                 }
-                 perDoc.perDocTvf.WriteTo(tvf);
-                 perDoc.numVectorFields = 0;
-             }
-
-             System.Diagnostics.Debug.Assert(lastDocID == perDoc.docID + docWriter.DocStoreOffset);
-
-             lastDocID++;
-             perDoc.Reset();
-             Free(perDoc);
-             System.Diagnostics.Debug.Assert(docWriter.writer.TestPoint("TermVectorsTermsWriter.finishDocument end"));
-         }
-     }
-
-     /// <summary> Always returns <c>false</c>.</summary>
-     public bool FreeRAM()
-     {
-         // We don't hold any state beyond one doc, so we don't
-         // free persistent RAM here
-         return false;
-     }
-
-     /// <summary> Closes and drops whichever outputs are open, ignoring any exception
-     /// thrown while closing, and resets <c>lastDocID</c>.</summary>
-     public override void Abort()
-     {
-         if (tvx != null)
-         {
-             try
-             {
-                 tvx.Close();
-             }
-             catch (System.Exception)
-             {
-                 // Deliberately ignored: abort is best effort.
-             }
-             tvx = null;
-         }
-         if (tvd != null)
-         {
-             try
-             {
-                 tvd.Close();
-             }
-             catch (System.Exception)
-             {
-                 // Deliberately ignored: abort is best effort.
-             }
-             tvd = null;
-         }
-         if (tvf != null)
-         {
-             try
-             {
-                 tvf.Close();
-             }
-             catch (System.Exception)
-             {
-                 // Deliberately ignored: abort is best effort.
-             }
-             tvf = null;
-         }
-         lastDocID = 0;
-     }
-
-     /// <summary> Returns <paramref name="doc" /> to the free list.</summary>
-     internal void Free(PerDoc doc)
-     {
-         lock (this)
-         {
-             System.Diagnostics.Debug.Assert(freeCount < docFreeList.Length);
-             docFreeList[freeCount++] = doc;
-         }
-     }
-
-     /// <summary> Stages the term vector data of one document in a RAM buffer until
-     /// <see cref="Finish" /> hands it to the enclosing writer.</summary>
-     internal class PerDoc : DocumentsWriter.DocWriter
-     {
-         public PerDoc(TermVectorsTermsWriter enclosingInstance)
-         {
-             InitBlock(enclosingInstance);
-         }
-         private void InitBlock(TermVectorsTermsWriter enclosingInstance)
-         {
-             this.enclosingInstance = enclosingInstance;
-             buffer = enclosingInstance.docWriter.NewPerDocBuffer();
-             perDocTvf = new RAMOutputStream(buffer);
-         }
-         private TermVectorsTermsWriter enclosingInstance;
-         public TermVectorsTermsWriter Enclosing_Instance
-         {
-             get
-             {
-                 return enclosingInstance;
-             }
-
-         }
-
-         internal DocumentsWriter.PerDocBuffer buffer;
-         internal RAMOutputStream perDocTvf;
-         internal int numVectorFields;
-
-         // Parallel arrays: field number and its start pointer within perDocTvf.
-         internal int[] fieldNumbers = new int[1];
-         internal long[] fieldPointers = new long[1];
-
-         internal void Reset()
-         {
-             perDocTvf.Reset();
-             buffer.Recycle();
-             numVectorFields = 0;
-         }
-
-         public override void Abort()
-         {
-             Reset();
-             Enclosing_Instance.Free(this);
-         }
-
-         /// <summary> Records a field number together with the current position of the
-         /// per-document stream, growing both arrays when they are full.</summary>
-         internal void AddField(int fieldNumber)
-         {
-             if (numVectorFields == fieldNumbers.Length)
-             {
-                 fieldNumbers = ArrayUtil.Grow(fieldNumbers);
-                 fieldPointers = ArrayUtil.Grow(fieldPointers);
-             }
-             fieldNumbers[numVectorFields] = fieldNumber;
-             fieldPointers[numVectorFields] = perDocTvf.FilePointer;
-             numVectorFields++;
-         }
-
-         public override long SizeInBytes()
-         {
-             return buffer.SizeInBytes;
-         }
-
-         public override void Finish()
-         {
-             Enclosing_Instance.FinishDocument(this);
-         }
-     }
-
-     internal sealed class PostingList : RawPostingList
-     {
-         internal int freq; // How many times this term occurred in the current doc
-         internal int lastOffset; // Last offset we saw
-         internal int lastPosition; // Last position where this term occurred
-     }
-
-     /// <summary> Size of one posting: the raw posting plus the three ints declared by
-     /// <see cref="PostingList" />.</summary>
-     internal override int BytesPerPosting()
-     {
-         return RawPostingList.BYTES_SIZE + 3 * DocumentsWriter.INT_NUM_BYTE;
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerField.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerField.cs b/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerField.cs
deleted file mode 100644
index 945f32b..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerField.cs
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-using Lucene.Net.Analysis.Tokenattributes;
-using Lucene.Net.Documents;
-using IndexOutput = Lucene.Net.Store.IndexOutput;
-using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
-
-namespace Lucene.Net.Index
-{
-
- /// <summary> Per-field consumer that accumulates term frequencies and, when enabled,
- /// positions and offsets in the terms hash, and serializes them into the
- /// per-document term vector stream in <see cref="Finish" />.</summary>
- sealed class TermVectorsTermsWriterPerField : TermsHashConsumerPerField
- {
-
-     internal TermVectorsTermsWriterPerThread perThread;
-     internal TermsHashPerField termsHashPerField;
-     internal TermVectorsTermsWriter termsWriter;
-     internal FieldInfo fieldInfo;
-     internal DocumentsWriter.DocState docState;
-     internal FieldInvertState fieldState;
-
-     internal bool doVectors;
-     internal bool doVectorPositions;
-     internal bool doVectorOffsets;
-
-     // Largest posting count seen since the last ShrinkHash call.
-     internal int maxNumPostings;
-     internal IOffsetAttribute offsetAttribute = null;
-
-     public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo)
-     {
-         this.termsHashPerField = termsHashPerField;
-         this.perThread = perThread;
-         this.termsWriter = perThread.termsWriter;
-         this.fieldInfo = fieldInfo;
-         docState = termsHashPerField.docState;
-         fieldState = termsHashPerField.fieldState;
-     }
-
-     /// <summary> Two byte streams are used: stream 0 receives positions and stream 1
-     /// receives offsets (see <see cref="NewTerm" /> and <see cref="AddTerm" />).</summary>
-     internal override int GetStreamCount()
-     {
-         return 2;
-     }
-
-     /// <summary> Decides from the first <paramref name="count" /> entries of
-     /// <paramref name="fields" /> whether vectors, positions and offsets are wanted,
-     /// and prepares the per-document state when they are.</summary>
-     /// <returns> <c>true</c> when at least one field is indexed and stores term vectors.</returns>
-     internal override bool Start(IFieldable[] fields, int count)
-     {
-         doVectors = false;
-         doVectorPositions = false;
-         doVectorOffsets = false;
-
-         for (int i = 0; i < count; i++)
-         {
-             IFieldable field = fields[i];
-             if (field.IsIndexed && field.IsTermVectorStored)
-             {
-                 doVectors = true;
-                 doVectorPositions |= field.IsStorePositionWithTermVector;
-                 doVectorOffsets |= field.IsStoreOffsetWithTermVector;
-             }
-         }
-
-         if (doVectors)
-         {
-             if (perThread.doc == null)
-             {
-                 perThread.doc = termsWriter.GetPerDoc();
-                 perThread.doc.docID = docState.docID;
-                 System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0);
-                 System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.Length);
-                 System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.FilePointer);
-             }
-
-             System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID);
-             if (termsHashPerField.numPostings != 0)
-             {
-                 // Only necessary if previous doc hit a
-                 // non-aborting exception while writing vectors in
-                 // this field:
-                 termsHashPerField.Reset();
-                 perThread.termsHashPerThread.Reset(false);
-             }
-         }
-
-         // TODO: only if needed for performance
-         //perThread.postingsCount = 0;
-
-         return doVectors;
-     }
-
-     public void Abort()
-     {
-     }
-
-     /// <summary> Called once per field per document if term vectors
-     /// are enabled, to write the vectors to
-     /// RAMOutputStream, which is then quickly flushed to
-     /// the real term vectors files in the Directory.
-     /// </summary>
-     internal override void Finish()
-     {
-
-         System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));
-
-         int numPostings = termsHashPerField.numPostings;
-
-         System.Diagnostics.Debug.Assert(numPostings >= 0);
-
-         if (!doVectors || numPostings == 0)
-         {
-             return;
-         }
-
-         if (numPostings > maxNumPostings)
-         {
-             maxNumPostings = numPostings;
-         }
-
-         IndexOutput tvf = perThread.doc.perDocTvf;
-
-         // This is called once, after inverting all occurrences
-         // of a given field in the doc. At this point we flush
-         // our hash into the DocWriter.
-
-         System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
-         System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));
-
-         perThread.doc.AddField(termsHashPerField.fieldInfo.number);
-
-         RawPostingList[] postings = termsHashPerField.SortPostings();
-
-         tvf.WriteVInt(numPostings);
-         byte bits = (byte) (0x0);
-         if (doVectorPositions)
-         {
-             bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
-         }
-         if (doVectorOffsets)
-         {
-             bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
-         }
-         tvf.WriteByte(bits);
-
-         int encoderUpto = 0;
-         int lastTermBytesCount = 0;
-
-         ByteSliceReader reader = perThread.vectorSliceReader;
-         char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
-         for (int j = 0; j < numPostings; j++)
-         {
-             TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
-             int freq = posting.freq;
-
-             char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
-             int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;
-
-             // We swap between two encoders to save copying
-             // last Term's byte array
-             UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];
-
-             // TODO: we could do this incrementally
-             UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
-             int termBytesCount = utf8Result.length;
-
-             // TODO: UTF16toUTF8 could tell us this prefix
-             // Compute common prefix between last term and
-             // this term
-             int prefix = 0;
-             if (j > 0)
-             {
-                 byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
-                 byte[] termBytes = perThread.utf8Results[encoderUpto].result;
-                 while (prefix < lastTermBytesCount && prefix < termBytesCount)
-                 {
-                     if (lastTermBytes[prefix] != termBytes[prefix])
-                     {
-                         break;
-                     }
-                     prefix++;
-                 }
-             }
-             encoderUpto = 1 - encoderUpto;
-             lastTermBytesCount = termBytesCount;
-
-             // Only the bytes after the shared prefix are written.
-             int suffix = termBytesCount - prefix;
-             tvf.WriteVInt(prefix);
-             tvf.WriteVInt(suffix);
-             tvf.WriteBytes(utf8Result.result, prefix, suffix);
-             tvf.WriteVInt(freq);
-
-             if (doVectorPositions)
-             {
-                 termsHashPerField.InitReader(reader, posting, 0);
-                 reader.WriteTo(tvf);
-             }
-
-             if (doVectorOffsets)
-             {
-                 termsHashPerField.InitReader(reader, posting, 1);
-                 reader.WriteTo(tvf);
-             }
-         }
-
-         termsHashPerField.Reset();
-
-         // NOTE: we clear, per-field, at the thread level,
-         // because term vectors fully write themselves on each
-         // field; this saves RAM (eg if large doc has two large
-         // fields w/ term vectors on) because we recycle/reuse
-         // all RAM after each field:
-         perThread.termsHashPerThread.Reset(false);
-     }
-
-     /// <summary> Shrinks the terms hash to the largest posting count seen and restarts
-     /// that high-water mark.</summary>
-     internal void ShrinkHash()
-     {
-         termsHashPerField.ShrinkHash(maxNumPostings);
-         maxNumPostings = 0;
-     }
-
-     /// <summary> Fetches the offset attribute from the field's attribute source when
-     /// offsets are being stored; clears it otherwise.</summary>
-     internal override void Start(IFieldable f)
-     {
-         if (doVectorOffsets)
-         {
-             offsetAttribute = fieldState.attributeSource.AddAttribute<IOffsetAttribute>();
-         }
-         else
-         {
-             offsetAttribute = null;
-         }
-     }
-
-     /// <summary> First occurrence of a term in the field: sets the frequency to 1 and
-     /// writes the absolute start offset and position (when enabled).</summary>
-     internal override void NewTerm(RawPostingList p0)
-     {
-
-         System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));
-
-         TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
-
-         p.freq = 1;
-
-         if (doVectorOffsets)
-         {
-             int startOffset = fieldState.offset + offsetAttribute.StartOffset;
-             int endOffset = fieldState.offset + offsetAttribute.EndOffset;
-
-             termsHashPerField.WriteVInt(1, startOffset);
-             termsHashPerField.WriteVInt(1, endOffset - startOffset);
-             p.lastOffset = endOffset;
-         }
-
-         if (doVectorPositions)
-         {
-             termsHashPerField.WriteVInt(0, fieldState.position);
-             p.lastPosition = fieldState.position;
-         }
-     }
-
-     /// <summary> Further occurrence of a term: increments the frequency and writes the
-     /// start offset and position as deltas from the previous occurrence (when enabled).</summary>
-     internal override void AddTerm(RawPostingList p0)
-     {
-
-         System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));
-
-         TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
-         p.freq++;
-
-         if (doVectorOffsets)
-         {
-             int startOffset = fieldState.offset + offsetAttribute.StartOffset;
-             int endOffset = fieldState.offset + offsetAttribute.EndOffset;
-
-             termsHashPerField.WriteVInt(1, startOffset - p.lastOffset);
-             termsHashPerField.WriteVInt(1, endOffset - startOffset);
-             p.lastOffset = endOffset;
-         }
-
-         if (doVectorPositions)
-         {
-             termsHashPerField.WriteVInt(0, fieldState.position - p.lastPosition);
-             p.lastPosition = fieldState.position;
-         }
-     }
-
-     internal override void SkippingLongTerm()
-     {
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerThread.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerThread.cs b/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerThread.cs
deleted file mode 100644
index e58866a..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorsTermsWriterPerThread.cs
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-
-using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
-
-namespace Lucene.Net.Index
-{
-
- /// <summary> Per-thread term vector consumer: owns the per-document buffer that is
- /// currently being filled and the scratch objects used while serializing a field.</summary>
- sealed class TermVectorsTermsWriterPerThread : TermsHashConsumerPerThread
- {
-
-     internal TermVectorsTermsWriter termsWriter;
-     internal TermsHashPerThread termsHashPerThread;
-     internal DocumentsWriter.DocState docState;
-
-     // Buffer of the document in progress; null when there is none.
-     internal TermVectorsTermsWriter.PerDoc doc;
-
-     public TermVectorsTermsWriterPerThread(TermsHashPerThread termsHashPerThread, TermVectorsTermsWriter termsWriter)
-     {
-         this.termsWriter = termsWriter;
-         this.termsHashPerThread = termsHashPerThread;
-         docState = termsHashPerThread.docState;
-     }
-
-     // Used by perField when serializing the term vectors
-     internal ByteSliceReader vectorSliceReader = new ByteSliceReader();
-
-     internal UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[]{new UnicodeUtil.UTF8Result(), new UnicodeUtil.UTF8Result()};
-
-     /// <summary> Clears the field-order check and, when a buffer is still attached,
-     /// resets it and stamps it with the current document id.</summary>
-     public override void StartDocument()
-     {
-         System.Diagnostics.Debug.Assert(ClearLastVectorFieldName());
-         if (doc == null)
-         {
-             return;
-         }
-
-         doc.Reset();
-         doc.docID = docState.docID;
-     }
-
-     /// <summary> Hands the current buffer (possibly <c>null</c>) to the caller and
-     /// detaches it from this thread state.</summary>
-     public override DocumentsWriter.DocWriter FinishDocument()
-     {
-         TermVectorsTermsWriter.PerDoc finished = doc;
-         doc = null;
-         return finished;
-     }
-
-     /// <summary> Creates the per-field consumer bound to this thread state.</summary>
-     public override TermsHashConsumerPerField AddField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo)
-     {
-         return new TermVectorsTermsWriterPerField(termsHashPerField, this, fieldInfo);
-     }
-
-     /// <summary> Aborts and detaches the current buffer, if any.</summary>
-     public override void Abort()
-     {
-         if (doc == null)
-         {
-             return;
-         }
-
-         doc.Abort();
-         doc = null;
-     }
-
-     // Called only by assert
-     internal bool ClearLastVectorFieldName()
-     {
-         lastVectorFieldName = null;
-         return true;
-     }
-
-     // Called only by assert
-     internal System.String lastVectorFieldName;
-
-     /// <summary> Remembers the name of <paramref name="fi" /> and reports whether it
-     /// sorts strictly after the previously remembered name (ordinal comparison);
-     /// returns <c>true</c> when no name was remembered.</summary>
-     internal bool VectorFieldsInOrder(FieldInfo fi)
-     {
-         System.String previousName = lastVectorFieldName;
-         lastVectorFieldName = fi.name;
-         return previousName == null || String.CompareOrdinal(previousName, fi.name) < 0;
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermVectorsWriter.cs b/src/Lucene.Net.Core/Index/TermVectorsWriter.cs
deleted file mode 100644
index a689478..0000000
--- a/src/Lucene.Net.Core/Index/TermVectorsWriter.cs
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-
-using Directory = Lucene.Net.Store.Directory;
-using IndexOutput = Lucene.Net.Store.IndexOutput;
-using StringHelper = Lucene.Net.Util.StringHelper;
-using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;
-
-namespace Lucene.Net.Index
-{
- /// <summary> Writes term vectors to the three term vector outputs of a segment:
- /// the index (tvx), the documents file (tvd) and the fields file (tvf).</summary>
- sealed class TermVectorsWriter : IDisposable
- {
-
-     private readonly IndexOutput tvx = null;
-     private readonly IndexOutput tvd = null;
-     private readonly IndexOutput tvf = null;
-     private readonly FieldInfos fieldInfos;
-     // Two encoders used alternately so the previous term's bytes stay available
-     // for computing the shared prefix.
-     internal UnicodeUtil.UTF8Result[] utf8Results = new[]{new UnicodeUtil.UTF8Result(), new UnicodeUtil.UTF8Result()};
-
-     /// <summary> Creates the three outputs for <paramref name="segment" /> and writes the
-     /// current format marker to each.</summary>
-     public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
-     {
-         // Open files for TermVector storage
-         tvx = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
-         tvx.WriteInt(TermVectorsReader.FORMAT_CURRENT);
-         tvd = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
-         tvd.WriteInt(TermVectorsReader.FORMAT_CURRENT);
-         tvf = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
-         tvf.WriteInt(TermVectorsReader.FORMAT_CURRENT);
-
-         this.fieldInfos = fieldInfos;
-     }
-
-     /// <summary> Add a complete document specified by all its term vectors. If document has no
-     /// term vectors, add value for tvx.
-     /// </summary>
-     /// <param name="vectors">The term vectors of the document, one per field, or
-     /// <c>null</c> when the document has none (a zero field count is written then).
-     /// </param>
-     /// <throws>  IOException </throws>
-     public void AddAllDocVectors(ITermFreqVector[] vectors)
-     {
-
-         tvx.WriteLong(tvd.FilePointer);
-         tvx.WriteLong(tvf.FilePointer);
-
-         if (vectors != null)
-         {
-             int numFields = vectors.Length;
-             tvd.WriteVInt(numFields);
-
-             var fieldPointers = new long[numFields];
-
-             for (int i = 0; i < numFields; i++)
-             {
-                 fieldPointers[i] = tvf.FilePointer;
-
-                 int fieldNumber = fieldInfos.FieldNumber(vectors[i].Field);
-
-                 // 1st pass: write field numbers to tvd
-                 tvd.WriteVInt(fieldNumber);
-
-                 int numTerms = vectors[i].Size;
-                 tvf.WriteVInt(numTerms);
-
-                 TermPositionVector tpVector;
-
-                 byte bits;
-                 bool storePositions;
-                 bool storeOffsets;
-
-                 if (vectors[i] is TermPositionVector)
-                 {
-                     // May have positions & offsets
-                     tpVector = (TermPositionVector) vectors[i];
-                     storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
-                     storeOffsets = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
-                     bits = (byte) ((storePositions?TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR: (byte) 0) + (storeOffsets?TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR: (byte) 0));
-                 }
-                 else
-                 {
-                     tpVector = null;
-                     bits = 0;
-                     storePositions = false;
-                     storeOffsets = false;
-                 }
-
-                 tvf.WriteVInt(bits);
-
-                 System.String[] terms = vectors[i].GetTerms();
-                 int[] freqs = vectors[i].GetTermFrequencies();
-
-                 int utf8Upto = 0;
-                 // Start each field with an empty "previous term".
-                 utf8Results[1].length = 0;
-
-                 for (int j = 0; j < numTerms; j++)
-                 {
-
-                     UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);
-
-                     int start = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
-                     int length = utf8Results[utf8Upto].length - start;
-                     tvf.WriteVInt(start); // write shared prefix length
-                     tvf.WriteVInt(length); // write delta length
-                     tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
-                     utf8Upto = 1 - utf8Upto;
-
-                     int termFreq = freqs[j];
-
-                     tvf.WriteVInt(termFreq);
-
-                     if (storePositions)
-                     {
-                         int[] positions = tpVector.GetTermPositions(j);
-                         if (positions == null)
-                         {
-                             throw new System.SystemException("Trying to write positions that are null!");
-                         }
-                         System.Diagnostics.Debug.Assert(positions.Length == termFreq);
-
-                         // use delta encoding for positions
-                         int lastPosition = 0;
-                         foreach (int position in positions)
-                         {
-                             tvf.WriteVInt(position - lastPosition);
-                             lastPosition = position;
-                         }
-                     }
-
-                     if (storeOffsets)
-                     {
-                         TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
-                         if (offsets == null)
-                         {
-                             throw new System.SystemException("Trying to write offsets that are null!");
-                         }
-                         System.Diagnostics.Debug.Assert(offsets.Length == termFreq);
-
-                         // use delta encoding for offsets
-                         int lastEndOffset = 0;
-                         foreach (TermVectorOffsetInfo t in offsets)
-                         {
-                             int startOffset = t.StartOffset;
-                             int endOffset = t.EndOffset;
-                             tvf.WriteVInt(startOffset - lastEndOffset);
-                             tvf.WriteVInt(endOffset - startOffset);
-                             lastEndOffset = endOffset;
-                         }
-                     }
-                 }
-             }
-
-             // 2nd pass: write field pointers to tvd
-             if (numFields > 1)
-             {
-                 long lastFieldPointer = fieldPointers[0];
-                 for (int i = 1; i < numFields; i++)
-                 {
-                     long fieldPointer = fieldPointers[i];
-                     tvd.WriteVLong(fieldPointer - lastFieldPointer);
-                     lastFieldPointer = fieldPointer;
-                 }
-             }
-         }
-         else
-         {
-             tvd.WriteVInt(0);
-         }
-     }
-
-     /// <summary> Do a bulk copy of numDocs documents from reader to our
-     /// streams.  This is used to expedite merging, if the
-     /// field numbers are congruent.
-     /// </summary>
-     /// <param name="reader">Source of the raw tvd and tvf bytes.</param>
-     /// <param name="tvdLengths">Length in bytes of each document's tvd data.</param>
-     /// <param name="tvfLengths">Length in bytes of each document's tvf data.</param>
-     /// <param name="numDocs">Number of documents to copy.</param>
-     internal void AddRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs)
-     {
-         long tvdPosition = tvd.FilePointer;
-         long tvfPosition = tvf.FilePointer;
-         long tvdStart = tvdPosition;
-         long tvfStart = tvfPosition;
-         for (int i = 0; i < numDocs; i++)
-         {
-             // Index entries point at where each document will land once copied.
-             tvx.WriteLong(tvdPosition);
-             tvdPosition += tvdLengths[i];
-             tvx.WriteLong(tvfPosition);
-             tvfPosition += tvfLengths[i];
-         }
-         tvd.CopyBytes(reader.GetTvdStream(), tvdPosition - tvdStart);
-         tvf.CopyBytes(reader.GetTvfStream(), tvfPosition - tvfStart);
-         System.Diagnostics.Debug.Assert(tvd.FilePointer == tvdPosition);
-         System.Diagnostics.Debug.Assert(tvf.FilePointer == tvfPosition);
-     }
-
-     /// <summary> Close all streams. </summary>
-     /// <exception cref="System.IO.IOException">Thrown after all three streams have been
-     /// attempted when closing any of them failed; wraps the first failure.</exception>
-     public void Dispose()
-     {
-         // Move to a protected method if class becomes unsealed
-
-         // make an effort to close all streams we can but remember and re-throw
-         // the first exception encountered in this process
-         System.IO.IOException keep = null;
-         if (tvx != null)
-         {
-             try
-             {
-                 tvx.Close();
-             }
-             catch (System.IO.IOException e)
-             {
-                 keep = e;
-             }
-         }
-         if (tvd != null)
-         {
-             try
-             {
-                 tvd.Close();
-             }
-             catch (System.IO.IOException e)
-             {
-                 if (keep == null)
-                 {
-                     keep = e;
-                 }
-             }
-         }
-         if (tvf != null)
-         {
-             try
-             {
-                 tvf.Close();
-             }
-             catch (System.IO.IOException e)
-             {
-                 if (keep == null)
-                 {
-                     keep = e;
-                 }
-             }
-         }
-         if (keep != null)
-         {
-             // Keep the original message and chain the original exception so its
-             // type details and stack trace stay reachable through InnerException.
-             throw new System.IO.IOException(keep.Message, keep);
-         }
-     }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermsHashConsumerPerThread.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Core/Index/TermsHashConsumerPerThread.cs b/src/Lucene.Net.Core/Index/TermsHashConsumerPerThread.cs
deleted file mode 100644
index e135ba0..0000000
--- a/src/Lucene.Net.Core/Index/TermsHashConsumerPerThread.cs
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-using System;
-
-namespace Lucene.Net.Index
-{
-
- /// <summary> Base class for the per-thread part of a terms hash consumer. All members
- /// are abstract; concrete consumers supply the behaviour.</summary>
- abstract class TermsHashConsumerPerThread
- {
-     /// <summary> Document start hook.</summary>
-     public abstract void StartDocument();
-
-     /// <summary> Document end hook.</summary>
-     /// <returns> A <see cref="DocumentsWriter.DocWriter" /> supplied by the implementation.</returns>
-     public abstract DocumentsWriter.DocWriter FinishDocument();
-
-     /// <summary> Supplies the per-field consumer for the given terms hash field and field info.</summary>
-     public abstract TermsHashConsumerPerField AddField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo);
-
-     /// <summary> Abort hook.</summary>
-     public abstract void Abort();
- }
-}
\ No newline at end of file