http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs b/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs deleted file mode 100644 index 26fd586..0000000 --- a/src/Lucene.Net.Core/Index/TermVectorEntryFreqSortedComparator.cs +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -using System; - -namespace Lucene.Net.Index -{ - - /// Compares s first by frequency and then by - /// the term (case-sensitive) - /// - /// - /// - public class TermVectorEntryFreqSortedComparator : System.Collections.Generic.IComparer - { - public virtual int Compare(TermVectorEntry entry, TermVectorEntry entry1) - { - int result = 0; - result = entry1.Frequency - entry.Frequency; - if (result == 0) - { - result = String.CompareOrdinal(entry.Term, entry1.Term); - if (result == 0) - { - result = String.CompareOrdinal(entry.Field, entry1.Field); - } - } - return result; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorMapper.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Index/TermVectorMapper.cs b/src/Lucene.Net.Core/Index/TermVectorMapper.cs deleted file mode 100644 index e006385..0000000 --- a/src/Lucene.Net.Core/Index/TermVectorMapper.cs +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -using System; - -namespace Lucene.Net.Index -{ - - /// The TermVectorMapper can be used to map Term Vectors into your own - /// structure instead of the parallel array structure used by - /// . - ///

- /// It is up to the implementation to make sure it is thread-safe. - /// - /// - /// - ///

- public abstract class TermVectorMapper - { - - private bool ignoringPositions; - private bool ignoringOffsets; - - - protected internal TermVectorMapper() - { - } - - /// - /// true if this mapper should tell Lucene to ignore positions even if they are stored - /// - /// similar to ignoringPositions - /// - protected internal TermVectorMapper(bool ignoringPositions, bool ignoringOffsets) - { - this.ignoringPositions = ignoringPositions; - this.ignoringOffsets = ignoringOffsets; - } - - /// Tell the mapper what to expect in regards to field, number of terms, offset and position storage. - /// This method will be called once before retrieving the vector for a field. - /// - /// This method will be called before . - /// - /// The field the vector is for - /// - /// The number of terms that need to be mapped - /// - /// true if the mapper should expect offset information - /// - /// true if the mapper should expect positions info - /// - public abstract void SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions); - /// Map the Term Vector information into your own structure - /// The term to add to the vector - /// - /// The frequency of the term in the document - /// - /// null if the offset is not specified, otherwise the offset into the field of the term - /// - /// null if the position is not specified, otherwise the position in the field of the term - /// - public abstract void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions); - - /// Indicate to Lucene that even if there are positions stored, this mapper is not interested in them and they - /// can be skipped over. Derived classes should set this to true if they want to ignore positions. The default - /// is false, meaning positions will be loaded if they are stored. - /// - /// false - public virtual bool IsIgnoringPositions - { - get { return ignoringPositions; } - } - - /// - /// Same principal as , but applied to offsets. 
false by default. - /// - /// false - public virtual bool IsIgnoringOffsets - { - get { return ignoringOffsets; } - } - - /// Passes down the index of the document whose term vector is currently being mapped, - /// once for each top level call to a term vector reader. - ///

- /// Default implementation IGNORES the document number. Override if your implementation needs the document number. - ///

- /// NOTE: Document numbers are internal to Lucene and subject to change depending on indexing operations. - /// - ///

- /// index of document currently being mapped - /// - public virtual void SetDocumentNumber(int documentNumber) - { - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs b/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs deleted file mode 100644 index 1f9d7d2..0000000 --- a/src/Lucene.Net.Core/Index/TermVectorOffsetInfo.cs +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; - -using System.Runtime.InteropServices; - -namespace Lucene.Net.Index -{ - - /// The TermVectorOffsetInfo class holds information pertaining to a Term in a 's - /// offset information. This offset information is the character offset as set during the Analysis phase (and thus may not be the actual offset in the - /// original content). - /// - [Serializable] - public struct TermVectorOffsetInfo : IEquatable - { - /// Convenience declaration when creating a that stores only position information. 
- [NonSerialized] - public static readonly TermVectorOffsetInfo[] EMPTY_OFFSET_INFO = new TermVectorOffsetInfo[0]; - - [NonSerialized] - public static readonly TermVectorOffsetInfo Null = new TermVectorOffsetInfo(int.MinValue, int.MinValue); - - private int startOffset; - private int endOffset; - - //public TermVectorOffsetInfo() - //{ - //} - - public TermVectorOffsetInfo(int startOffset, int endOffset) - { - this.endOffset = endOffset; - this.startOffset = startOffset; - } - - /// The accessor for the ending offset for the term - /// The offset - public int EndOffset - { - get { return endOffset; } - set { this.endOffset = value; } - } - - /// The accessor for the starting offset of the term. - /// - /// - /// The offset - public int StartOffset - { - get { return startOffset; } - set { this.startOffset = value; } - } - - ///// Two TermVectorOffsetInfos are equals if both the start and end offsets are the same - ///// The comparison Object - ///// - ///// true if both and are the same for both objects. 
- ///// - //public override bool Equals(System.Object o) - //{ - // if (this == o) - // return true; - // if (!(o is TermVectorOffsetInfo)) - // return false; - - // TermVectorOffsetInfo termVectorOffsetInfo = (TermVectorOffsetInfo) o; - - // if (endOffset != termVectorOffsetInfo.endOffset) - // return false; - // if (startOffset != termVectorOffsetInfo.startOffset) - // return false; - - // return true; - //} - - //public override int GetHashCode() - //{ - // int result; - // result = startOffset; - // result = 29 * result + endOffset; - // return result; - //} - - - public bool Equals(TermVectorOffsetInfo other) - { - return startOffset == other.startOffset && endOffset == other.endOffset; - } - - public override bool Equals(object obj) - { - if (ReferenceEquals(null, obj)) - { - return EndOffset == int.MinValue && StartOffset == int.MinValue; - } - if (obj.GetType() != typeof (TermVectorOffsetInfo)) return false; - return Equals((TermVectorOffsetInfo) obj); - } - - public override int GetHashCode() - { - unchecked - { - return (startOffset*397) ^ endOffset; - } - } - - public static bool operator ==(TermVectorOffsetInfo left, object right) - { - return left.Equals(right); - } - - public static bool operator !=(TermVectorOffsetInfo left, object right) - { - return !left.Equals(right); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsReader.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Index/TermVectorsReader.cs b/src/Lucene.Net.Core/Index/TermVectorsReader.cs deleted file mode 100644 index 23677a9..0000000 --- a/src/Lucene.Net.Core/Index/TermVectorsReader.cs +++ /dev/null @@ -1,731 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; - -using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput; -using Directory = Lucene.Net.Store.Directory; -using IndexInput = Lucene.Net.Store.IndexInput; - -namespace Lucene.Net.Index -{ - class TermVectorsReader : System.ICloneable, IDisposable - { - - // NOTE: if you make a new format, it must be larger than - // the current format - internal const int FORMAT_VERSION = 2; - - // Changes to speed up bulk merging of term vectors: - internal const int FORMAT_VERSION2 = 3; - - // Changed strings to UTF8 with length-in-bytes not length-in-chars - internal const int FORMAT_UTF8_LENGTH_IN_BYTES = 4; - - // NOTE: always change this if you switch to a new format! - internal static readonly int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES; - - //The size in bytes that the FORMAT_VERSION will take up at the beginning of each file - internal const int FORMAT_SIZE = 4; - - internal const byte STORE_POSITIONS_WITH_TERMVECTOR = (byte) (0x1); - internal const byte STORE_OFFSET_WITH_TERMVECTOR = (byte) (0x2); - - private FieldInfos fieldInfos; - - private IndexInput tvx; - private IndexInput tvd; - private IndexInput tvf; - private int size; - private int numTotalDocs; - - // The docID offset where our docs begin in the index - // file. This will be 0 if we have our own private file. 
- private int docStoreOffset; - - private int format; - private bool isDisposed; - - internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos):this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE) - { - } - - internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize):this(d, segment, fieldInfos, readBufferSize, - 1, 0) - { - } - - internal TermVectorsReader(Directory d, System.String segment, FieldInfos fieldInfos, int readBufferSize, int docStoreOffset, int size) - { - bool success = false; - - try - { - if (d.FileExists(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION)) - { - tvx = d.OpenInput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION, readBufferSize); - format = CheckValidFormat(tvx); - tvd = d.OpenInput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION, readBufferSize); - int tvdFormat = CheckValidFormat(tvd); - tvf = d.OpenInput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION, readBufferSize); - int tvfFormat = CheckValidFormat(tvf); - - System.Diagnostics.Debug.Assert(format == tvdFormat); - System.Diagnostics.Debug.Assert(format == tvfFormat); - - if (format >= FORMAT_VERSION2) - { - System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 16 == 0); - numTotalDocs = (int)(tvx.Length() >> 4); - } - else - { - System.Diagnostics.Debug.Assert((tvx.Length() - FORMAT_SIZE) % 8 == 0); - numTotalDocs = (int)(tvx.Length() >> 3); - } - - if (-1 == docStoreOffset) - { - this.docStoreOffset = 0; - this.size = numTotalDocs; - System.Diagnostics.Debug.Assert(size == 0 || numTotalDocs == size); - } - else - { - this.docStoreOffset = docStoreOffset; - this.size = size; - // Verify the file is long enough to hold all of our - // docs - System.Diagnostics.Debug.Assert(numTotalDocs >= size + docStoreOffset, "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset); - } - } - else - { - // If all documents flushed in a 
segment had hit - // non-aborting exceptions, it's possible that - // FieldInfos.hasVectors returns true yet the term - // vector files don't exist. - format = 0; - } - - - this.fieldInfos = fieldInfos; - success = true; - } - finally - { - // With lock-less commits, it's entirely possible (and - // fine) to hit a FileNotFound exception above. In - // this case, we want to explicitly close any subset - // of things that were opened so that we don't have to - // wait for a GC to do so. - if (!success) - { - Dispose(); - } - } - } - - // Used for bulk copy when merging - internal virtual IndexInput GetTvdStream() - { - return tvd; - } - - // Used for bulk copy when merging - internal virtual IndexInput GetTvfStream() - { - return tvf; - } - - private void SeekTvx(int docNum) - { - if (format < FORMAT_VERSION2) - tvx.Seek((docNum + docStoreOffset) * 8L + FORMAT_SIZE); - else - tvx.Seek((docNum + docStoreOffset) * 16L + FORMAT_SIZE); - } - - internal virtual bool CanReadRawDocs() - { - return format >= FORMAT_UTF8_LENGTH_IN_BYTES; - } - - /// Retrieve the length (in bytes) of the tvd and tvf - /// entries for the next numDocs starting with - /// startDocID. This is used for bulk copying when - /// merging segments, if the field numbers are - /// congruent. Once this returns, the tvf & tvd streams - /// are seeked to the startDocID. - /// - internal void RawDocs(int[] tvdLengths, int[] tvfLengths, int startDocID, int numDocs) - { - - if (tvx == null) - { - for (int i = 0; i < tvdLengths.Length; i++) - { - tvdLengths[i] = 0; - } - for (int i = 0; i < tvfLengths.Length; i++) - { - tvfLengths[i] = 0; - } - return ; - } - - // SegmentMerger calls canReadRawDocs() first and should - // not call us if that returns false. 
- if (format < FORMAT_VERSION2) - throw new System.SystemException("cannot read raw docs with older term vector formats"); - - SeekTvx(startDocID); - - long tvdPosition = tvx.ReadLong(); - tvd.Seek(tvdPosition); - - long tvfPosition = tvx.ReadLong(); - tvf.Seek(tvfPosition); - - long lastTvdPosition = tvdPosition; - long lastTvfPosition = tvfPosition; - - int count = 0; - while (count < numDocs) - { - int docID = docStoreOffset + startDocID + count + 1; - System.Diagnostics.Debug.Assert(docID <= numTotalDocs); - if (docID < numTotalDocs) - { - tvdPosition = tvx.ReadLong(); - tvfPosition = tvx.ReadLong(); - } - else - { - tvdPosition = tvd.Length(); - tvfPosition = tvf.Length(); - System.Diagnostics.Debug.Assert(count == numDocs - 1); - } - tvdLengths[count] = (int) (tvdPosition - lastTvdPosition); - tvfLengths[count] = (int) (tvfPosition - lastTvfPosition); - count++; - lastTvdPosition = tvdPosition; - lastTvfPosition = tvfPosition; - } - } - - private int CheckValidFormat(IndexInput in_Renamed) - { - int format = in_Renamed.ReadInt(); - if (format > FORMAT_CURRENT) - { - throw new CorruptIndexException("Incompatible format version: " + format + " expected " + FORMAT_CURRENT + " or less"); - } - return format; - } - - public void Dispose() - { - Dispose(true); - } - - protected virtual void Dispose(bool disposing) - { - if (isDisposed) return; - - if (disposing) - { - // make all effort to close up. Keep the first exception - // and throw it as a new one. 
- System.IO.IOException keep = null; - if (tvx != null) - try - { - tvx.Close(); - } - catch (System.IO.IOException e) - { - if (keep == null) - keep = e; - } - if (tvd != null) - try - { - tvd.Close(); - } - catch (System.IO.IOException e) - { - if (keep == null) - keep = e; - } - if (tvf != null) - try - { - tvf.Close(); - } - catch (System.IO.IOException e) - { - if (keep == null) - keep = e; - } - if (keep != null) - { - throw new System.IO.IOException(keep.StackTrace); - } - } - - isDisposed = true; - } - - /// - /// The number of documents in the reader - /// - internal virtual int Size() - { - return size; - } - - public virtual void Get(int docNum, System.String field, TermVectorMapper mapper) - { - if (tvx != null) - { - int fieldNumber = fieldInfos.FieldNumber(field); - //We need to account for the FORMAT_SIZE at when seeking in the tvx - //We don't need to do this in other seeks because we already have the - // file pointer - //that was written in another file - SeekTvx(docNum); - //System.out.println("TVX Pointer: " + tvx.getFilePointer()); - long tvdPosition = tvx.ReadLong(); - - tvd.Seek(tvdPosition); - int fieldCount = tvd.ReadVInt(); - //System.out.println("Num Fields: " + fieldCount); - // There are only a few fields per document. We opt for a full scan - // rather then requiring that they be ordered. We need to read through - // all of the fields anyway to get to the tvf pointers. 
- int number = 0; - int found = - 1; - for (int i = 0; i < fieldCount; i++) - { - if (format >= FORMAT_VERSION) - number = tvd.ReadVInt(); - else - number += tvd.ReadVInt(); - - if (number == fieldNumber) - found = i; - } - - // This field, although valid in the segment, was not found in this - // document - if (found != - 1) - { - // Compute position in the tvf file - long position; - if (format >= FORMAT_VERSION2) - position = tvx.ReadLong(); - else - position = tvd.ReadVLong(); - for (int i = 1; i <= found; i++) - position += tvd.ReadVLong(); - - mapper.SetDocumentNumber(docNum); - ReadTermVector(field, position, mapper); - } - else - { - //System.out.println("Fieldable not found"); - } - } - else - { - //System.out.println("No tvx file"); - } - } - - - - /// Retrieve the term vector for the given document and field - /// The document number to retrieve the vector for - /// - /// The field within the document to retrieve - /// - /// The TermFreqVector for the document and field or null if there is no termVector for this field. 
- /// - /// IOException if there is an error reading the term vector files - public /*internal*/ virtual ITermFreqVector Get(int docNum, System.String field) - { - // Check if no term vectors are available for this segment at all - ParallelArrayTermVectorMapper mapper = new ParallelArrayTermVectorMapper(); - Get(docNum, field, mapper); - - return mapper.MaterializeVector(); - } - - // Reads the String[] fields; you have to pre-seek tvd to - // the right point - private System.String[] ReadFields(int fieldCount) - { - int number = 0; - System.String[] fields = new System.String[fieldCount]; - - for (int i = 0; i < fieldCount; i++) - { - if (format >= FORMAT_VERSION) - number = tvd.ReadVInt(); - else - number += tvd.ReadVInt(); - - fields[i] = fieldInfos.FieldName(number); - } - - return fields; - } - - // Reads the long[] offsets into TVF; you have to pre-seek - // tvx/tvd to the right point - private long[] ReadTvfPointers(int fieldCount) - { - // Compute position in the tvf file - long position; - if (format >= FORMAT_VERSION2) - position = tvx.ReadLong(); - else - position = tvd.ReadVLong(); - - long[] tvfPointers = new long[fieldCount]; - tvfPointers[0] = position; - - for (int i = 1; i < fieldCount; i++) - { - position += tvd.ReadVLong(); - tvfPointers[i] = position; - } - - return tvfPointers; - } - - /// Return all term vectors stored for this document or null if the could not be read in. 
- /// - /// - /// The document number to retrieve the vector for - /// - /// All term frequency vectors - /// - /// IOException if there is an error reading the term vector files - public /*internal*/ virtual ITermFreqVector[] Get(int docNum) - { - ITermFreqVector[] result = null; - if (tvx != null) - { - //We need to offset by - SeekTvx(docNum); - long tvdPosition = tvx.ReadLong(); - - tvd.Seek(tvdPosition); - int fieldCount = tvd.ReadVInt(); - - // No fields are vectorized for this document - if (fieldCount != 0) - { - System.String[] fields = ReadFields(fieldCount); - long[] tvfPointers = ReadTvfPointers(fieldCount); - result = ReadTermVectors(docNum, fields, tvfPointers); - } - } - else - { - //System.out.println("No tvx file"); - } - return result; - } - - public virtual void Get(int docNumber, TermVectorMapper mapper) - { - // Check if no term vectors are available for this segment at all - if (tvx != null) - { - //We need to offset by - - SeekTvx(docNumber); - long tvdPosition = tvx.ReadLong(); - - tvd.Seek(tvdPosition); - int fieldCount = tvd.ReadVInt(); - - // No fields are vectorized for this document - if (fieldCount != 0) - { - System.String[] fields = ReadFields(fieldCount); - long[] tvfPointers = ReadTvfPointers(fieldCount); - mapper.SetDocumentNumber(docNumber); - ReadTermVectors(fields, tvfPointers, mapper); - } - } - else - { - //System.out.println("No tvx file"); - } - } - - - private SegmentTermVector[] ReadTermVectors(int docNum, System.String[] fields, long[] tvfPointers) - { - SegmentTermVector[] res = new SegmentTermVector[fields.Length]; - for (int i = 0; i < fields.Length; i++) - { - var mapper = new ParallelArrayTermVectorMapper(); - mapper.SetDocumentNumber(docNum); - ReadTermVector(fields[i], tvfPointers[i], mapper); - res[i] = (SegmentTermVector) mapper.MaterializeVector(); - } - return res; - } - - private void ReadTermVectors(System.String[] fields, long[] tvfPointers, TermVectorMapper mapper) - { - for (int i = 0; i < fields.Length; 
i++) - { - ReadTermVector(fields[i], tvfPointers[i], mapper); - } - } - - - /// - /// The field to read in - /// - /// The pointer within the tvf file where we should start reading - /// - /// The mapper used to map the TermVector - /// - /// IOException - private void ReadTermVector(System.String field, long tvfPointer, TermVectorMapper mapper) - { - - // Now read the data from specified position - //We don't need to offset by the FORMAT here since the pointer already includes the offset - tvf.Seek(tvfPointer); - - int numTerms = tvf.ReadVInt(); - //System.out.println("Num Terms: " + numTerms); - // If no terms - return a constant empty termvector. However, this should never occur! - if (numTerms == 0) - return ; - - bool storePositions; - bool storeOffsets; - - if (format >= FORMAT_VERSION) - { - byte bits = tvf.ReadByte(); - storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; - storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; - } - else - { - tvf.ReadVInt(); - storePositions = false; - storeOffsets = false; - } - mapper.SetExpectations(field, numTerms, storeOffsets, storePositions); - int start = 0; - int deltaLength = 0; - int totalLength = 0; - byte[] byteBuffer; - char[] charBuffer; - bool preUTF8 = format < FORMAT_UTF8_LENGTH_IN_BYTES; - - // init the buffers - if (preUTF8) - { - charBuffer = new char[10]; - byteBuffer = null; - } - else - { - charBuffer = null; - byteBuffer = new byte[20]; - } - - for (int i = 0; i < numTerms; i++) - { - start = tvf.ReadVInt(); - deltaLength = tvf.ReadVInt(); - totalLength = start + deltaLength; - - System.String term; - - if (preUTF8) - { - // Term stored as java chars - if (charBuffer.Length < totalLength) - { - char[] newCharBuffer = new char[(int) (1.5 * totalLength)]; - Array.Copy(charBuffer, 0, newCharBuffer, 0, start); - charBuffer = newCharBuffer; - } - tvf.ReadChars(charBuffer, start, deltaLength); - term = new System.String(charBuffer, 0, totalLength); - } - else - { - // Term stored as utf8 
bytes - if (byteBuffer.Length < totalLength) - { - byte[] newByteBuffer = new byte[(int) (1.5 * totalLength)]; - Array.Copy(byteBuffer, 0, newByteBuffer, 0, start); - byteBuffer = newByteBuffer; - } - tvf.ReadBytes(byteBuffer, start, deltaLength); - term = System.Text.Encoding.UTF8.GetString(byteBuffer, 0, totalLength); - } - int freq = tvf.ReadVInt(); - int[] positions = null; - if (storePositions) - { - //read in the positions - //does the mapper even care about positions? - if (mapper.IsIgnoringPositions == false) - { - positions = new int[freq]; - int prevPosition = 0; - for (int j = 0; j < freq; j++) - { - positions[j] = prevPosition + tvf.ReadVInt(); - prevPosition = positions[j]; - } - } - else - { - //we need to skip over the positions. Since these are VInts, I don't believe there is anyway to know for sure how far to skip - // - for (int j = 0; j < freq; j++) - { - tvf.ReadVInt(); - } - } - } - TermVectorOffsetInfo[] offsets = null; - if (storeOffsets) - { - //does the mapper even care about offsets? 
- if (mapper.IsIgnoringOffsets == false) - { - offsets = new TermVectorOffsetInfo[freq]; - int prevOffset = 0; - for (int j = 0; j < freq; j++) - { - int startOffset = prevOffset + tvf.ReadVInt(); - int endOffset = startOffset + tvf.ReadVInt(); - offsets[j] = new TermVectorOffsetInfo(startOffset, endOffset); - prevOffset = endOffset; - } - } - else - { - for (int j = 0; j < freq; j++) - { - tvf.ReadVInt(); - tvf.ReadVInt(); - } - } - } - mapper.Map(term, freq, offsets, positions); - } - } - - public virtual System.Object Clone() - { - - TermVectorsReader clone = (TermVectorsReader) base.MemberwiseClone(); - - // These are null when a TermVectorsReader was created - // on a segment that did not have term vectors saved - if (tvx != null && tvd != null && tvf != null) - { - clone.tvx = (IndexInput) tvx.Clone(); - clone.tvd = (IndexInput) tvd.Clone(); - clone.tvf = (IndexInput) tvf.Clone(); - } - - return clone; - } - } - - - /// Models the existing parallel array structure - class ParallelArrayTermVectorMapper:TermVectorMapper - { - - private System.String[] terms; - private int[] termFreqs; - private int[][] positions; - private TermVectorOffsetInfo[][] offsets; - private int currentPosition; - private bool storingOffsets; - private bool storingPositions; - private System.String field; - - public override void SetExpectations(System.String field, int numTerms, bool storeOffsets, bool storePositions) - { - this.field = field; - terms = new System.String[numTerms]; - termFreqs = new int[numTerms]; - this.storingOffsets = storeOffsets; - this.storingPositions = storePositions; - if (storePositions) - this.positions = new int[numTerms][]; - if (storeOffsets) - this.offsets = new TermVectorOffsetInfo[numTerms][]; - } - - public override void Map(System.String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) - { - terms[currentPosition] = term; - termFreqs[currentPosition] = frequency; - if (storingOffsets) - { - this.offsets[currentPosition] = 
offsets; - } - if (storingPositions) - { - this.positions[currentPosition] = positions; - } - currentPosition++; - } - - /// Construct the vector - /// The based on the mappings. - /// - public virtual ITermFreqVector MaterializeVector() - { - SegmentTermVector tv = null; - if (field != null && terms != null) - { - if (storingPositions || storingOffsets) - { - tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets); - } - else - { - tv = new SegmentTermVector(field, terms, termFreqs); - } - } - return tv; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs b/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs deleted file mode 100644 index 8d07924..0000000 --- a/src/Lucene.Net.Core/Index/TermVectorsTermsWriter.cs +++ /dev/null @@ -1,380 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
using System;
using System.Collections.Generic;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
using ArrayUtil = Lucene.Net.Util.ArrayUtil;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Terms-hash consumer that appends per-document term vectors to the three
    /// term vector outputs of the current doc store segment: <c>tvx</c> (index),
    /// <c>tvd</c> (documents) and <c>tvf</c> (fields).
    /// </summary>
    /// <remarks>
    /// Most members synchronize on <c>this</c>; <see cref="Abort"/> and
    /// <see cref="Fill"/> do not take the lock themselves.
    /// </remarks>
    sealed class TermVectorsTermsWriter : TermsHashConsumer
    {
        internal DocumentsWriter docWriter;
        internal TermVectorsWriter termVectorsWriter;

        // Pool of recycled PerDoc instances; the first freeCount slots are valid.
        internal PerDoc[] docFreeList;
        internal int freeCount;

        // Number of PerDoc instances ever allocated by GetPerDoc().
        internal int allocCount;

        internal IndexOutput tvx;
        internal IndexOutput tvd;
        internal IndexOutput tvf;

        // Next doc (in doc-store numbering) that has not yet been written to tvx.
        internal int lastDocID;

        public TermVectorsTermsWriter(DocumentsWriter docWriter)
        {
            docFreeList = new PerDoc[1];
            this.docWriter = docWriter;
        }

        /// <summary>Creates the per-thread consumer bound to this writer.</summary>
        public override TermsHashConsumerPerThread AddThread(TermsHashPerThread termsHashPerThread)
        {
            return new TermVectorsTermsWriterPerThread(termsHashPerThread, this);
        }

        /// <summary>Fills <paramref name="postings"/>[start, start + count) with fresh <see cref="PostingList"/> instances.</summary>
        internal override void CreatePostings(RawPostingList[] postings, int start, int count)
        {
            int end = start + count;
            for (int i = start; i < end; i++)
            {
                postings[i] = new PostingList();
            }
        }

        /// <summary>
        /// Flushes the open term vector outputs (if any) and resets the hash
        /// state of every per-field and per-thread consumer.
        /// </summary>
        /// <param name="threadsAndFields">Per-thread consumers mapped to their per-field consumers.</param>
        /// <param name="state">State of the segment being flushed.</param>
        public override void Flush(IDictionary<TermsHashConsumerPerThread, ICollection<TermsHashConsumerPerField>> threadsAndFields, SegmentWriteState state)
        {
            lock (this)
            {
                // NOTE: it's possible that all documents seen in this segment
                // hit non-aborting exceptions, in which case we will
                // not have yet init'd the TermVectorsWriter.  This is
                // actually OK (unlike in the stored fields case)
                // because, although FieldInfos.hasVectors() will return
                // true, the TermVectorsReader gracefully handles
                // non-existence of the term vectors files.
                if (tvx != null)
                {
                    if (state.numDocsInStore > 0)
                    {
                        // In case there are some final documents that we
                        // didn't see (because they hit a non-aborting exception):
                        Fill(state.numDocsInStore - docWriter.DocStoreOffset);
                    }

                    tvx.Flush();
                    tvd.Flush();
                    tvf.Flush();
                }

                foreach (var entry in threadsAndFields)
                {
                    foreach (var field in entry.Value)
                    {
                        var perField = (TermVectorsTermsWriterPerField) field;
                        perField.termsHashPerField.Reset();
                        perField.ShrinkHash();
                    }

                    var perThread = (TermVectorsTermsWriterPerThread) entry.Key;
                    perThread.termsHashPerThread.Reset(true);
                }
            }
        }

        /// <summary>
        /// Closes the three term vector outputs, verifies the size of the tvx
        /// file and registers the files as flushed. Does nothing when no output
        /// was ever opened.
        /// </summary>
        /// <exception cref="System.SystemException">The tvx file length does not match the number of docs in the store.</exception>
        internal override void CloseDocStore(SegmentWriteState state)
        {
            lock (this)
            {
                if (tvx == null)
                {
                    return;
                }

                // At least one doc in this run had term vectors
                // enabled
                Fill(state.numDocsInStore - docWriter.DocStoreOffset);
                tvx.Close();
                tvf.Close();
                tvd.Close();

                // Drop all three references so that a later Abort() does not
                // try to close streams that are already closed.
                tvx = null;
                tvd = null;
                tvf = null;

                System.Diagnostics.Debug.Assert(state.docStoreSegmentName != null);
                string indexFile = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
                string fieldsFile = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION;
                string documentsFile = state.docStoreSegmentName + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION;

                // tvx holds a 4-byte format header plus two longs (16 bytes) per doc.
                if (4 + ((long) state.numDocsInStore) * 16 != state.directory.FileLength(indexFile))
                {
                    throw new System.SystemException("after flush: tvx size mismatch: " + state.numDocsInStore + " docs vs " + state.directory.FileLength(indexFile) + " length in bytes of " + indexFile + " file exists?=" + state.directory.FileExists(indexFile));
                }

                state.flushedFiles.Add(indexFile);
                state.flushedFiles.Add(fieldsFile);
                state.flushedFiles.Add(documentsFile);

                docWriter.RemoveOpenFile(indexFile);
                docWriter.RemoveOpenFile(fieldsFile);
                docWriter.RemoveOpenFile(documentsFile);

                lastDocID = 0;
            }
        }

        /// <summary>
        /// Returns a recycled <see cref="PerDoc"/> from the free list, or
        /// allocates a new one when the free list is empty.
        /// </summary>
        internal PerDoc GetPerDoc()
        {
            lock (this)
            {
                if (freeCount != 0)
                {
                    return docFreeList[--freeCount];
                }

                allocCount++;
                if (allocCount > docFreeList.Length)
                {
                    // Grow our free list up front to make sure we have
                    // enough space to recycle all outstanding PerDoc
                    // instances
                    System.Diagnostics.Debug.Assert(allocCount == 1 + docFreeList.Length);
                    docFreeList = new PerDoc[ArrayUtil.GetNextSize(allocCount)];
                }
                return new PerDoc(this);
            }
        }

        /// <summary>
        /// Fills in no-term-vectors for all docs we haven't seen
        /// since the last doc that had term vectors.
        /// </summary>
        /// <param name="docID">Doc id relative to the doc store offset; entries are written up to, but excluding, this doc.</param>
        internal void Fill(int docID)
        {
            int docStoreOffset = docWriter.DocStoreOffset;
            int end = docID + docStoreOffset;
            if (lastDocID < end)
            {
                // Nothing is written to tvf here, so its position is loop-invariant.
                long tvfPosition = tvf.FilePointer;
                while (lastDocID < end)
                {
                    tvx.WriteLong(tvd.FilePointer);
                    tvd.WriteVInt(0);
                    tvx.WriteLong(tvfPosition);
                    lastDocID++;
                }
            }
        }

        /// <summary>
        /// Lazily opens the three term vector outputs for the current doc store
        /// segment and writes the format header to each. No-op when the outputs
        /// are already open or there is no doc store segment.
        /// </summary>
        internal void InitTermVectorsWriter()
        {
            lock (this)
            {
                if (tvx != null)
                {
                    return;
                }

                string docStoreSegment = docWriter.DocStoreSegment;
                if (docStoreSegment == null)
                {
                    return;
                }

                string indexFile = docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
                string documentsFile = docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION;
                string fieldsFile = docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION;

                // If we hit an exception while init'ing the term
                // vector output files, we must abort this segment
                // because those files will be in an unknown
                // state:
                tvx = docWriter.directory.CreateOutput(indexFile);
                tvd = docWriter.directory.CreateOutput(documentsFile);
                tvf = docWriter.directory.CreateOutput(fieldsFile);

                tvx.WriteInt(TermVectorsReader.FORMAT_CURRENT);
                tvd.WriteInt(TermVectorsReader.FORMAT_CURRENT);
                tvf.WriteInt(TermVectorsReader.FORMAT_CURRENT);

                docWriter.AddOpenFile(indexFile);
                docWriter.AddOpenFile(fieldsFile);
                docWriter.AddOpenFile(documentsFile);

                lastDocID = 0;
            }
        }

        /// <summary>
        /// Appends the buffered term vectors of one document to the real
        /// outputs, then resets the <paramref name="perDoc"/> and returns it to
        /// the free list.
        /// </summary>
        internal void FinishDocument(PerDoc perDoc)
        {
            lock (this)
            {
                System.Diagnostics.Debug.Assert(docWriter.writer.TestPoint("TermVectorsTermsWriter.finishDocument start"));

                InitTermVectorsWriter();

                Fill(perDoc.docID);

                // Append term vectors to the real outputs:
                tvx.WriteLong(tvd.FilePointer);
                tvx.WriteLong(tvf.FilePointer);
                tvd.WriteVInt(perDoc.numVectorFields);
                if (perDoc.numVectorFields > 0)
                {
                    for (int i = 0; i < perDoc.numVectorFields; i++)
                    {
                        tvd.WriteVInt(perDoc.fieldNumbers[i]);
                    }

                    // Field pointers are delta-encoded; the first one is always 0
                    // and therefore not written.
                    System.Diagnostics.Debug.Assert(0 == perDoc.fieldPointers[0]);
                    long lastPos = perDoc.fieldPointers[0];
                    for (int i = 1; i < perDoc.numVectorFields; i++)
                    {
                        long pos = perDoc.fieldPointers[i];
                        tvd.WriteVLong(pos - lastPos);
                        lastPos = pos;
                    }
                    perDoc.perDocTvf.WriteTo(tvf);
                    perDoc.numVectorFields = 0;
                }

                System.Diagnostics.Debug.Assert(lastDocID == perDoc.docID + docWriter.DocStoreOffset);

                lastDocID++;
                perDoc.Reset();
                Free(perDoc);
                System.Diagnostics.Debug.Assert(docWriter.writer.TestPoint("TermVectorsTermsWriter.finishDocument end"));
            }
        }

        /// <summary>Always returns <c>false</c>: no RAM is released here.</summary>
        public bool FreeRAM()
        {
            // We don't hold any state beyond one doc, so we don't
            // free persistent RAM here
            return false;
        }

        /// <summary>
        /// Closes and forgets all three outputs on a best-effort basis and
        /// resets <see cref="lastDocID"/>.
        /// </summary>
        public override void Abort()
        {
            CloseQuietly(tvx);
            tvx = null;
            CloseQuietly(tvd);
            tvd = null;
            CloseQuietly(tvf);
            tvf = null;
            lastDocID = 0;
        }

        // Closes the output if it is non-null, deliberately ignoring any failure:
        // during an abort the files are discarded anyway.
        private static void CloseQuietly(IndexOutput output)
        {
            if (output == null)
            {
                return;
            }
            try
            {
                output.Close();
            }
            catch (System.Exception)
            {
            }
        }

        /// <summary>Returns <paramref name="doc"/> to the free list for reuse by <see cref="GetPerDoc"/>.</summary>
        internal void Free(PerDoc doc)
        {
            lock (this)
            {
                System.Diagnostics.Debug.Assert(freeCount < docFreeList.Length);
                docFreeList[freeCount++] = doc;
            }
        }

        /// <summary>
        /// Buffers the term vectors of a single document in a
        /// <see cref="RAMOutputStream"/> until they are appended to the real
        /// outputs by <see cref="TermVectorsTermsWriter.FinishDocument"/>.
        /// </summary>
        internal class PerDoc : DocumentsWriter.DocWriter
        {
            private TermVectorsTermsWriter enclosingInstance;

            internal DocumentsWriter.PerDocBuffer buffer;
            internal RAMOutputStream perDocTvf;
            internal int numVectorFields;

            // Parallel arrays: field number and start offset within perDocTvf
            // for each of the first numVectorFields fields.
            internal int[] fieldNumbers = new int[1];
            internal long[] fieldPointers = new long[1];

            public PerDoc(TermVectorsTermsWriter enclosingInstance)
            {
                this.enclosingInstance = enclosingInstance;
                buffer = enclosingInstance.docWriter.NewPerDocBuffer();
                perDocTvf = new RAMOutputStream(buffer);
            }

            public TermVectorsTermsWriter Enclosing_Instance
            {
                get { return enclosingInstance; }
            }

            /// <summary>Clears the buffered bytes, recycles the buffer and forgets all fields.</summary>
            internal void Reset()
            {
                perDocTvf.Reset();
                buffer.Recycle();
                numVectorFields = 0;
            }

            /// <summary>Discards the buffered data and returns this instance to the owner's free list.</summary>
            public override void Abort()
            {
                Reset();
                Enclosing_Instance.Free(this);
            }

            /// <summary>
            /// Records that the data of field <paramref name="fieldNumber"/>
            /// starts at the current position of the per-doc buffer.
            /// </summary>
            internal void AddField(int fieldNumber)
            {
                if (numVectorFields == fieldNumbers.Length)
                {
                    fieldNumbers = ArrayUtil.Grow(fieldNumbers);
                    fieldPointers = ArrayUtil.Grow(fieldPointers);
                }
                fieldNumbers[numVectorFields] = fieldNumber;
                fieldPointers[numVectorFields] = perDocTvf.FilePointer;
                numVectorFields++;
            }

            public override long SizeInBytes()
            {
                return buffer.SizeInBytes;
            }

            /// <summary>Hands this document over to the owning writer.</summary>
            public override void Finish()
            {
                Enclosing_Instance.FinishDocument(this);
            }
        }

        internal sealed class PostingList : RawPostingList
        {
            internal int freq;          // How many times this term occurred in the current doc
            internal int lastOffset;    // Last offset we saw
            internal int lastPosition;  // Last position where this term occurred
        }

        /// <summary>Size of one <see cref="PostingList"/>: the raw posting plus its three int fields.</summary>
        internal override int BytesPerPosting()
        {
            return RawPostingList.BYTES_SIZE + 3 * DocumentsWriter.INT_NUM_BYTE;
        }
    }
}
using System;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Documents;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Per-field consumer that records term frequencies, positions and offsets
    /// into the terms hash and, at the end of each field, serializes them into
    /// the per-document term vector buffer.
    /// </summary>
    sealed class TermVectorsTermsWriterPerField : TermsHashConsumerPerField
    {
        internal TermVectorsTermsWriterPerThread perThread;
        internal TermsHashPerField termsHashPerField;
        internal TermVectorsTermsWriter termsWriter;
        internal FieldInfo fieldInfo;
        internal DocumentsWriter.DocState docState;
        internal FieldInvertState fieldState;

        internal bool doVectors;
        internal bool doVectorPositions;
        internal bool doVectorOffsets;

        // Largest number of postings seen since the last ShrinkHash().
        internal int maxNumPostings;
        internal IOffsetAttribute offsetAttribute = null;

        public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo)
        {
            this.termsHashPerField = termsHashPerField;
            this.perThread = perThread;
            this.termsWriter = perThread.termsWriter;
            this.fieldInfo = fieldInfo;
            docState = termsHashPerField.docState;
            fieldState = termsHashPerField.fieldState;
        }

        /// <summary>Two byte streams per term: stream 0 holds positions, stream 1 holds offsets.</summary>
        internal override int GetStreamCount()
        {
            return 2;
        }

        /// <summary>
        /// Decides whether term vectors (and positions/offsets) are wanted for
        /// this field in the current document, and prepares the per-doc buffer
        /// if so.
        /// </summary>
        /// <param name="fields">Field instances of this field in the current document.</param>
        /// <param name="count">Number of valid entries in <paramref name="fields"/>.</param>
        /// <returns><c>true</c> if at least one instance is indexed and stores term vectors.</returns>
        internal override bool Start(IFieldable[] fields, int count)
        {
            doVectors = false;
            doVectorPositions = false;
            doVectorOffsets = false;

            for (int i = 0; i < count; i++)
            {
                IFieldable field = fields[i];
                if (field.IsIndexed && field.IsTermVectorStored)
                {
                    doVectors = true;
                    doVectorPositions |= field.IsStorePositionWithTermVector;
                    doVectorOffsets |= field.IsStoreOffsetWithTermVector;
                }
            }

            if (doVectors)
            {
                if (perThread.doc == null)
                {
                    perThread.doc = termsWriter.GetPerDoc();
                    perThread.doc.docID = docState.docID;
                    System.Diagnostics.Debug.Assert(perThread.doc.numVectorFields == 0);
                    System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.Length);
                    System.Diagnostics.Debug.Assert(0 == perThread.doc.perDocTvf.FilePointer);
                }

                System.Diagnostics.Debug.Assert(perThread.doc.docID == docState.docID);
                if (termsHashPerField.numPostings != 0)
                {
                    // Only necessary if previous doc hit a
                    // non-aborting exception while writing vectors in
                    // this field:
                    termsHashPerField.Reset();
                    perThread.termsHashPerThread.Reset(false);
                }
            }

            // TODO: only if needed for performance
            //perThread.postingsCount = 0;

            return doVectors;
        }

        public void Abort()
        {
        }

        /// <summary>
        /// Called once per field per document if term vectors
        /// are enabled, to write the vectors to
        /// RAMOutputStream, which is then quickly flushed to
        /// the real term vectors files in the Directory.
        /// </summary>
        internal override void Finish()
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));

            int numPostings = termsHashPerField.numPostings;

            System.Diagnostics.Debug.Assert(numPostings >= 0);

            if (!doVectors || numPostings == 0)
            {
                return;
            }

            if (numPostings > maxNumPostings)
            {
                maxNumPostings = numPostings;
            }

            IndexOutput tvf = perThread.doc.perDocTvf;

            // This is called once, after inverting all occurrences
            // of a given field in the doc.  At this point we flush
            // our hash into the DocWriter.

            System.Diagnostics.Debug.Assert(fieldInfo.storeTermVector);
            System.Diagnostics.Debug.Assert(perThread.VectorFieldsInOrder(fieldInfo));

            perThread.doc.AddField(termsHashPerField.fieldInfo.number);

            RawPostingList[] postings = termsHashPerField.SortPostings();

            tvf.WriteVInt(numPostings);
            byte bits = (byte) (0x0);
            if (doVectorPositions)
            {
                bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
            }
            if (doVectorOffsets)
            {
                bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
            }
            tvf.WriteByte(bits);

            int encoderUpto = 0;
            int lastTermBytesCount = 0;

            ByteSliceReader reader = perThread.vectorSliceReader;
            char[][] charBuffers = perThread.termsHashPerThread.charPool.buffers;
            for (int j = 0; j < numPostings; j++)
            {
                TermVectorsTermsWriter.PostingList posting = (TermVectorsTermsWriter.PostingList) postings[j];
                int freq = posting.freq;

                char[] text2 = charBuffers[posting.textStart >> DocumentsWriter.CHAR_BLOCK_SHIFT];
                int start2 = posting.textStart & DocumentsWriter.CHAR_BLOCK_MASK;

                // We swap between two encoders to save copying
                // last Term's byte array
                UnicodeUtil.UTF8Result utf8Result = perThread.utf8Results[encoderUpto];

                // TODO: we could do this incrementally
                UnicodeUtil.UTF16toUTF8(text2, start2, utf8Result);
                int termBytesCount = utf8Result.length;

                // TODO: UTF16toUTF8 could tell us this prefix
                // Compute common prefix between last term and
                // this term
                int prefix = 0;
                if (j > 0)
                {
                    byte[] lastTermBytes = perThread.utf8Results[1 - encoderUpto].result;
                    byte[] termBytes = perThread.utf8Results[encoderUpto].result;
                    while (prefix < lastTermBytesCount && prefix < termBytesCount)
                    {
                        if (lastTermBytes[prefix] != termBytes[prefix])
                        {
                            break;
                        }
                        prefix++;
                    }
                }
                encoderUpto = 1 - encoderUpto;
                lastTermBytesCount = termBytesCount;

                // Each term is written as (shared prefix length, suffix length, suffix bytes, freq).
                int suffix = termBytesCount - prefix;
                tvf.WriteVInt(prefix);
                tvf.WriteVInt(suffix);
                tvf.WriteBytes(utf8Result.result, prefix, suffix);
                tvf.WriteVInt(freq);

                if (doVectorPositions)
                {
                    termsHashPerField.InitReader(reader, posting, 0);
                    reader.WriteTo(tvf);
                }

                if (doVectorOffsets)
                {
                    termsHashPerField.InitReader(reader, posting, 1);
                    reader.WriteTo(tvf);
                }
            }

            termsHashPerField.Reset();

            // NOTE: we clear, per-field, at the thread level,
            // because term vectors fully write themselves on each
            // field; this saves RAM (eg if large doc has two large
            // fields w/ term vectors on) because we recycle/reuse
            // all RAM after each field:
            perThread.termsHashPerThread.Reset(false);
        }

        /// <summary>Shrinks the terms hash based on the largest posting count seen, then resets that maximum.</summary>
        internal void ShrinkHash()
        {
            termsHashPerField.ShrinkHash(maxNumPostings);
            maxNumPostings = 0;
        }

        /// <summary>
        /// Called per field instance; fetches the offset attribute from the
        /// attribute source when offsets are being stored, otherwise clears it.
        /// </summary>
        internal override void Start(IFieldable f)
        {
            if (doVectorOffsets)
            {
                offsetAttribute = fieldState.attributeSource.AddAttribute<IOffsetAttribute>();
            }
            else
            {
                offsetAttribute = null;
            }
        }

        /// <summary>
        /// Records the first occurrence of a term in the current field: offsets
        /// go to stream 1 and the position to stream 0.
        /// </summary>
        internal override void NewTerm(RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));

            TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;

            p.freq = 1;

            if (doVectorOffsets)
            {
                int startOffset = fieldState.offset + offsetAttribute.StartOffset;
                int endOffset = fieldState.offset + offsetAttribute.EndOffset;

                termsHashPerField.WriteVInt(1, startOffset);
                termsHashPerField.WriteVInt(1, endOffset - startOffset);
                p.lastOffset = endOffset;
            }

            if (doVectorPositions)
            {
                termsHashPerField.WriteVInt(0, fieldState.position);
                p.lastPosition = fieldState.position;
            }
        }

        /// <summary>
        /// Records a further occurrence of an already-seen term; offsets and
        /// positions are written as deltas against the previous occurrence.
        /// </summary>
        internal override void AddTerm(RawPostingList p0)
        {
            System.Diagnostics.Debug.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));

            TermVectorsTermsWriter.PostingList p = (TermVectorsTermsWriter.PostingList) p0;
            p.freq++;

            if (doVectorOffsets)
            {
                int startOffset = fieldState.offset + offsetAttribute.StartOffset;
                int endOffset = fieldState.offset + offsetAttribute.EndOffset;

                termsHashPerField.WriteVInt(1, startOffset - p.lastOffset);
                termsHashPerField.WriteVInt(1, endOffset - startOffset);
                p.lastOffset = endOffset;
            }

            if (doVectorPositions)
            {
                termsHashPerField.WriteVInt(0, fieldState.position - p.lastPosition);
                p.lastPosition = fieldState.position;
            }
        }

        internal override void SkippingLongTerm()
        {
        }
    }
}
using System;

using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Per-thread state of the term vectors writer: holds the in-progress
    /// <see cref="TermVectorsTermsWriter.PerDoc"/> and the scratch objects
    /// shared by this thread's per-field consumers.
    /// </summary>
    sealed class TermVectorsTermsWriterPerThread : TermsHashConsumerPerThread
    {
        internal TermVectorsTermsWriter termsWriter;
        internal TermsHashPerThread termsHashPerThread;
        internal DocumentsWriter.DocState docState;

        // Buffered vectors of the document currently being processed, or null.
        internal TermVectorsTermsWriter.PerDoc doc;

        // Used by perField when serializing the term vectors
        internal ByteSliceReader vectorSliceReader = new ByteSliceReader();

        // Two encoders, so a term can be compared against the previous one without copying.
        internal UnicodeUtil.UTF8Result[] utf8Results = new UnicodeUtil.UTF8Result[]{new UnicodeUtil.UTF8Result(), new UnicodeUtil.UTF8Result()};

        // Name of the last field checked by VectorFieldsInOrder; touched only from asserts.
        internal System.String lastVectorFieldName;

        public TermVectorsTermsWriterPerThread(TermsHashPerThread termsHashPerThread, TermVectorsTermsWriter termsWriter)
        {
            this.termsWriter = termsWriter;
            this.termsHashPerThread = termsHashPerThread;
            docState = termsHashPerThread.docState;
        }

        /// <summary>Resets a left-over per-doc buffer, if any, and binds it to the new document id.</summary>
        public override void StartDocument()
        {
            System.Diagnostics.Debug.Assert(ClearLastVectorFieldName());
            if (doc == null)
            {
                return;
            }
            doc.Reset();
            doc.docID = docState.docID;
        }

        /// <summary>Hands out the current per-doc buffer (possibly null) and forgets it.</summary>
        public override DocumentsWriter.DocWriter FinishDocument()
        {
            TermVectorsTermsWriter.PerDoc finished = doc;
            doc = null;
            return finished;
        }

        public override TermsHashConsumerPerField AddField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo)
        {
            return new TermVectorsTermsWriterPerField(termsHashPerField, this, fieldInfo);
        }

        /// <summary>Aborts and drops the current per-doc buffer, if there is one.</summary>
        public override void Abort()
        {
            if (doc == null)
            {
                return;
            }
            doc.Abort();
            doc = null;
        }

        // Called only by assert
        internal bool ClearLastVectorFieldName()
        {
            lastVectorFieldName = null;
            return true;
        }

        // Called only by assert: checks that field names arrive in strictly
        // increasing ordinal order, and remembers the name for the next call.
        internal bool VectorFieldsInOrder(FieldInfo fi)
        {
            bool inOrder = lastVectorFieldName == null
                || String.CompareOrdinal(lastVectorFieldName, fi.name) < 0;
            lastVectorFieldName = fi.name;
            return inOrder;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/228b970a/src/Lucene.Net.Core/Index/TermVectorsWriter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Index/TermVectorsWriter.cs b/src/Lucene.Net.Core/Index/TermVectorsWriter.cs deleted file mode 100644 index a689478..0000000 --- a/src/Lucene.Net.Core/Index/TermVectorsWriter.cs +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
using System;

using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using StringHelper = Lucene.Net.Util.StringHelper;
using UnicodeUtil = Lucene.Net.Util.UnicodeUtil;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Writes term vectors for a segment into three outputs: <c>tvx</c>
    /// (per-document pointers), <c>tvd</c> (per-document field lists) and
    /// <c>tvf</c> (per-field term data).
    /// </summary>
    sealed class TermVectorsWriter : IDisposable
    {
        private readonly IndexOutput tvx = null;
        private readonly IndexOutput tvd = null;
        private readonly IndexOutput tvf = null;
        private readonly FieldInfos fieldInfos;

        // Two encoders, swapped per term, so each term can be diffed against its predecessor.
        internal UnicodeUtil.UTF8Result[] utf8Results = new[]{new UnicodeUtil.UTF8Result(), new UnicodeUtil.UTF8Result()};

        /// <summary>Creates the three output files for <paramref name="segment"/> and writes the format header to each.</summary>
        public TermVectorsWriter(Directory directory, System.String segment, FieldInfos fieldInfos)
        {
            // Open files for TermVector storage
            tvx = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
            tvx.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            tvd = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
            tvd.WriteInt(TermVectorsReader.FORMAT_CURRENT);
            tvf = directory.CreateOutput(segment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
            tvf.WriteInt(TermVectorsReader.FORMAT_CURRENT);

            this.fieldInfos = fieldInfos;
        }

        /// <summary>
        /// Add a complete document specified by all its term vectors. If document has no
        /// term vectors, add value for tvx.
        /// </summary>
        /// <param name="vectors">One vector per field, or <c>null</c> for a document without term vectors.</param>
        /// <exception cref="System.SystemException">Positions or offsets are expected for a term but are null.</exception>
        public void AddAllDocVectors(ITermFreqVector[] vectors)
        {
            tvx.WriteLong(tvd.FilePointer);
            tvx.WriteLong(tvf.FilePointer);

            if (vectors == null)
            {
                tvd.WriteVInt(0);
                return;
            }

            int numFields = vectors.Length;
            tvd.WriteVInt(numFields);

            var fieldPointers = new long[numFields];

            for (int i = 0; i < numFields; i++)
            {
                fieldPointers[i] = tvf.FilePointer;

                int fieldNumber = fieldInfos.FieldNumber(vectors[i].Field);

                // 1st pass: write field numbers to tvd
                tvd.WriteVInt(fieldNumber);

                int numTerms = vectors[i].Size;
                tvf.WriteVInt(numTerms);

                TermPositionVector tpVector;

                byte bits;
                bool storePositions;
                bool storeOffsets;

                if (vectors[i] is TermPositionVector)
                {
                    // May have positions & offsets; the first term decides for the whole field.
                    tpVector = (TermPositionVector) vectors[i];
                    storePositions = tpVector.Size > 0 && tpVector.GetTermPositions(0) != null;
                    storeOffsets = tpVector.Size > 0 && tpVector.GetOffsets(0) != null;
                    bits = (byte) ((storePositions ? TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR : (byte) 0) + (storeOffsets ? TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR : (byte) 0));
                }
                else
                {
                    tpVector = null;
                    bits = 0;
                    storePositions = false;
                    storeOffsets = false;
                }

                tvf.WriteVInt(bits);

                System.String[] terms = vectors[i].GetTerms();
                int[] freqs = vectors[i].GetTermFrequencies();

                // Start each field with an empty "previous term" so the first
                // term is written with a shared prefix of 0.
                int utf8Upto = 0;
                utf8Results[1].length = 0;

                for (int j = 0; j < numTerms; j++)
                {
                    UnicodeUtil.UTF16toUTF8(terms[j], 0, terms[j].Length, utf8Results[utf8Upto]);

                    int start = StringHelper.BytesDifference(utf8Results[1 - utf8Upto].result, utf8Results[1 - utf8Upto].length, utf8Results[utf8Upto].result, utf8Results[utf8Upto].length);
                    int length = utf8Results[utf8Upto].length - start;
                    tvf.WriteVInt(start); // write shared prefix length
                    tvf.WriteVInt(length); // write delta length
                    tvf.WriteBytes(utf8Results[utf8Upto].result, start, length); // write delta bytes
                    utf8Upto = 1 - utf8Upto;

                    int termFreq = freqs[j];

                    tvf.WriteVInt(termFreq);

                    if (storePositions)
                    {
                        int[] positions = tpVector.GetTermPositions(j);
                        if (positions == null)
                        {
                            throw new System.SystemException("Trying to write positions that are null!");
                        }
                        System.Diagnostics.Debug.Assert(positions.Length == termFreq);

                        // use delta encoding for positions
                        int lastPosition = 0;
                        foreach (int position in positions)
                        {
                            tvf.WriteVInt(position - lastPosition);
                            lastPosition = position;
                        }
                    }

                    if (storeOffsets)
                    {
                        TermVectorOffsetInfo[] offsets = tpVector.GetOffsets(j);
                        if (offsets == null)
                        {
                            throw new System.SystemException("Trying to write offsets that are null!");
                        }
                        System.Diagnostics.Debug.Assert(offsets.Length == termFreq);

                        // use delta encoding for offsets
                        int lastEndOffset = 0;
                        foreach (TermVectorOffsetInfo t in offsets)
                        {
                            int startOffset = t.StartOffset;
                            int endOffset = t.EndOffset;
                            tvf.WriteVInt(startOffset - lastEndOffset);
                            tvf.WriteVInt(endOffset - startOffset);
                            lastEndOffset = endOffset;
                        }
                    }
                }
            }

            // 2nd pass: write field pointers to tvd
            if (numFields > 1)
            {
                long lastFieldPointer = fieldPointers[0];
                for (int i = 1; i < numFields; i++)
                {
                    long fieldPointer = fieldPointers[i];
                    tvd.WriteVLong(fieldPointer - lastFieldPointer);
                    lastFieldPointer = fieldPointer;
                }
            }
        }

        /// <summary>
        /// Do a bulk copy of numDocs documents from reader to our
        /// streams.  This is used to expedite merging, if the
        /// field numbers are congruent.
        /// </summary>
        /// <param name="reader">Source of the raw tvd/tvf bytes.</param>
        /// <param name="tvdLengths">Per-document byte length in the tvd stream.</param>
        /// <param name="tvfLengths">Per-document byte length in the tvf stream.</param>
        /// <param name="numDocs">Number of documents to copy.</param>
        internal void AddRawDocuments(TermVectorsReader reader, int[] tvdLengths, int[] tvfLengths, int numDocs)
        {
            long tvdPosition = tvd.FilePointer;
            long tvfPosition = tvf.FilePointer;
            long tvdStart = tvdPosition;
            long tvfStart = tvfPosition;
            for (int i = 0; i < numDocs; i++)
            {
                tvx.WriteLong(tvdPosition);
                tvdPosition += tvdLengths[i];
                tvx.WriteLong(tvfPosition);
                tvfPosition += tvfLengths[i];
            }
            tvd.CopyBytes(reader.GetTvdStream(), tvdPosition - tvdStart);
            tvf.CopyBytes(reader.GetTvfStream(), tvfPosition - tvfStart);
            System.Diagnostics.Debug.Assert(tvd.FilePointer == tvdPosition);
            System.Diagnostics.Debug.Assert(tvf.FilePointer == tvfPosition);
        }

        /// <summary>Close all streams.</summary>
        /// <exception cref="System.IO.IOException">
        /// At least one stream failed to close; the first failure is available
        /// as <see cref="System.Exception.InnerException"/>.
        /// </exception>
        public void Dispose()
        {
            // Move to a protected method if class becomes unsealed

            // make an effort to close all streams we can but remember and re-throw
            // the first exception encountered in this process
            System.IO.IOException keep = null;
            keep = CloseKeepingFirstError(tvx, keep);
            keep = CloseKeepingFirstError(tvd, keep);
            keep = CloseKeepingFirstError(tvf, keep);
            if (keep != null)
            {
                // Wrap rather than rethrow so the original message and stack
                // trace stay available through InnerException.
                throw new System.IO.IOException(keep.Message, keep);
            }
        }

        // Closes the output if it is non-null; returns the first IOException seen so far.
        private static System.IO.IOException CloseKeepingFirstError(IndexOutput output, System.IO.IOException firstError)
        {
            if (output == null)
            {
                return firstError;
            }
            try
            {
                output.Close();
            }
            catch (System.IO.IOException e)
            {
                if (firstError == null)
                {
                    firstError = e;
                }
            }
            return firstError;
        }
    }
}
using System;

namespace Lucene.Net.Index
{
    /// <summary>
    /// Contract for the per-thread half of a terms-hash consumer: it is told
    /// when a document starts and finishes, creates per-field consumers, and
    /// can be aborted.
    /// </summary>
    abstract class TermsHashConsumerPerThread
    {
        /// <summary>Called when processing of a document starts.</summary>
        public abstract void StartDocument();

        /// <summary>Called when a document is finished; returns the writer for that document.</summary>
        public abstract DocumentsWriter.DocWriter FinishDocument();

        /// <summary>Creates the per-field consumer for the given field.</summary>
        public abstract TermsHashConsumerPerField AddField(TermsHashPerField termsHashPerField, FieldInfo fieldInfo);

        /// <summary>Called to abort the work in progress.</summary>
        public abstract void Abort();
    }
}