lucy-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nwelln...@apache.org
Subject [lucy-commits] [4/4] git commit: refs/heads/cfish-string-prep1 - Rework Highlighter to use StringIterators
Date Thu, 05 Sep 2013 22:11:23 GMT
Rework Highlighter to use StringIterators

Don't scan the whole document for sentences but find sentence or word
boundaries directly from the start and end of a fragment using string
iterators. Merge Find_Best_Fragment into Raw_Excerpt.

As a side effect, this should fix LUCY-199 in an acceptable way. It
would still be nice to have a third class of boundaries which breaks on
punctuation and symbols, so we'd find boundaries like this:

    * Try to break on sentence boundaries.
    * If no sentence boundary can be found, try to break on whitespace.
    * If no whitespace can be found, try to break on punctuation and
      symbols.

We'd need a way to look up Unicode general categories for this to work,
though.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/1f51cae0
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/1f51cae0
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/1f51cae0

Branch: refs/heads/cfish-string-prep1
Commit: 1f51cae0f9e6df27bcb9c10c902fda5a8c5782a1
Parents: 52bab25
Author: Nick Wellnhofer <wellnhofer@aevum.de>
Authored: Thu Sep 5 01:34:40 2013 +0200
Committer: Nick Wellnhofer <wellnhofer@aevum.de>
Committed: Thu Sep 5 23:53:17 2013 +0200

----------------------------------------------------------------------
 core/Lucy/Highlight/Highlighter.c          | 577 +++++++++---------------
 core/Lucy/Highlight/Highlighter.cfh        |  34 +-
 core/Lucy/Test/Highlight/TestHighlighter.c | 206 ++-------
 3 files changed, 253 insertions(+), 564 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/1f51cae0/core/Lucy/Highlight/Highlighter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Highlight/Highlighter.c b/core/Lucy/Highlight/Highlighter.c
index d98c027..a4529e7 100644
--- a/core/Lucy/Highlight/Highlighter.c
+++ b/core/Lucy/Highlight/Highlighter.c
@@ -60,7 +60,6 @@ Highlighter_init(Highlighter *self, Searcher *searcher, Obj *query,
     ivars->field          = Str_Clone(field);
     ivars->excerpt_length = excerpt_length;
     ivars->slop           = excerpt_length / 3;
-    ivars->window_width   = excerpt_length + (ivars->slop * 2);
     ivars->pre_tag        = Str_new_from_trusted_utf8("<strong>", 8);
     ivars->post_tag       = Str_new_from_trusted_utf8("</strong>", 9);
     if (Query_Is_A(ivars->query, COMPILER)) {
@@ -165,7 +164,6 @@ Highlighter_Create_Excerpt_IMP(Highlighter *self, HitDoc *hit_doc) {
         return Str_new(0);
     }
     else {
-        StackString *fragment = SSTR_WRAP((String*)field_val);
         DocVector *doc_vec
             = Searcher_Fetch_Doc_Vec(ivars->searcher,
                                      HitDoc_Get_Doc_ID(hit_doc));
@@ -176,22 +174,15 @@ Highlighter_Create_Excerpt_IMP(Highlighter *self, HitDoc *hit_doc) {
         VA_Sort(score_spans, NULL, NULL);
         HeatMap *heat_map
             = HeatMap_new(score_spans, (ivars->excerpt_length * 2) / 3);
-        int32_t top
-            = Highlighter_Find_Best_Fragment(self, (String*)field_val,
-                                             (ViewCharBuf*)fragment, heat_map);
-        VArray *sentences
-            = Highlighter_Find_Sentences(self, (String*)field_val, 0,
-                                         top + ivars->window_width);
 
+        int32_t top;
         String *raw_excerpt
-            = Highlighter_Raw_Excerpt(self, (String*)field_val,
-                                      (String*)fragment, &top, heat_map,
-                                      sentences);
+            = Highlighter_Raw_Excerpt(self, (String*)field_val, &top,
+                                      heat_map);
         String *highlighted
             = Highlighter_Highlight_Excerpt(self, score_spans, raw_excerpt,
                                             top);
 
-        DECREF(sentences);
         DECREF(heat_map);
         DECREF(score_spans);
         DECREF(doc_vec);
@@ -216,294 +207,259 @@ S_hottest(HeatMap *heat_map) {
     return retval;
 }
 
-int32_t
-Highlighter_Find_Best_Fragment_IMP(Highlighter *self,
-                                   const String *field_val,
-                                   ViewCharBuf *fragment, HeatMap *heat_map) {
-    HighlighterIVARS *const ivars = Highlighter_IVARS(self);
+// Find a starting boundary after the current position given by the iterator.
+// Skip up to max_skip code points plus potential whitespace. Update the
+// iterator and return number of code points skipped. Return true if a
+// starting edge (sentence) was found.
+bool
+S_find_starting_boundary(StringIterator *top, uint32_t max_skip,
+                         uint32_t *num_skipped_ptr) {
+    // Keep track of the first word boundary.
+    StringIterator *word = NULL;
+    uint32_t word_offset = 0;
+
+    // Check if we're at a starting boundary already.
+
+    StringIterator *iter = (StringIterator*)StrIter_Clone(top);
+
+    while (true) {
+        uint32_t code_point = StrIter_Prev(iter);
+
+        if (code_point == STRITER_DONE || code_point == '.') {
+            // Skip remaining whitespace.
+            *num_skipped_ptr = StrIter_Skip_Next_Whitespace(top);
+            DECREF(iter);
+            return true;
+        }
+
+        if (StrHelp_is_whitespace(code_point)) {
+            if (word == NULL) { word = (StringIterator*)StrIter_Clone(top); }
+        }
+        else {
+            break;
+        }
+    }
 
-    // Window is 1.66 * excerpt_length, with the loc in the middle.
-    int32_t best_location = S_hottest(heat_map);
+    // Try to start on a boundary.
 
-    if (best_location < (int32_t)ivars->slop) {
-        // If the beginning of the string falls within the window centered
-        // around the hottest point in the field, start the fragment at the
-        // beginning.
-        ViewCB_Assign(fragment, (String*)field_val);
-        int32_t top = ViewCB_Trim_Top(fragment);
-        ViewCB_Truncate(fragment, ivars->window_width);
-        return top;
+    uint32_t num_skipped = 0;
+    bool     found_edge  = false;
+
+    StrIter_Assign(iter, top);
+
+    for (uint32_t i = 0; i < max_skip; ++i) {
+        uint32_t code_point = StrIter_Next(iter);
+
+        if (code_point == STRITER_DONE || code_point == '.') {
+            found_edge = true;
+            StrIter_Assign(top, iter);
+            num_skipped = i + 1;
+            break;
+        }
+
+        if (word == NULL && StrHelp_is_whitespace(code_point)) {
+            word = (StringIterator*)StrIter_Clone(iter);
+            word_offset = i + 1;
+        }
     }
-    else {
-        int32_t top = best_location - ivars->slop;
-        ViewCB_Assign(fragment, (String*)field_val);
-        ViewCB_Nip(fragment, top);
-        top += ViewCB_Trim_Top(fragment);
-        int32_t chars_left = ViewCB_Truncate(fragment, ivars->excerpt_length);
-        int32_t overrun = ivars->excerpt_length - chars_left;
-
-        if (!overrun) {
-            // We've found an acceptable window.
-            ViewCB_Assign(fragment, (String*)field_val);
-            ViewCB_Nip(fragment, top);
-            top += ViewCB_Trim_Top(fragment);
-            ViewCB_Truncate(fragment, ivars->window_width);
-            return top;
+
+    // Try to use word boundary if no sentence boundary was found.
+    if (!found_edge && word != NULL) {
+        StrIter_Assign(top, word);
+        num_skipped = word_offset;
+    }
+
+    // Skip remaining whitespace.
+    num_skipped += StrIter_Skip_Next_Whitespace(top);
+    *num_skipped_ptr = num_skipped;
+
+    DECREF(word);
+    DECREF(iter);
+    return found_edge;
+}
+
+// Find an ending boundary before the current position given by the iterator.
+// Skip up to max_skip code points plus potential whitespace. Update the
+// iterator and return number of code points skipped. Return true if a
+// ending edge (sentence) was found.
+bool
+S_find_ending_boundary(StringIterator *tail, uint32_t max_skip,
+                       uint32_t *num_skipped_ptr) {
+    uint32_t code_point;
+
+    // Check if we're at an ending boundary already. Don't check for a word
+    // boundary because we need space for a trailing ellipsis.
+
+    StringIterator *iter = (StringIterator*)StrIter_Clone(tail);
+
+    do {
+        code_point = StrIter_Next(iter);
+
+        if (code_point == STRITER_DONE) {
+            // Skip remaining whitespace.
+            *num_skipped_ptr = StrIter_Skip_Prev_Whitespace(tail);
+            DECREF(iter);
+            return true;
         }
-        else if (overrun > top) {
-            // The field is very short, so make the whole field the
-            // "fragment".
-            ViewCB_Assign(fragment, (String*)field_val);
-            return ViewCB_Trim_Top(fragment);
+    } while (StrHelp_is_whitespace(code_point));
+
+    // Keep track of the first word boundary.
+    StringIterator *word = NULL;
+    uint32_t word_offset = 0;
+
+    StrIter_Assign(iter, tail);
+
+    for (uint32_t i = 0;
+         STRITER_DONE != (code_point = StrIter_Prev(iter));
+         ++i)
+    {
+        if (code_point == '.') {
+            StrIter_Assign(tail, iter);
+            StrIter_Advance(tail, 1); // Include period.
+            *num_skipped_ptr = i;
+            DECREF(word);
+            DECREF(iter);
+            return true;
         }
-        else {
-            // The fragment is too close to the end, so slide it back.
-            top -= overrun;
-            ViewCB_Assign(fragment, (String*)field_val);
-            ViewCB_Nip(fragment, top);
-            top += ViewCB_Trim_Top(fragment);
-            ViewCB_Truncate(fragment, ivars->excerpt_length);
-            return top;
+
+        if (StrHelp_is_whitespace(code_point)) {
+            if (word == NULL) {
+                word = (StringIterator*)StrIter_Clone(iter);
+                word_offset = i + 1;
+            }
+        }
+        else if (i >= max_skip) {
+            // Break only at non-whitespace to allow another sentence
+            // boundary to be found.
+            break;
         }
     }
-}
 
-// Return true if the window represented by "offset" and "length" overlaps a
-// score span, or if there are no score spans so that no excerpt is measurably
-// superior.
-static bool
-S_has_heat(HeatMap *heat_map, int32_t offset, int32_t length) {
-    VArray   *spans     = HeatMap_Get_Spans(heat_map);
-    uint32_t  num_spans = VA_Get_Size(spans);
-    int32_t   end       = offset + length;
-
-    if (length == 0)    { return false; }
-    if (num_spans == 0) { return true; }
-
-    for (uint32_t i = 0; i < num_spans; i++) {
-        Span *span  = (Span*)VA_Fetch(spans, i);
-        int32_t span_start = Span_Get_Offset(span);
-        int32_t span_end   = span_start + Span_Get_Length(span);;
-        if (offset >= span_start && offset <  span_end) { return true; }
-        if (end    >  span_start && end    <= span_end) { return true; }
-        if (offset <= span_start && end    >= span_end) { return true; }
-        if (span_start > end) { break; }
+    if (word == NULL) {
+        // Make space for ellipsis.
+        *num_skipped_ptr = StrIter_Recede(tail, 1);
+    }
+    else {
+        // Use word boundary if no sentence boundary was found.
+        StrIter_Assign(tail, word);
+
+        // Strip whitespace and punctuation that collides with an ellipsis.
+        while (STRITER_DONE != (code_point = StrIter_Prev(tail))) {
+            if (!StrHelp_is_whitespace(code_point)
+                && code_point != '.'
+                && code_point != ','
+                && code_point != ';'
+                && code_point != ':'
+                && code_point != ':'
+                && code_point != '?'
+                && code_point != '!'
+               ) {
+                StrIter_Advance(tail, 1); // Back up.
+                break;
+            }
+            ++word_offset;
+        }
+
+        *num_skipped_ptr = word_offset;
     }
 
+    DECREF(word);
+    DECREF(iter);
     return false;
 }
 
 String*
 Highlighter_Raw_Excerpt_IMP(Highlighter *self, const String *field_val,
-                            const String *fragment, int32_t *top_ptr,
-                            HeatMap *heat_map, VArray *sentences) {
+                            int32_t *start_ptr, HeatMap *heat_map) {
     HighlighterIVARS *const ivars = Highlighter_IVARS(self);
-    bool     found_starting_edge = false;
-    bool     found_ending_edge   = false;
-    int32_t  top   = *top_ptr;
-    int32_t  start = top;
-    int32_t  end   = 0;
-    double   field_len = Str_Length(field_val);
-    uint32_t min_len = field_len < ivars->excerpt_length * 0.6666
-                       ? (uint32_t)field_len
-                       : (uint32_t)(ivars->excerpt_length * 0.6666);
-
-    // Try to find a starting sentence boundary.
-    const uint32_t num_sentences = VA_Get_Size(sentences);
-    if (num_sentences) {
-        for (uint32_t i = 0; i < num_sentences; i++) {
-            Span *sentence = (Span*)VA_Fetch(sentences, i);
-            int32_t candidate = Span_Get_Offset(sentence);;
-
-            if (candidate > top + (int32_t)ivars->window_width) {
-                break;
-            }
-            else if (candidate >= top) {
-                // Try to start on the first sentence boundary, but only if
-                // there's enough relevant material left after it in the
-                // fragment.
-                StackString *temp = SSTR_WRAP(fragment);
-                SStr_Nip(temp, candidate - top);
-                uint32_t chars_left = SStr_Truncate(temp, ivars->excerpt_length);
-                if (chars_left >= min_len
-                    && S_has_heat(heat_map, candidate, chars_left)
-                   ) {
-                    start = candidate;
-                    found_starting_edge = true;
-                    break;
-                }
-            }
-        }
-    }
 
-    // Try to end on a sentence boundary (but don't try very hard).
-    if (num_sentences) {
-        StackString *start_trimmed = SSTR_WRAP(fragment);
-        SStr_Nip(start_trimmed, start - top);
+    // Find start of excerpt.
 
-        for (uint32_t i = num_sentences; i--;) {
-            Span    *sentence  = (Span*)VA_Fetch(sentences, i);
-            int32_t  last_edge = Span_Get_Offset(sentence)
-                                 + Span_Get_Length(sentence);
+    StringIterator *top = Str_Top(field_val);
 
-            if (last_edge <= start) {
-                break;
-            }
-            else if (last_edge - start > (int32_t)ivars->excerpt_length) {
-                continue;
-            }
-            else {
-                uint32_t chars_left = last_edge - start;
-                if (chars_left > min_len
-                    && S_has_heat(heat_map, start, chars_left)
-                   ) {
-                    found_ending_edge = true;
-                    end = last_edge;
-                    break;
-                }
-                else {
-                    StackString *temp = SSTR_WRAP((String*)start_trimmed);
-                    SStr_Nip(temp, chars_left);
-                    SStr_Trim_Tail(temp);
-                    if (SStr_Get_Size(temp) == 0) {
-                        // Short, but ending on a boundary already.
-                        found_ending_edge = true;
-                        end = last_edge;
-                        break;
-                    }
-                }
-            }
-        }
+    int32_t  best_location = S_hottest(heat_map);
+    int32_t  start;
+    uint32_t max_skip;
+
+    if (best_location <= ivars->slop) {
+        // If the beginning of the string falls within the window centered
+        // around the hottest point in the field, start the fragment at the
+        // beginning.
+        start    = 0;
+        max_skip = best_location;
     }
-    int32_t this_excerpt_len = found_ending_edge
-                               ? end - start
-                               : (int32_t)ivars->excerpt_length;
-    if (!this_excerpt_len) {
-        *top_ptr = start;
-        return Str_new(0);
+    else {
+        start    = best_location - ivars->slop;
+        max_skip = ivars->slop;
+        StrIter_Advance(top, start);
     }
 
-    StackString *substring = SSTR_WRAP((String*)field_val);
+    uint32_t num_skipped;
+    bool found_starting_edge
+        = S_find_starting_boundary(top, max_skip, &num_skipped);
+    start += num_skipped;
+
+    // Find end of excerpt.
 
-    if (found_starting_edge) {
-        SStr_Nip(substring, start);
-        SStr_Truncate(substring, this_excerpt_len);
+    StringIterator *tail = (StringIterator*)StrIter_Clone(top);
+
+    uint32_t max_len = ivars->excerpt_length;
+    if (!found_starting_edge) {
+        // Leave space for starting ellipsis and space character.
+        max_len -= 2;
     }
-    // If not starting on a sentence boundary, prepend an ellipsis.
-    else {
-        const size_t ELLIPSIS_LEN = 2; // Unicode ellipsis plus a space.
-
-        // If the excerpt is already shorter than the spec'd length, we might
-        // not need to make room.
-        this_excerpt_len += ELLIPSIS_LEN;
-
-        // Remember original position
-        int32_t orig_start = start;
-        int32_t orig_len   = this_excerpt_len;
-
-        // Move the start back one in case the character right before the
-        // excerpt starts is whitespace.
-        if (start) {
-            this_excerpt_len += 1;
-            start -= 1;
-            SStr_Nip(substring, start);
-        }
 
-        do {
-            uint32_t code_point = SStr_Nibble(substring);
-            start++;
-            this_excerpt_len--;
+    bool found_ending_edge = true;
+    uint32_t excerpt_len = StrIter_Advance(tail, max_len);
 
-            if (StrHelp_is_whitespace(code_point)) {
-                if (!found_ending_edge) {
-                    // If we still need room, we'll lop it off the end since
-                    // we don't know a solid end point yet.
-                    break;
-                }
-                else if (this_excerpt_len <= (int32_t)ivars->excerpt_length) {
-                    break;
-                }
-            }
-        } while (SStr_Get_Size(substring));
-
-        if (SStr_Get_Size(substring) == 0) {
-            // Word is longer than excerpt_length. Reset to original position
-            // truncating the word.
-            SStr_Assign(substring, (String*)field_val);
-            start            = orig_start;
-            this_excerpt_len = orig_len;
-            int32_t diff = this_excerpt_len - ivars->excerpt_length;
-            if (diff > 0) {
-                SStr_Nip(substring, diff);
-                start            += diff;
-                this_excerpt_len -= diff;
-            }
+    // Skip up to slop code points but keep at least max_len - slop.
+    if (excerpt_len > max_len - ivars->slop) {
+        max_skip = excerpt_len - (max_len - ivars->slop);
+        found_ending_edge
+            = S_find_ending_boundary(tail, max_skip, &num_skipped);
+        if (num_skipped >= excerpt_len) {
+            excerpt_len = 0;
+        }
+        else {
+            excerpt_len -= num_skipped;
         }
-
-        SStr_Truncate(substring, ivars->excerpt_length - ELLIPSIS_LEN);
     }
 
-    // If excerpt doesn't end on a sentence boundary, tack on an ellipsis.
-    if (found_ending_edge) {
-        SStr_Truncate(substring, end - start);
-        SStr_Trim_Tail(substring);
+    // Extract excerpt.
+
+    String *raw_excerpt;
+
+    if (!excerpt_len) {
+        raw_excerpt = Str_new(0);
     }
     else {
-        // Remember original excerpt
-        StackString *orig_substring = SSTR_WRAP((String*)substring);
-        // Check for prepended ellipsis
-        uint32_t min_size = found_starting_edge ? 0 : 4;
-
-        do {
-            uint32_t code_point = SStr_Code_Point_From(substring, 1);
-            SStr_Chop(substring, 1);
-            if (StrHelp_is_whitespace(code_point)) {
-                SStr_Trim_Tail(substring);
-
-                // Strip punctuation that collides with an ellipsis.
-                code_point = SStr_Code_Point_From(substring, 1);
-                while (code_point == '.'
-                       || code_point == ','
-                       || code_point == ';'
-                       || code_point == ':'
-                       || code_point == ':'
-                       || code_point == '?'
-                       || code_point == '!'
-                      ) {
-                    SStr_Chop(substring, 1);
-                    code_point = SStr_Code_Point_From(substring, 1);
-                }
+        String  *substring = StrIter_substring(top, tail);
+        CharBuf *buf       = CB_new(Str_Get_Size(substring) + 8);
+
+        // If not starting on a sentence boundary, prepend an ellipsis.
+        if (!found_starting_edge) {
+            CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
+            CB_Cat_Char(buf, ' ');
+            start -= 2;
+        }
 
-                break;
-            }
-        } while (SStr_Get_Size(substring) > min_size);
+        CB_Cat(buf, substring);
 
-        if (SStr_Get_Size(substring) == min_size) {
-            // Word is longer than excerpt_length. Reset to original excerpt
-            // truncating the word.
-            SStr_Assign(substring, (String*)orig_substring);
-            SStr_Chop(substring, 1);
+        // If not ending on a sentence boundary, append an ellipsis.
+        if (!found_ending_edge) {
+            CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
         }
-    }
 
-    CharBuf *buf = CB_new(SStr_Get_Size(substring) + 8);
+        raw_excerpt = CB_Yield_String(buf);
 
-    if (!found_starting_edge) {
-        CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
-        CB_Cat_Char(buf, ' ');
-        const size_t ELLIPSIS_LEN = 2; // Unicode ellipsis plus a space.
-        start -= ELLIPSIS_LEN;
+        DECREF(buf);
+        DECREF(substring);
     }
 
-    CB_Cat(buf, (String*)substring);
-
-    if (!found_ending_edge) {
-        CB_Cat_Char(buf, ELLIPSIS_CODE_POINT);
-    }
+    *start_ptr = start;
 
-    String *raw_excerpt = CB_Yield_String(buf);
-    DECREF(buf);
-    *top_ptr = start;
+    DECREF(top);
+    DECREF(tail);
     return raw_excerpt;
 }
 
@@ -591,105 +547,6 @@ Highlighter_Highlight_Excerpt_IMP(Highlighter *self, VArray *spans,
     return highlighted;
 }
 
-static Span*
-S_start_sentence(int32_t pos) {
-    return Span_new(pos, 0, 0.0);
-}
-
-static void
-S_close_sentence(VArray *sentences, Span **sentence_ptr,
-                 int32_t sentence_end) {
-    Span *sentence = *sentence_ptr;
-    int32_t length = sentence_end - Span_Get_Offset(sentence);
-    const int32_t MIN_SENTENCE_LENGTH = 3; // e.g. "OK.", but not "2."
-    if (length >= MIN_SENTENCE_LENGTH) {
-        Span_Set_Length(sentence, length);
-        VA_Push(sentences, (Obj*)sentence);
-        *sentence_ptr = NULL;
-    }
-}
-
-VArray*
-Highlighter_Find_Sentences_IMP(Highlighter *self, String *text,
-                               int32_t offset, int32_t length) {
-    /* When [sentence] is NULL, that means a sentence start has not yet been
-     * found.  When it is a Span object, we have a start, but we haven't found
-     * an end.  Once we find the end, we add the sentence to the [sentences]
-     * array and set [sentence] back to NULL to indicate that we're looking
-     * for a start once more.
-     */
-    Span    *sentence       = NULL;
-    VArray  *sentences      = VA_new(10);
-    int32_t  stop           = length == 0
-                              ? INT32_MAX
-                              : offset + length;
-    StackString *fragment = SSTR_WRAP(text);
-    int32_t  pos            = SStr_Trim_Top(fragment);
-    UNUSED_VAR(self);
-
-    /* Our first task will be to find a sentence that either starts at the top
-     * of the fragment, or overlaps its start. Starting at the top of the
-     * field is a special case: we define the first non-whitespace character
-     * to begin a sentence, rather than look for the first character following
-     * a period and whitespace.  Everywhere else, we have to define sentence
-     * starts based on a sentence end that has just passed by.
-     */
-    if (offset <= pos) {
-        // Assume that first non-whitespace character begins a sentence.
-        if (pos < stop && SStr_Get_Size(fragment) > 0) {
-            sentence = S_start_sentence(pos);
-        }
-    }
-    else {
-        SStr_Nip(fragment, offset - pos);
-        pos = offset;
-    }
-
-    while (1) {
-        uint32_t code_point = SStr_Code_Point_At(fragment, 0);
-        if (!code_point) {
-            // End of fragment.  If we have a sentence open, close it,
-            // then bail.
-            if (sentence) { S_close_sentence(sentences, &sentence, pos); }
-            break;
-        }
-        else if (code_point == '.') {
-            uint32_t whitespace_count;
-            pos += SStr_Nip(fragment, 1); // advance past "."
-
-            if (pos == stop && SStr_Get_Size(fragment) == 0) {
-                // Period ending the field string.
-                if (sentence) { S_close_sentence(sentences, &sentence, pos); }
-                break;
-            }
-            else if (0 != (whitespace_count = SStr_Trim_Top(fragment))) {
-                // We've found a period followed by whitespace.  Close out the
-                // existing sentence, if there is one. */
-                if (sentence) { S_close_sentence(sentences, &sentence, pos); }
-
-                // Advance past whitespace.
-                pos += whitespace_count;
-                if (pos < stop && SStr_Get_Size(fragment) > 0) {
-                    // Not at the end of the string? Then we've found a
-                    // sentence start.
-                    sentence = S_start_sentence(pos);
-                }
-            }
-
-            // We may not have reached the end of the field yet, but it's
-            // entirely possible that our last sentence overlapped the end of
-            // the fragment -- in which case, it's time to bail.
-            if (pos >= stop) { break; }
-        }
-        else {
-            SStr_Nip(fragment, 1);
-            pos++;
-        }
-    }
-
-    return sentences;
-}
-
 String*
 Highlighter_Encode_IMP(Highlighter *self, String *text) {
     UNUSED_VAR(self);

http://git-wip-us.apache.org/repos/asf/lucy/blob/1f51cae0/core/Lucy/Highlight/Highlighter.cfh
----------------------------------------------------------------------
diff --git a/core/Lucy/Highlight/Highlighter.cfh b/core/Lucy/Highlight/Highlighter.cfh
index 79742ad..afc9a8e 100644
--- a/core/Lucy/Highlight/Highlighter.cfh
+++ b/core/Lucy/Highlight/Highlighter.cfh
@@ -29,7 +29,6 @@ public class Lucy::Highlight::Highlighter inherits Clownfish::Obj {
     Query      *query;
     String     *field;
     uint32_t    excerpt_length;
-    uint32_t    window_width;
     uint32_t    slop;
     String     *pre_tag;
     String     *post_tag;
@@ -67,23 +66,6 @@ public class Lucy::Highlight::Highlighter inherits Clownfish::Obj {
     public incremented String*
     Encode(Highlighter *self, String *text);
 
-    /** Find sentence boundaries within the specified range, returning them as
-     * an array of Spans.  The "offset" of each Span indicates the start of
-     * the sentence, and is measured from 0, not from <code>offset</code>.
-     * The Span's "length" member indicates the sentence length in code
-     * points.
-     *
-     * @param text The string to scan.
-     * @param offset The place to start looking for offsets, measured in
-     * Unicode code points from the top of <code>text</code>.
-     * @param length The number of code points from <code>offset</code> to
-     * scan. The default value of 0 is a sentinel which indicates to scan
-     * until the end of the string.
-     */
-    incremented VArray*
-    Find_Sentences(Highlighter *self, String *text, int32_t offset = 0,
-                   int32_t length = 0);
-
     /** Highlight a small section of text.  By default, prepends pre-tag and
      * appends post-tag.  This method is called internally by Create_Excerpt()
      * when assembling an excerpt.
@@ -138,25 +120,15 @@ public class Lucy::Highlight::Highlighter inherits Clownfish::Obj {
     Get_Compiler(Highlighter *self);
 
     /** Decide based on heat map the best fragment of field to concentrate on.
-     * Place the result into <code>fragment<code> and return its offset in
-     * code points from the top of the field.
-     *
-     * (Helper function for Create_Excerpt only exposed for testing purposes.)
-     */
-    int32_t
-    Find_Best_Fragment(Highlighter *self, const String *field_val,
-                       ViewCharBuf *fragment, HeatMap *heat_map);
-
-    /** Take the fragment and determine the best edges for it based on
+     * Take the fragment and determine the best edges for it based on
      * sentence boundaries when possible.  Add ellipses when boundaries cannot
      * be found.
      *
      * (Helper function for Create_Excerpt only exposed for testing purposes.)
      */
     String*
-    Raw_Excerpt(Highlighter *self, const String *field_val,
-                const String *fragment, int32_t *top, HeatMap *heat_map,
-                VArray *sentences);
+    Raw_Excerpt(Highlighter *self, const String *field_value, int32_t *top,
+                HeatMap *heat_map);
 
     /** Take the text in raw_excerpt, add highlight tags, encode, and place
      * the result into <code>highlighted</code>.

http://git-wip-us.apache.org/repos/asf/lucy/blob/1f51cae0/core/Lucy/Test/Highlight/TestHighlighter.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Highlight/TestHighlighter.c b/core/Lucy/Test/Highlight/TestHighlighter.c
index 111d655..f8fabc6 100644
--- a/core/Lucy/Test/Highlight/TestHighlighter.c
+++ b/core/Lucy/Test/Highlight/TestHighlighter.c
@@ -58,69 +58,6 @@ TestHighlighter_new() {
 }
 
 static void
-test_Find_Best_Fragment(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
-    String *content = (String*)SSTR_WRAP_STR("content", 7);
-    Highlighter *highlighter = Highlighter_new(searcher, query, content, 3);
-    ViewCharBuf *target = (ViewCharBuf*)SStr_BLANK();
-
-    VArray *spans = VA_new(1);
-    VA_Push(spans, (Obj*)Span_new(2, 1, 1.0f));
-    HeatMap *heat_map = HeatMap_new(spans, 133);
-    DECREF(spans);
-    String *field_val = (String *)SSTR_WRAP_STR("a " PHI " " PHI " b c", 11);
-    int32_t top = Highlighter_Find_Best_Fragment(highlighter, field_val,
-                                                 target, heat_map);
-    TEST_TRUE(runner,
-              Str_Equals_Str((String *)target, PHI " " PHI " b", 7),
-              "Find_Best_Fragment");
-    TEST_TRUE(runner,
-              top == 2,
-              "correct offset returned by Find_Best_Fragment");
-    field_val = (String *)SSTR_WRAP_STR("aa" PHI, 4);
-    top = Highlighter_Find_Best_Fragment(highlighter, field_val,
-                                         target, heat_map);
-    TEST_TRUE(runner,
-              Str_Equals_Str((String *)target, "aa" PHI, 4),
-              "Find_Best_Fragment returns whole field when field is short");
-    TEST_TRUE(runner,
-              top == 0,
-              "correct offset");
-    DECREF(heat_map);
-
-    spans = VA_new(1);
-    VA_Push(spans, (Obj*)Span_new(6, 2, 1.0f));
-    heat_map = HeatMap_new(spans, 133);
-    DECREF(spans);
-    field_val = (String *)SSTR_WRAP_STR("aaaab" PHI PHI, 9);
-    top = Highlighter_Find_Best_Fragment(highlighter, field_val,
-                                         target, heat_map);
-    TEST_TRUE(runner,
-              Str_Equals_Str((String *)target, "b" PHI PHI, 5),
-              "Find_Best_Fragment shifts left to deal with overrun");
-    TEST_TRUE(runner,
-              top == 4,
-              "correct offset");
-    DECREF(heat_map);
-
-    spans = VA_new(1);
-    VA_Push(spans, (Obj*)Span_new(0, 1, 1.0f));
-    heat_map = HeatMap_new(spans, 133);
-    DECREF(spans);
-    field_val = (String *)SSTR_WRAP_STR("a" PHI "bcde", 7);
-    top = Highlighter_Find_Best_Fragment(highlighter, field_val,
-                                         target, heat_map);
-    TEST_TRUE(runner,
-              Str_Equals_Str((String *)target, "a" PHI "bcd", 6),
-              "Find_Best_Fragment start at field beginning");
-    TEST_TRUE(runner,
-              top == 0,
-              "correct offset");
-    DECREF(heat_map);
-
-    DECREF(highlighter);
-}
-
-static void
 test_Raw_Excerpt(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
     String *content = (String*)SSTR_WRAP_STR("content", 7);
     Highlighter *highlighter = Highlighter_new(searcher, query, content, 6);
@@ -128,119 +65,96 @@ test_Raw_Excerpt(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
     String *raw_excerpt;
 
     String *field_val = (String *)SSTR_WRAP_STR("Ook.  Urk.  Ick.  ", 18);
-    String *fragment  = (String *)SSTR_WRAP_STR("Ook.  Urk.", 10);
     VArray *spans = VA_new(1);
     VA_Push(spans, (Obj*)Span_new(0, 18, 1.0f));
     HeatMap *heat_map = HeatMap_new(spans, 133);
     DECREF(spans);
-    VArray *sentences = VA_new(2);
-    VA_Push(sentences, (Obj*)Span_new(0, 4, 0.0f));
-    VA_Push(sentences, (Obj*)Span_new(6, 4, 0.0f));
-    top = 0;
-    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
-                                          &top, heat_map, sentences);
+    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+                                          heat_map);
     TEST_TRUE(runner,
               Str_Equals_Str(raw_excerpt, "Ook.", 4),
-              "Raw_Excerpt at top");
+              "Raw_Excerpt at top %s", Str_Get_Ptr8(raw_excerpt));
     TEST_TRUE(runner,
               top == 0,
-              "top still 0");
-    DECREF(sentences);
+              "top is 0");
     DECREF(raw_excerpt);
+    DECREF(heat_map);
 
-    fragment    = (String *)SSTR_WRAP_STR(".  Urk.  I", 10);
-    sentences   = VA_new(2);
-    VA_Push(sentences, (Obj*)Span_new(6, 4, 0.0f));
-    VA_Push(sentences, (Obj*)Span_new(12, 4, 0.0f));
-    top = 3;
-    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
-                                          &top, heat_map, sentences);
+    spans = VA_new(1);
+    VA_Push(spans, (Obj*)Span_new(6, 12, 1.0f));
+    heat_map = HeatMap_new(spans, 133);
+    DECREF(spans);
+    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+                                          heat_map);
     TEST_TRUE(runner,
               Str_Equals_Str(raw_excerpt, "Urk.", 4),
               "Raw_Excerpt in middle, with 2 bounds");
     TEST_TRUE(runner,
               top == 6,
               "top in the middle modified by Raw_Excerpt");
-    DECREF(sentences);
-    DECREF(heat_map);
     DECREF(raw_excerpt);
+    DECREF(heat_map);
 
-    field_val   = (String *)SSTR_WRAP_STR("Ook urk ick i.", 14);
-    fragment    = (String *)SSTR_WRAP_STR("ick i.", 6);
-    spans       = VA_new(1);
-    VA_Push(spans, (Obj*)Span_new(0, 14, 1.0f));
+    field_val = (String *)SSTR_WRAP_STR("Ook urk ick i.", 14);
+    spans     = VA_new(1);
+    VA_Push(spans, (Obj*)Span_new(12, 1, 1.0f));
     heat_map = HeatMap_new(spans, 133);
     DECREF(spans);
-    sentences = VA_new(1);
-    VA_Push(sentences, (Obj*)Span_new(0, 14, 0.0f));
-    top = 8;
-    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
-                                          &top, heat_map, sentences);
+    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+                                          heat_map);
     TEST_TRUE(runner,
               Str_Equals_Str(raw_excerpt, ELLIPSIS " i.", 6),
               "Ellipsis at top");
     TEST_TRUE(runner,
               top == 10,
               "top correct when leading ellipsis inserted");
-    DECREF(sentences);
     DECREF(heat_map);
     DECREF(raw_excerpt);
 
-    field_val   = (String *)SSTR_WRAP_STR("Urk.  Iz no good.", 17);
-    fragment    = (String *)SSTR_WRAP_STR("  Iz no go", 10);
-    spans       = VA_new(1);
-    VA_Push(spans, (Obj*)Span_new(0, 17, 1.0f));
+    field_val = (String *)SSTR_WRAP_STR("Urk.  Iz no good.", 17);
+    spans     = VA_new(1);
+    VA_Push(spans, (Obj*)Span_new(6, 2, 1.0f));
     heat_map = HeatMap_new(spans, 133);
     DECREF(spans);
-    sentences = VA_new(1);
-    VA_Push(sentences, (Obj*)Span_new(6, 11, 0.0f));
-    top = 4;
-    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, fragment,
-                                          &top, heat_map, sentences);
+    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+                                          heat_map);
     TEST_TRUE(runner,
               Str_Equals_Str(raw_excerpt, "Iz no" ELLIPSIS, 8),
               "Ellipsis at end");
     TEST_TRUE(runner,
               top == 6,
               "top trimmed");
-    DECREF(sentences);
     DECREF(heat_map);
     DECREF(raw_excerpt);
 
     // Words longer than excerpt len
 
-    field_val   = (String *)SSTR_WRAP_STR("abc/def/ghi/jkl/mno", 19);
-    sentences = VA_new(1);
-    VA_Push(sentences, (Obj*)Span_new(0, 19, 0.0f));
+    field_val = (String *)SSTR_WRAP_STR("abc/def/ghi/jkl/mno", 19);
 
-    spans       = VA_new(1);
+    spans = VA_new(1);
     VA_Push(spans, (Obj*)Span_new(0, 3, 1.0f));
     heat_map = HeatMap_new(spans, 133);
     DECREF(spans);
-    top = 0;
-    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, field_val,
-                                          &top, heat_map, sentences);
+    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+                                          heat_map);
     TEST_TRUE(runner,
               Str_Equals_Str(raw_excerpt, "abc/d" ELLIPSIS, 8),
-              "Long word");
+              "Long word at top %s");
     DECREF(heat_map);
     DECREF(raw_excerpt);
 
-    spans       = VA_new(1);
+    spans = VA_new(1);
     VA_Push(spans, (Obj*)Span_new(8, 3, 1.0f));
     heat_map = HeatMap_new(spans, 133);
     DECREF(spans);
-    top = 0;
-    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, field_val,
-                                          &top, heat_map, sentences);
+    raw_excerpt = Highlighter_Raw_Excerpt(highlighter, field_val, &top,
+                                          heat_map);
     TEST_TRUE(runner,
-              Str_Equals_Str(raw_excerpt, ELLIPSIS " c/d" ELLIPSIS, 10),
-              "Long word");
+              Str_Equals_Str(raw_excerpt, ELLIPSIS " f/g" ELLIPSIS, 10),
+              "Long word in middle");
     DECREF(heat_map);
     DECREF(raw_excerpt);
 
-    DECREF(sentences);
-
     DECREF(highlighter);
 }
 
@@ -407,58 +321,6 @@ test_Create_Excerpt(TestBatchRunner *runner, Searcher *searcher, Obj *query,
 }
 
 static void
-test_Find_Sentences(TestBatchRunner *runner, Searcher *searcher, Obj *query) {
-    String *content = (String*)SSTR_WRAP_STR("content", 7);
-    Highlighter *highlighter = Highlighter_new(searcher, query, content, 200);
-    String *text = (String*)SSTR_WRAP_STR(
-                        "This is a sentence. This is a sentence. This is a sentence. "
-                        "This is a sentence. This is a sentence. This is a sentence. "
-                        "This is a sentence. This is a sentence. This is a sentence. "
-                        "This is a sentence. This is a sentence. This is a sentence. "
-                        "This is a sentence. This is a sentence. This is a sentence. ",
-                        300);
-
-    VArray *got = Highlighter_Find_Sentences(highlighter, text, 101, 50);
-    VArray *wanted = VA_new(2);
-    VA_Push(wanted, (Obj*)Span_new(120, 19, 0.0f));
-    VA_Push(wanted, (Obj*)Span_new(140, 19, 0.0f));
-    TEST_TRUE(runner,
-              VA_Equals(got, (Obj*)wanted),
-              "find_sentences with explicit args");
-    DECREF(wanted);
-    DECREF(got);
-
-    got = Highlighter_Find_Sentences(highlighter, text, 101, 4);
-    TEST_TRUE(runner,
-              VA_Get_Size(got) == 0,
-              "find_sentences with explicit args, finding nothing");
-    DECREF(got);
-
-    got = Highlighter_Find_Sentences(highlighter, text, 0, 0);
-    wanted = VA_new(15);
-    for (int i = 0; i < 15; ++i) {
-        VA_Push(wanted, (Obj*)Span_new(i * 20, 19, 0.0f));
-    }
-    TEST_TRUE(runner,
-              VA_Equals(got, (Obj*)wanted),
-              "find_sentences with default offset and length");
-    DECREF(wanted);
-    DECREF(got);
-
-    text = (String*)SSTR_WRAP_STR(" Foo", 4);
-    got = Highlighter_Find_Sentences(highlighter, text, 0, 0);
-    wanted = VA_new(1);
-    VA_Push(wanted, (Obj*)Span_new(1, 3, 0.0f));
-    TEST_TRUE(runner,
-              VA_Equals(got, (Obj*)wanted),
-              "Skip leading whitespace but get first sentence");
-    DECREF(wanted);
-    DECREF(got);
-
-    DECREF(highlighter);
-}
-
-static void
 test_highlighting(TestBatchRunner *runner) {
     Schema *schema = Schema_new();
     StandardTokenizer *tokenizer = StandardTokenizer_new();
@@ -507,11 +369,9 @@ test_highlighting(TestBatchRunner *runner) {
     Obj *query = (Obj*)SSTR_WRAP_STR("\"x y z\" AND " PHI, 14);
     Hits *hits = Searcher_Hits(searcher, query, 0, 10, NULL);
 
-    test_Find_Best_Fragment(runner, searcher, query);
     test_Raw_Excerpt(runner, searcher, query);
     test_Highlight_Excerpt(runner, searcher, query);
     test_Create_Excerpt(runner, searcher, query, hits);
-    test_Find_Sentences(runner, searcher, query);
 
     DECREF(hits);
     DECREF(searcher);
@@ -578,7 +438,7 @@ test_hl_selection(TestBatchRunner *runner) {
 
 void
 TestHighlighter_Run_IMP(TestHighlighter *self, TestBatchRunner *runner) {
-    TestBatchRunner_Plan(runner, (TestBatch*)self, 35);
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 23);
     test_highlighting(runner);
     test_hl_selection(runner);
 }


Mime
View raw message