lucy-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nwelln...@apache.org
Subject [09/15] lucy-clownfish git commit: Rework Str_Find interface
Date Fri, 30 Oct 2015 14:30:44 GMT
Rework Str_Find interface

Make Str_Find return a StringIterator. Add Str_Contains.

Optimize substring search to use memchr. Unfortunately, memmem isn't
widely available.


Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/2f6b2393
Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/2f6b2393
Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/2f6b2393

Branch: refs/heads/master
Commit: 2f6b2393c84402b296b3081265f64c350529e96f
Parents: f126bf3
Author: Nick Wellnhofer <wellnhofer@aevum.de>
Authored: Sat Oct 24 15:14:59 2015 +0200
Committer: Nick Wellnhofer <wellnhofer@aevum.de>
Committed: Wed Oct 28 16:10:35 2015 +0100

----------------------------------------------------------------------
 runtime/core/Clownfish/String.c          | 45 ++++++++++++++++++++-------
 runtime/core/Clownfish/String.cfh        | 23 +++++++++++---
 runtime/core/Clownfish/Test/TestObj.c    |  4 +--
 runtime/core/Clownfish/Test/TestString.c | 37 ++++++++++++++++------
 runtime/go/clownfish/string_test.go      | 26 ++++++++++++----
 5 files changed, 101 insertions(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/String.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c
index f4732bd..090bda6 100644
--- a/runtime/core/Clownfish/String.c
+++ b/runtime/core/Clownfish/String.c
@@ -44,6 +44,9 @@ static void
 S_die_invalid_utf8(const char *text, size_t size, const char *file, int line,
                    const char *func);
 
+static const char*
+S_memmem(String *self, const char *substring, size_t size);
+
 static StringIterator*
 S_new_stack_iter(void *allocation, String *string, size_t byte_offset);
 
@@ -388,25 +391,43 @@ Str_Ends_With_Utf8_IMP(String *self, const char *suffix, size_t suffix_len)
{
     return false;
 }
 
-int64_t
+bool
+Str_Contains_IMP(String *self, String *substring) {
+    return !!S_memmem(self, substring->ptr, substring->size);
+}
+
+bool
+Str_Contains_Utf8_IMP(String *self, const char *substring, size_t size) {
+    return !!S_memmem(self, substring, size);
+}
+
+StringIterator*
 Str_Find_IMP(String *self, String *substring) {
     return Str_Find_Utf8(self, substring->ptr, substring->size);
 }
 
-int64_t
-Str_Find_Utf8_IMP(String *self, const char *ptr, size_t size) {
-    StringIterator *iter = STACK_ITER(self, 0);
-    int64_t location = 0;
+StringIterator*
+Str_Find_Utf8_IMP(String *self, const char *substring, size_t size) {
+    const char *ptr = S_memmem(self, substring, size);
+    return ptr ? StrIter_new(self, ptr - self->ptr) : NULL;
+}
 
-    while (iter->byte_offset + size <= self->size) {
-        if (memcmp(self->ptr + iter->byte_offset, ptr, size) == 0) {
-            return location;
-        }
-        StrIter_Advance(iter, 1);
-        location++;
+static const char*
+S_memmem(String *self, const char *substring, size_t size) {
+    if (size == 0)         { return self->ptr; }
+    if (size > self->size) { return NULL;      }
+
+    const char *ptr = self->ptr;
+    const char *end = ptr + self->size - size + 1;
+    char first_char = substring[0];
+
+    // Naive string search.
+    while (NULL != (ptr = (const char*)memchr(ptr, first_char, end - ptr))) {
+        if (memcmp(ptr, substring, size) == 0) { break; }
+        ptr++;
     }
 
-    return -1;
+    return ptr;
 }
 
 String*

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/String.cfh
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.cfh b/runtime/core/Clownfish/String.cfh
index b54f80d..f675aff 100644
--- a/runtime/core/Clownfish/String.cfh
+++ b/runtime/core/Clownfish/String.cfh
@@ -158,13 +158,28 @@ public final class Clownfish::String nickname Str
     public bool
     Ends_With_Utf8(String *self, const char *suffix, size_t size);
 
-    /** Return the location of the substring within the String (measured in
-     * code points), or -1 if the substring does not match.
+    /** Test whether the String contains `substring`.
      */
-    public int64_t
+    public bool
+    Contains(String *self, String *substring);
+
+    /** Test whether the String contains `substring`.
+     */
+    public bool
+    Contains_Utf8(String *self, const char *ptr, size_t size);
+
+    /** Return a [](StringIterator) pointing to the first occurrence of the
+     * substring within the String, or [](@null) if the substring does not
+     * match.
+     */
+    public incremented nullable StringIterator*
     Find(String *self, String *substring);
 
-    public int64_t
+    /** Return a [](StringIterator) pointing to the first occurrence of the
+     * substring within the String, or [](@null) if the substring does not
+     * match.
+     */
+    public incremented nullable StringIterator*
     Find_Utf8(String *self, const char *ptr, size_t size);
 
     /** Test whether the String matches the supplied UTF-8 character data.

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/Test/TestObj.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Test/TestObj.c b/runtime/core/Clownfish/Test/TestObj.c
index 1bac642..b37ad71 100644
--- a/runtime/core/Clownfish/Test/TestObj.c
+++ b/runtime/core/Clownfish/Test/TestObj.c
@@ -66,7 +66,7 @@ static void
 test_To_String(TestBatchRunner *runner) {
     Obj *testobj = S_new_testobj();
     String *string = Obj_To_String(testobj);
-    TEST_TRUE(runner, Str_Find_Utf8(string, "TestObj", 7) >= 0, "To_String");
+    TEST_TRUE(runner, Str_Contains_Utf8(string, "TestObj", 7), "To_String");
     DECREF(string);
     DECREF(testobj);
 }
@@ -123,7 +123,7 @@ S_verify_abstract_error(TestBatchRunner *runner, Err_Attempt_t routine,
     Err *error = Err_trap(routine, context);
     TEST_TRUE(runner, error != NULL
               && Err_is_a(error, ERR)
-              && Str_Find_Utf8(Err_Get_Mess(error), "bstract", 7) != -1,
+              && Str_Contains_Utf8(Err_Get_Mess(error), "bstract", 7),
               message);
     DECREF(error);
 }

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/core/Clownfish/Test/TestString.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Test/TestString.c b/runtime/core/Clownfish/Test/TestString.c
index 629763d..291aee8 100644
--- a/runtime/core/Clownfish/Test/TestString.c
+++ b/runtime/core/Clownfish/Test/TestString.c
@@ -150,28 +150,45 @@ test_Clone(TestBatchRunner *runner) {
     DECREF(wanted);
 }
 
+static int64_t
+S_find(String *string, String *substring) {
+    StringIterator *iter = Str_Find(string, substring);
+    if (iter == NULL) { return -1; }
+    size_t tick = StrIter_Recede(iter, SIZE_MAX);
+    DECREF(iter);
+    return (int64_t)tick;
+}
+
 static void
-test_Find(TestBatchRunner *runner) {
+test_Contains_and_Find(TestBatchRunner *runner) {
     String *string;
     String *substring = S_get_str("foo");
 
     string = S_get_str("");
-    TEST_TRUE(runner, Str_Find(string, substring) == -1, "Not in empty string");
+    TEST_FALSE(runner, Str_Contains(string, substring),
+               "Not contained in empty string");
+    TEST_INT_EQ(runner, S_find(string, substring), -1,
+                "Not found in empty string");
     DECREF(string);
 
     string = S_get_str("foo");
-    TEST_TRUE(runner, Str_Find(string, substring) == 0, "Find complete string");
+    TEST_TRUE(runner, Str_Contains(string, substring),
+              "Contained in complete string");
+    TEST_INT_EQ(runner, S_find(string, substring), 0, "Find complete string");
     DECREF(string);
 
     string = S_get_str("afoo");
-    TEST_TRUE(runner, Str_Find(string, substring) == 1, "Find after first");
-    // TODO: Enable this test when we have real substrings.
-    /*Str_Set_Size(string, 3);
-    TEST_TRUE(runner, Str_Find(string, substring) == -1, "Don't overrun");*/
+    TEST_TRUE(runner, Str_Contains(string, substring),
+              "Contained after first");
+    TEST_INT_EQ(runner, S_find(string, substring), 1, "Find after first");
+    String *prefix = Str_SubString(string, 0, 3);
+    TEST_FALSE(runner, Str_Contains(prefix, substring), "Don't overrun");
+    DECREF(prefix);
     DECREF(string);
 
     string = S_get_str("afood");
-    TEST_TRUE(runner, Str_Find(string, substring) == 1, "Find in middle");
+    TEST_TRUE(runner, Str_Contains(string, substring), "Contained in middle");
+    TEST_INT_EQ(runner, S_find(string, substring), 1, "Find in middle");
     DECREF(string);
 
     DECREF(substring);
@@ -676,12 +693,12 @@ test_iterator_substring(TestBatchRunner *runner) {
 
 void
 TestStr_Run_IMP(TestString *self, TestBatchRunner *runner) {
-    TestBatchRunner_Plan(runner, (TestBatch*)self, 133);
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 138);
     test_new(runner);
     test_Cat(runner);
     test_Clone(runner);
     test_Code_Point_At_and_From(runner);
-    test_Find(runner);
+    test_Contains_and_Find(runner);
     test_SubString(runner);
     test_Trim(runner);
     test_To_F64(runner);

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/2f6b2393/runtime/go/clownfish/string_test.go
----------------------------------------------------------------------
diff --git a/runtime/go/clownfish/string_test.go b/runtime/go/clownfish/string_test.go
index 6f8c641..fb67aae 100644
--- a/runtime/go/clownfish/string_test.go
+++ b/runtime/go/clownfish/string_test.go
@@ -63,15 +63,29 @@ func TestStringBaseXToI64(t *testing.T) {
 	}
 }
 
+func TestStringContains(t *testing.T) {
+	s := NewString("foobarbaz")
+	if !s.Contains("bar") {
+		t.Error("Contains yes")
+	}
+	if s.Contains("banana") {
+		t.Error("Contains no")
+	}
+}
+
 func TestStringFind(t *testing.T) {
 	s := NewString("foobarbaz")
-	var got int64 = s.Find("bar")
-	if got != 3 {
-		t.Error("Find yes", got)
+	iter := s.Find("bar")
+	var pos int = -1
+	if iter != nil {
+	    pos = int(iter.Recede(100))
+	}
+	if pos != 3 {
+		t.Error("Find yes", pos)
 	}
-	got = s.Find("banana")
-	if got != -1 {
-		t.Error("Find no", got)
+	iter = s.Find("banana")
+	if iter != nil {
+		t.Error("Find no")
 	}
 }
 


Mime
View raw message