lucy-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mar...@apache.org
Subject [1/9] lucy git commit: Copy C host-specific code to Go.
Date Mon, 03 Aug 2015 21:59:14 GMT
Repository: lucy
Updated Branches:
  refs/heads/master 25d664cdc -> eded0db31


Copy C host-specific code to Go.

Copy the C binding code in anticipation of replacing it with Go-specific
binding code.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/fed6ca76
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/fed6ca76
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/fed6ca76

Branch: refs/heads/master
Commit: fed6ca76b6b8310c1db97bfbbed1b56af47e6194
Parents: 633c4d3
Author: Marvin Humphrey <marvin@rectangular.com>
Authored: Tue Jul 14 18:57:11 2015 -0700
Committer: Marvin Humphrey <marvin@rectangular.com>
Committed: Fri Jul 31 14:13:55 2015 -0700

----------------------------------------------------------------------
 common/charmonizer.c    |   2 +-
 common/charmonizer.main |   2 +-
 go/cfext/lucy.c         | 483 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 485 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/fed6ca76/common/charmonizer.c
----------------------------------------------------------------------
diff --git a/common/charmonizer.c b/common/charmonizer.c
index d3c3dc3..f0ebfcd 100644
--- a/common/charmonizer.c
+++ b/common/charmonizer.c
@@ -8134,7 +8134,7 @@ lucy_MakeFile_new(chaz_CLI *cli) {
         self->host_src_dir = "xs";
     }
 	else if (chaz_CLI_defined(cli, "enable-go")) {
-        self->host_src_dir = "../c/src";
+        self->host_src_dir = "cfext";
 	}
     else {
         self->host_src_dir = "src";

http://git-wip-us.apache.org/repos/asf/lucy/blob/fed6ca76/common/charmonizer.main
----------------------------------------------------------------------
diff --git a/common/charmonizer.main b/common/charmonizer.main
index 991593b..800442e 100644
--- a/common/charmonizer.main
+++ b/common/charmonizer.main
@@ -252,7 +252,7 @@ lucy_MakeFile_new(chaz_CLI *cli) {
         self->host_src_dir = "xs";
     }
 	else if (chaz_CLI_defined(cli, "enable-go")) {
-        self->host_src_dir = "../c/src";
+        self->host_src_dir = "cfext";
 	}
     else {
         self->host_src_dir = "src";

http://git-wip-us.apache.org/repos/asf/lucy/blob/fed6ca76/go/cfext/lucy.c
----------------------------------------------------------------------
diff --git a/go/cfext/lucy.c b/go/cfext/lucy.c
new file mode 100644
index 0000000..d1044df
--- /dev/null
+++ b/go/cfext/lucy.c
@@ -0,0 +1,483 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+#define C_LUCY_REGEXTOKENIZER
+#define C_LUCY_DOC
+#define C_LUCY_DOCREADER
+#define C_LUCY_DEFAULTDOCREADER
+#define C_LUCY_INVERTER
+#define C_LUCY_INVERTERENTRY
+#define CFISH_USE_SHORT_NAMES
+#define LUCY_USE_SHORT_NAMES
+
+
+
+#include <string.h>
+
+#include "charmony.h"
+
+#include "Lucy/Analysis/RegexTokenizer.h"
+#include "Lucy/Document/Doc.h"
+#include "Lucy/Index/DocReader.h"
+#include "Lucy/Index/Inverter.h"
+#include "Clownfish/Blob.h"
+#include "Clownfish/String.h"
+#include "Clownfish/Err.h"
+#include "Clownfish/Hash.h"
+#include "Clownfish/HashIterator.h"
+#include "Clownfish/Num.h"
+#include "Clownfish/Vector.h"
+#include "Clownfish/Class.h"
+#include "Clownfish/Util/Memory.h"
+#include "Clownfish/Util/StringHelper.h"
+#include "Lucy/Analysis/Token.h"
+#include "Lucy/Analysis/Inversion.h"
+#include "Lucy/Document/HitDoc.h"
+#include "Lucy/Index/Segment.h"
+#include "Lucy/Plan/FieldType.h"
+#include "Lucy/Plan/Schema.h"
+#include "Lucy/Store/InStream.h"
+#include "Lucy/Store/OutStream.h"
+#include "Lucy/Util/Freezer.h"
+
+#if defined(CHY_HAS_PCRE_H)
+
+#include <pcre.h>
+
+static uint32_t
+S_count_code_points(const char *string, size_t len);
+
+bool
+RegexTokenizer_is_available(void) { // PCRE build (CHY_HAS_PCRE_H defined): report the tokenizer as usable.
+    return true;
+}
+
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, String *pattern) { // Compile `pattern` (or a built-in default when NULL) with PCRE; returns self.
+    Analyzer_init((Analyzer*)self);
+    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
+
+    char *pattern_buf = NULL; // Buffer from Str_To_Utf8, released with FREEMEM below; stays NULL on the default path.
+    const char *pattern_ptr; // The bytes actually handed to pcre_compile.
+    if (pattern) {
+        ivars->pattern = Str_Clone(pattern);
+        pattern_buf = Str_To_Utf8(ivars->pattern);
+        pattern_ptr = pattern_buf;
+    }
+    else {
+        pattern_ptr = "\\w+(?:['\\x{2019}]\\w+)*"; // Default: runs of word chars, optionally joined by ' or U+2019.
+        ivars->pattern
+            = Str_new_from_trusted_utf8(pattern_ptr, strlen(pattern_ptr));
+    }
+
+    int options = PCRE_UTF8 | PCRE_NO_UTF8_CHECK;
+#ifdef PCRE_BSR_UNICODE
+    // Available since PCRE 7.4
+    options |= PCRE_BSR_UNICODE;
+#endif
+#ifdef PCRE_NEWLINE_LF
+    // Available since PCRE 6.7
+    options |= PCRE_NEWLINE_LF;
+#endif
+    const char *err_ptr;
+    int err_offset;
+    pcre *re = pcre_compile(pattern_ptr, options, &err_ptr, &err_offset, NULL);
+    if (pattern_buf) { // The UTF-8 copy was only needed for the compile call above.
+        FREEMEM(pattern_buf);
+    }
+    if (!re) {
+        THROW(ERR, "%s", err_ptr); // Only the message is reported; err_offset is not included.
+    }
+
+    // TODO: Check whether pcre_study improves performance
+
+    ivars->token_re = re;
+
+    return self;
+}
+
+void
+RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { // Release the pattern string and the compiled regex, then invoke SUPER_DESTROY.
+    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
+    DECREF(ivars->pattern);
+    pcre *re = (pcre*)ivars->token_re; // Stored as an opaque pointer; it is the pcre* assigned in RegexTokenizer_init.
+    if (re) {
+        pcre_free(re);
+    }
+    SUPER_DESTROY(self, REGEXTOKENIZER);
+}
+
+void
+RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
+                                 size_t string_len, Inversion *inversion) { // Append one Token to `inversion` per regex match found in `string`.
+    RegexTokenizerIVARS *const ivars = RegexTokenizer_IVARS(self);
+    pcre      *re          = (pcre*)ivars->token_re;
+    int        byte_offset = 0; // Byte position where the next pcre_exec search starts.
+    uint32_t   cp_offset   = 0; // Code points counted up to byte_offset.
+    int        options     = PCRE_NO_UTF8_CHECK;
+    int        ovector[3]; // [0] = match start byte, [1] = match end byte (as used below).
+
+    int return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+                                options, ovector, 3);
+    while (return_code >= 0) {
+        const char *match     = string + ovector[0];
+        size_t      match_len = ovector[1] - ovector[0];
+
+        uint32_t cp_before  = S_count_code_points(string + byte_offset,
+                                                  ovector[0] - byte_offset); // Code points skipped between the previous match end and this match.
+        uint32_t cp_start   = cp_offset + cp_before;
+        uint32_t cp_matched = S_count_code_points(match, match_len);
+        uint32_t cp_end     = cp_start + cp_matched;
+
+        // Add a token to the new inversion.
+        Token *token = Token_new(match, match_len, cp_start, cp_end, 1.0f, 1); // Start/end passed here are code point offsets, not byte offsets.
+        Inversion_Append(inversion, token);
+
+        byte_offset = ovector[1]; // NOTE(review): a zero-length match would not advance byte_offset -- confirm patterns cannot match empty.
+        cp_offset   = cp_end;
+        return_code = pcre_exec(re, NULL, string, string_len, byte_offset,
+                                options, ovector, 3);
+    }
+
+    if (return_code != PCRE_ERROR_NOMATCH) { // The loop exits on any negative code; only "no match" is a normal finish.
+        THROW(ERR, "pcre_exec failed: %d", return_code);
+    }
+}
+
+static uint32_t
+S_count_code_points(const char *string, size_t len) { // Count the code points in the first `len` bytes of `string`; throws if `len` splits a sequence.
+    uint32_t num_code_points = 0;
+    size_t i = 0;
+
+    while (i < len) {
+        i += StrHelp_UTF8_COUNT[(uint8_t)(string[i])]; // Advance by the table entry for this byte (presumably the sequence length -- confirm).
+        ++num_code_points;
+    }
+
+    if (i != len) { // The last step overshot `len`, so the range ended partway through a sequence.
+        THROW(ERR, "Match between code point boundaries in '%s'", string);
+    }
+
+    return num_code_points;
+}
+
+#else // CHY_HAS_PCRE_H
+
+bool
+RegexTokenizer_is_available(void) { // Build without CHY_HAS_PCRE_H: report the tokenizer as unavailable.
+    return false;
+}
+
+RegexTokenizer*
+RegexTokenizer_init(RegexTokenizer *self, String *pattern) { // No-PCRE stub: ignores its arguments and always throws.
+    UNUSED_VAR(self);
+    UNUSED_VAR(pattern);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+    UNREACHABLE_RETURN(RegexTokenizer*); // Follows the unconditional THROW above.
+}
+
+void
+RegexTokenizer_Set_Token_RE_IMP(RegexTokenizer *self, void *token_re) { // No-PCRE stub: ignores its arguments and always throws.
+    UNUSED_VAR(self);
+    UNUSED_VAR(token_re);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+void
+RegexTokenizer_Destroy_IMP(RegexTokenizer *self) { // No-PCRE stub: releases nothing and always throws.
+    UNUSED_VAR(self);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+void
+RegexTokenizer_Tokenize_Utf8_IMP(RegexTokenizer *self, const char *string,
+                                 size_t string_len, Inversion *inversion) { // No-PCRE stub: ignores its arguments and always throws.
+    UNUSED_VAR(self);
+    UNUSED_VAR(string);
+    UNUSED_VAR(string_len);
+    UNUSED_VAR(inversion);
+    THROW(ERR,
+          "RegexTokenizer is not available because Lucy was compiled"
+          " without PCRE.");
+}
+
+#endif // CHY_HAS_PCRE_H
+
+/********************************** Doc ********************************/
+
+Doc*
+Doc_init(Doc *self, void *fields, int32_t doc_id) { // Set up a Doc over `fields` (or a new empty Hash when NULL) with the given doc_id; returns self.
+    DocIVARS *const ivars = Doc_IVARS(self);
+    Hash *hash;
+
+    if (fields) {
+        hash = (Hash *)INCREF(CERTIFY(fields, HASH)); // Takes a reference to the caller's object; CERTIFY presumably rejects non-HASH input -- confirm.
+    }
+    else {
+        hash = Hash_new(0);
+    }
+    ivars->fields = hash;
+    ivars->doc_id = doc_id;
+
+    return self;
+}
+
+void
+Doc_Set_Fields_IMP(Doc *self, void *fields) { // Replace the Doc's fields hash, releasing the previous one.
+    DocIVARS *const ivars = Doc_IVARS(self);
+    DECREF(ivars->fields);
+    ivars->fields = CERTIFY(fields, HASH); // NOTE(review): unlike Doc_init, no INCREF is taken here -- confirm the caller is meant to hand over its reference.
+}
+
+uint32_t
+Doc_Get_Size_IMP(Doc *self) { // Return Hash_Get_Size() of the Doc's fields hash.
+    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
+    return Hash_Get_Size(hash);
+}
+
+void
+Doc_Store_IMP(Doc *self, String *field, Obj *value) { // Store `value` under `field` in the Doc's fields hash.
+    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
+    Hash_Store(hash, field, INCREF(value)); // A reference to `value` is taken before it is handed to the hash.
+}
+
+void
+Doc_Serialize_IMP(Doc *self, OutStream *outstream) { // Write the fields hash to `outstream`, followed by doc_id as a C32.
+    DocIVARS *const ivars = Doc_IVARS(self);
+    Hash *hash = (Hash*)ivars->fields;
+    Freezer_serialize_hash(hash, outstream);
+    OutStream_Write_C32(outstream, ivars->doc_id);
+}
+
+Doc*
+Doc_Deserialize_IMP(Doc *self, InStream *instream) { // Populate self from `instream` in the order Doc_Serialize_IMP writes: fields hash, then doc_id.
+    DocIVARS *const ivars = Doc_IVARS(self);
+    ivars->fields = Freezer_read_hash(instream);
+    ivars->doc_id = InStream_Read_C32(instream);
+    return self;
+}
+
+Obj*
+Doc_Extract_IMP(Doc *self, String *field) { // Look up `field` in the fields hash and return the result through INCREF.
+    Hash *hash = (Hash*)Doc_IVARS(self)->fields;
+    return INCREF(Hash_Fetch(hash, field));
+}
+
+Hash*
+Doc_Dump_IMP(Doc *self) { // Not implemented in this binding: always throws "TODO".
+    UNUSED_VAR(self);
+    THROW(ERR, "TODO");
+    UNREACHABLE_RETURN(Hash*); // Follows the unconditional THROW above.
+}
+
+Doc*
+Doc_Load_IMP(Doc *self, Obj *dump) { // Not implemented in this binding: always throws "TODO".
+    UNUSED_VAR(self);
+    UNUSED_VAR(dump);
+    THROW(ERR, "TODO");
+    UNREACHABLE_RETURN(Doc*); // Follows the unconditional THROW above.
+}
+
+bool
+Doc_Equals_IMP(Doc *self, Obj *other) { // True when `other` is this object, or is a DOC whose fields hash equals this one's.
+    if ((Doc*)other == self)   { return true;  }
+    if (!Obj_is_a(other, DOC)) { return false; }
+    DocIVARS *const ivars = Doc_IVARS(self);
+    DocIVARS *const ovars = Doc_IVARS((Doc*)other);
+    return Hash_Equals((Hash*)ivars->fields, (Obj*)ovars->fields); // Only the fields are compared; doc_id is not.
+}
+
+void
+Doc_Destroy_IMP(Doc *self) { // Release the fields hash, then invoke SUPER_DESTROY.
+    DocIVARS *const ivars = Doc_IVARS(self);
+    DECREF(ivars->fields);
+    SUPER_DESTROY(self, DOC);
+}
+
+
+/**************************** DocReader *****************************/
+
+HitDoc*
+DefDocReader_Fetch_Doc_IMP(DefaultDocReader *self, int32_t doc_id) { // Read the stored fields for doc_id and return them wrapped in a new HitDoc.
+    DefaultDocReaderIVARS *const ivars = DefDocReader_IVARS(self);
+    Schema   *const schema = ivars->schema;
+    InStream *const dat_in = ivars->dat_in; // Stream the field names and values are read from.
+    InStream *const ix_in  = ivars->ix_in; // Stream holding, per doc, a U64 position within dat_in.
+    Hash     *const fields = Hash_new(1);
+    int64_t   start;
+    uint32_t  num_fields;
+    uint32_t  field_name_cap = 31;
+    char     *field_name = (char*)MALLOCATE(field_name_cap + 1); // Scratch buffer reused for every field name; grown on demand below.
+
+    // Get data file pointer from index, read number of fields.
+    InStream_Seek(ix_in, (int64_t)doc_id * 8); // 8 bytes per entry, matching the U64 read on the next line.
+    start = InStream_Read_U64(ix_in);
+    InStream_Seek(dat_in, start);
+    num_fields = InStream_Read_C32(dat_in);
+
+    // Decode stored data and build up the doc field by field.
+    while (num_fields--) {
+        uint32_t        field_name_len;
+        Obj       *value;
+        FieldType *type;
+
+        // Read field name.
+        field_name_len = InStream_Read_C32(dat_in);
+        if (field_name_len > field_name_cap) {
+            field_name_cap = field_name_len;
+            field_name     = (char*)REALLOCATE(field_name,
+                                                    field_name_cap + 1);
+        }
+        InStream_Read_Bytes(dat_in, field_name, field_name_len); // No terminator is written; the name is always used with its explicit length.
+
+        // Find the Field's FieldType.
+        String *field_name_str = SSTR_WRAP_UTF8(field_name, field_name_len);
+        type = Schema_Fetch_Type(schema, field_name_str);
+
+        // Read the field value.
+        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) {
+            case FType_TEXT: {
+                    uint32_t value_len = InStream_Read_C32(dat_in);
+                    char *buf = (char*)MALLOCATE(value_len + 1); // +1 leaves room for the terminator written below.
+                    InStream_Read_Bytes(dat_in, buf, value_len);
+                    buf[value_len] = '\0';
+                    value = (Obj*)Str_new_steal_utf8(buf, value_len); // buf is not freed here; presumably the String takes it over -- confirm.
+                    break;
+                }
+            case FType_BLOB: {
+                    uint32_t value_len = InStream_Read_C32(dat_in);
+                    char *buf = (char*)MALLOCATE(value_len);
+                    InStream_Read_Bytes(dat_in, buf, value_len);
+                    value = (Obj*)Blob_new_steal(buf, value_len); // buf is not freed here; presumably the Blob takes it over -- confirm.
+                    break;
+                }
+            case FType_FLOAT32:
+                value = (Obj*)Float_new(InStream_Read_F32(dat_in));
+                break;
+            case FType_FLOAT64:
+                value = (Obj*)Float_new(InStream_Read_F64(dat_in));
+                break;
+            case FType_INT32:
+                value = (Obj*)Int_new((int32_t)InStream_Read_C32(dat_in));
+                break;
+            case FType_INT64:
+                value = (Obj*)Int_new((int64_t)InStream_Read_C64(dat_in));
+                break;
+            default:
+                value = NULL;
+                THROW(ERR, "Unrecognized type: %o", type); // NOTE(review): field_name and fields are not released on this path -- confirm that is acceptable.
+        }
+
+        // Store the value.
+        Hash_Store_Utf8(fields, field_name, field_name_len, value);
+    }
+    FREEMEM(field_name);
+
+    HitDoc *retval = HitDoc_new(fields, doc_id, 0.0);
+    DECREF(fields); // Drop this function's reference; presumably HitDoc_new keeps its own -- confirm.
+    return retval;
+}
+
+/**************************** Inverter *****************************/
+
+static InverterEntry*
+S_fetch_entry(InverterIVARS *ivars, String *field) { // Find or create the pooled InverterEntry for `field`; throws when the field is unknown.
+    Schema *const schema = ivars->schema;
+    int32_t field_num = Seg_Field_Num(ivars->segment, field);
+    if (!field_num) { // 0 is treated as "the segment has no number for this field".
+        // This field seems not to be in the segment yet.  Try to find it in
+        // the Schema.
+        if (Schema_Fetch_Type(schema, field)) {
+            // The field is in the Schema.  Get a field num from the Segment.
+            field_num = Seg_Add_Field(ivars->segment, field);
+        }
+        else {
+            // We've truly failed to find the field.  The user must
+            // not have spec'd it.
+            THROW(ERR, "Unknown field name: '%o'", field);
+        }
+    }
+
+    InverterEntry *entry
+        = (InverterEntry*)Vec_Fetch(ivars->entry_pool, field_num); // entry_pool is indexed by field number.
+    if (!entry) {
+        entry = InvEntry_new(schema, (String*)field, field_num); // Created on first use, then kept in the pool for later calls.
+        Vec_Store(ivars->entry_pool, field_num, (Obj*)entry);
+    }
+    return entry;
+}
+
+void
+Inverter_Invert_Doc_IMP(Inverter *self, Doc *doc) { // For each field of `doc`: look up its entry, check the value's class, and pass the entry to Inverter_Add_Field.
+    InverterIVARS *const ivars = Inverter_IVARS(self);
+    Hash *const fields = (Hash*)Doc_Get_Fields(doc);
+
+    // Prepare for the new doc.
+    Inverter_Set_Doc(self, doc);
+
+    // Extract and invert the doc's fields.
+    HashIterator *iter = HashIter_new(fields);
+    while (HashIter_Next(iter)) {
+        String *field = HashIter_Get_Key(iter);
+        Obj    *obj   = HashIter_Get_Value(iter);
+
+        InverterEntry *inventry = S_fetch_entry(ivars, field); // Throws for fields unknown to both the segment and the schema.
+        InverterEntryIVARS *inventry_ivars = InvEntry_IVARS(inventry);
+        FieldType *type = inventry_ivars->type;
+
+        // Get the field value.
+        switch (FType_Primitive_ID(type) & FType_PRIMITIVE_ID_MASK) { // CERTIFY results are discarded below; presumably the calls exist to reject mismatched values -- confirm.
+            case FType_TEXT: {
+                    CERTIFY(obj, STRING);
+                    break;
+                }
+            case FType_BLOB: {
+                    CERTIFY(obj, BLOB);
+                    break;
+                }
+            case FType_INT32:
+            case FType_INT64: {
+                    CERTIFY(obj, INTEGER);
+                    break;
+                }
+            case FType_FLOAT32:
+            case FType_FLOAT64: {
+                    CERTIFY(obj, FLOAT);
+                    break;
+                }
+            default:
+                THROW(ERR, "Unrecognized type: %o", type);
+        }
+
+        if (inventry_ivars->value != obj) { // Swap references only when the entry currently holds a different object.
+            DECREF(inventry_ivars->value);
+            inventry_ivars->value = INCREF(obj);
+        }
+
+        Inverter_Add_Field(self, inventry);
+    }
+    DECREF(iter);
+}
+
+


Mime
View raw message