lucy-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From nwelln...@apache.org
Subject [lucy-commits] [3/3] git commit: refs/heads/master - Upgrade StandardTokenizer to Unicode 6.3.0
Date Mon, 14 Oct 2013 19:56:07 GMT
Upgrade StandardTokenizer to Unicode 6.3.0

Word break behavior for Hebrew text has changed in Unicode 6.3.0, so
tokens produced for Hebrew text may differ from those in existing indexes.
Users working with Hebrew text should consider reindexing.


Project: http://git-wip-us.apache.org/repos/asf/lucy/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy/commit/fdae9b57
Tree: http://git-wip-us.apache.org/repos/asf/lucy/tree/fdae9b57
Diff: http://git-wip-us.apache.org/repos/asf/lucy/diff/fdae9b57

Branch: refs/heads/master
Commit: fdae9b57bc5efef16d55180b9a864f690a620fe0
Parents: 668e82a
Author: Nick Wellnhofer <wellnhofer@aevum.de>
Authored: Mon Oct 14 21:35:19 2013 +0200
Committer: Nick Wellnhofer <wellnhofer@aevum.de>
Committed: Mon Oct 14 21:50:35 2013 +0200

----------------------------------------------------------------------
 core/Lucy/Analysis/StandardTokenizer.c          |   71 +-
 core/Lucy/Test/Analysis/TestStandardTokenizer.c |    2 +-
 devel/bin/gen_word_break_data.pl                |   25 +-
 modules/unicode/ucd/WordBreak.tab               | 1344 ++++++------
 modules/unicode/ucd/WordBreakTest.json          | 1944 ++++++++++++++++--
 5 files changed, 2608 insertions(+), 778 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy/blob/fdae9b57/core/Lucy/Analysis/StandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Analysis/StandardTokenizer.c b/core/Lucy/Analysis/StandardTokenizer.c
index 9992927..d5f4521 100644
--- a/core/Lucy/Analysis/StandardTokenizer.c
+++ b/core/Lucy/Analysis/StandardTokenizer.c
@@ -34,15 +34,18 @@
  * in devel/bin.
  */
 
-#define WB_ASingle        1
-#define WB_ALetter        2
-#define WB_Numeric        3
-#define WB_Katakana       4
-#define WB_ExtendNumLet   5
-#define WB_Extend_Format  6
-#define WB_MidNumLet      7
-#define WB_MidLetter      8
-#define WB_MidNum         9
+#define WB_ASingle          1
+#define WB_ALetter          2
+#define WB_Hebrew_Letter    3
+#define WB_Numeric          4
+#define WB_Katakana         5
+#define WB_ExtendNumLet     6
+#define WB_Extend_Format    7
+#define WB_Single_Quote     8
+#define WB_Double_Quote     9
+#define WB_MidNumLet       10
+#define WB_MidLetter       11
+#define WB_MidNum          12
 
 #include "WordBreak.tab"
 
@@ -170,27 +173,67 @@ S_parse_word(const char *text, size_t len, lucy_StringIter *iter,
 
         switch (wb) {
             case WB_ALetter:
+            case WB_Hebrew_Letter:
             case WB_Numeric:
                 if (state == WB_Katakana) { goto word_break; }
+                // Rules WB5, WB8, WB9, WB10, and WB13b.
                 break;
             case WB_Katakana:
-                if (state == WB_ALetter || state == WB_Numeric) {
+                if (state != WB_Katakana && state != WB_ExtendNumLet) {
                     goto word_break;
                 }
+                // Rules WB13 and WB13b.
                 break;
             case WB_ExtendNumLet:
+                // Rule WB13a.
                 break;
             case WB_Extend_Format:
-                // keep state
+                // Rule WB4. Keep state.
                 wb = state;
                 break;
+            case WB_Single_Quote:
             case WB_MidNumLet:
             case WB_MidLetter:
             case WB_MidNum:
-                if ((state == WB_ALetter && wb != WB_MidNum)
-                    ||  (state == WB_Numeric && wb != WB_MidLetter)) {
+                if (state == WB_ALetter) {
+                    if (wb == WB_MidNum) { goto word_break; }
                     wb = S_skip_extend_format(text, len, iter);
-                    if (wb == state) { break; }
+                    if (wb == WB_ALetter || wb == WB_Hebrew_Letter) {
+                        // Rules WB6 and WB7.
+                        state = wb;
+                        break;
+                    }
+                }
+                else if (state == WB_Hebrew_Letter) {
+                    if (wb == WB_MidNum) { goto word_break; }
+                    if (wb == WB_Single_Quote) {
+                        // Rule WB7a.
+                        ++end.byte_pos;
+                        ++end.char_pos;
+                    }
+                    wb = S_skip_extend_format(text, len, iter);
+                    if (wb == WB_ALetter || wb == WB_Hebrew_Letter) {
+                        // Rules WB6 and WB7.
+                        state = wb;
+                        break;
+                    }
+                }
+                else if (state == WB_Numeric) {
+                    if (wb == WB_MidLetter) { goto word_break; }
+                    wb = S_skip_extend_format(text, len, iter);
+                    if (wb == state) {
+                        // Rules WB11 and WB12.
+                        break;
+                    }
+                }
+                goto word_break;
+            case WB_Double_Quote:
+                if (state == WB_Hebrew_Letter) {
+                    wb = S_skip_extend_format(text, len, iter);
+                    if (wb == state) {
+                        // Rules WB7b and WB7c.
+                        break;
+                    }
                 }
                 goto word_break;
             default:

http://git-wip-us.apache.org/repos/asf/lucy/blob/fdae9b57/core/Lucy/Test/Analysis/TestStandardTokenizer.c
----------------------------------------------------------------------
diff --git a/core/Lucy/Test/Analysis/TestStandardTokenizer.c b/core/Lucy/Test/Analysis/TestStandardTokenizer.c
index 8a0ae3f..7f181c1 100644
--- a/core/Lucy/Test/Analysis/TestStandardTokenizer.c
+++ b/core/Lucy/Test/Analysis/TestStandardTokenizer.c
@@ -117,7 +117,7 @@ test_tokenizer(TestBatchRunner *runner) {
 
 void
 TestStandardTokenizer_Run_IMP(TestStandardTokenizer *self, TestBatchRunner *runner) {
-    TestBatchRunner_Plan(runner, (TestBatch*)self, 1084);
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 1378);
     test_Dump_Load_and_Equals(runner);
     test_tokenizer(runner);
 }

http://git-wip-us.apache.org/repos/asf/lucy/blob/fdae9b57/devel/bin/gen_word_break_data.pl
----------------------------------------------------------------------
diff --git a/devel/bin/gen_word_break_data.pl b/devel/bin/gen_word_break_data.pl
index 9bcf916..c94d18a 100755
--- a/devel/bin/gen_word_break_data.pl
+++ b/devel/bin/gen_word_break_data.pl
@@ -32,7 +32,7 @@ the UCD to JSON.
 UCD_SRC_DIR should point to a directory containing the files
 WordBreakProperty.txt, WordBreakTest.txt, and DerivedCoreProperties.txt from
 the Unicode Character Database available at
-L<http://www.unicode.org/Public/6.2.0/ucd/>.
+L<http://www.unicode.org/Public/6.3.0/ucd/>.
 
 =head1 OUTPUT FILES
 
@@ -63,14 +63,17 @@ my %wb_map = (
     Newline            => 0,
     Regional_Indicator => 0,  # These are symbols, so ignore them.
     ALetter            => 2,
-    Numeric            => 3,
-    Katakana           => 4,
-    ExtendNumLet       => 5,
-    Extend             => 6,
-    Format             => 6,
-    MidNumLet          => 7,
-    MidLetter          => 8,
-    MidNum             => 9,
+    Hebrew_Letter      => 3,
+    Numeric            => 4,
+    Katakana           => 5,
+    ExtendNumLet       => 6,
+    Extend             => 7,
+    Format             => 7,
+    Single_Quote       => 8,
+    Double_Quote       => 9,
+    MidNumLet          => 10,
+    MidLetter          => 11,
+    MidNum             => 12,
 );
 
 my %opts;
@@ -175,7 +178,7 @@ while (<$in_file>) {
             }
 
             my $wb = $wb->lookup($code);
-            $word = $chr if $wb >= 1 && $wb <= 5;
+            $word = $chr if $wb >= 1 && $wb <= 6;
         }
         elsif ( $break eq "\xD7" ) {    # multiplication sign
             $word .= $chr if $word ne '';
@@ -207,7 +210,7 @@ __DATA__
 
 This file is generated with devel/bin/gen_word_break_data.pl. DO NOT EDIT!
 The contents of this file are derived from the Unicode Character Database,
-version 6.2.0, available from http://www.unicode.org/Public/6.2.0/ucd/.
+version 6.3.0, available from http://www.unicode.org/Public/6.3.0/ucd/.
 The Unicode copyright and permission notice follows.
 
 Copyright (c) 1991-2011 Unicode, Inc. All rights reserved. Distributed under


Mime
View raw message