whimsical-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject [whimsy] branch master updated: Fix up some ASCII conversions
Date Tue, 26 May 2020 18:32:13 GMT
This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new 5ad0f71  Fix up some ASCII conversions
5ad0f71 is described below

commit 5ad0f7143bc70f12af95ec2ef3ba790a67d77885
Author: Sebb <sebb@apache.org>
AuthorDate: Tue May 26 19:31:50 2020 +0100

    Fix up some ASCII conversions
    
    Also handle upper/lower case better
---
 lib/whimsy/asf/person.rb | 142 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 102 insertions(+), 40 deletions(-)

diff --git a/lib/whimsy/asf/person.rb b/lib/whimsy/asf/person.rb
index 5cfd60c..42c9fa7 100644
--- a/lib/whimsy/asf/person.rb
+++ b/lib/whimsy/asf/person.rb
@@ -11,56 +11,109 @@ module ASF
   class Person
     # sort support
 
-    def self.asciize(name)
-      if name.match %r{[^\x00-\x7F]}
+    # Convert non-ASCII characters to equivalent ASCII
+    # optionally: replace any remaining non-word characters (e.g. '.' and space) with '-'
+    def self.asciize(name, nonWord = '-')
+      if name.match %r{[^\x00-\x7F]} # at least one non-ASCII character present
         # digraphs.  May be culturally sensitive
+        # Note that the combining accents require matching two characters
         name.gsub! %r{\u00df}, 'ss'
-        name.gsub! %r{\u00e4|a\u0308}, 'ae'
-        name.gsub! %r{\u00e5|a\u030a}, 'aa'
-        name.gsub! %r{\u00e6}, 'ae'
-        name.gsub! %r{\u00f1|n\u0303}, 'ny'
-        name.gsub! %r{\u00f6|o\u0308}, 'oe'
-        name.gsub! %r{\u00fc|u\u0308}, 'ue'
+        name.gsub! %r{\u00e4|a\u0308}, 'ae' # 308 = combining diaeresis
+        name.gsub! %r{\u00e5|a\u030a}, 'aa' # a with ring above: should this translate as 'a'?
+        name.gsub! %r{\u00c5|A\u030a}, 'AA' # a with ring above: should this translate as 'a'?
+        name.gsub! %r{\u00e6},         'ae' # small letter ae
+        name.gsub! %r{\u00c6},         'AE' # large letter AE
+        name.gsub! %r{\u00f1|n\u0303}, 'ny' # 303 = combining tilde
+        name.gsub! %r{\u00d1|N\u0303}, 'NY' # 303 = combining tilde
+        name.gsub! %r{\u00f6|o\u0308}, 'oe' # 308 = combining diaeresis
+        name.gsub! %r{\u00d6|O\u0308}, 'OE' # 308 = combining diaeresis
+        name.gsub! %r{\u00de},         'TH' # thorn
+        name.gsub! %r{\u00fe},         'th' # thorn
+        name.gsub! %r{\u00fc|u\u0308}, 'ue' # 308 = combining diaeresis
+        name.gsub! %r{\u00dc|U\u0308}, 'UE' # 308 = combining diaeresis
 
         # latin 1
-        name.gsub! %r{\u00c9}, 'e'
-        name.gsub! %r{\u00d3}, 'o'
-        name.gsub! %r{[\u00e0-\u00e5]}, 'a'
-        name.gsub! %r{\u00e7}, 'c'
+        name.gsub! %r{[\u00e0-\u00e3]}, 'a' # a with various accents
+        name.gsub! %r{[\u00c0-\u00c3]}, 'A' # A with various accents
+        name.gsub! %r{\u00e7},          'c' # c-cedilla
+        name.gsub! %r{\u00c7},          'C' # C-cedilla
+        name.gsub! %r{\u00f0},          'd' # eth
+        name.gsub! %r{\u00d0},          'D' # eth
         name.gsub! %r{[\u00e8-\u00eb]}, 'e'
+        name.gsub! %r{[\u00c8-\u00cb]}, 'E'
         name.gsub! %r{[\u00ec-\u00ef]}, 'i'
-        name.gsub! %r{[\u00f2-\u00f6]|\u00f8}, 'o'
-        name.gsub! %r{[\u00f9-\u00fc]}, 'u'
-        name.gsub! %r{[\u00fd\u00ff]}, 'y'
+        name.gsub! %r{[\u00cc-\u00cf]}, 'I'
+        name.gsub! %r{[\u00f2-\u00f5\u00f8]}, 'o'
+        name.gsub! %r{[\u00d2-\u00d5\u00d8]}, 'O'
+        name.gsub! %r{[\u00f9-\u00fb]}, 'u'
+        name.gsub! %r{[\u00d9-\u00db]}, 'U'
+        name.gsub! %r{[\u00fd\u00ff]},  'y'
+        name.gsub! %r{[\u00dd\u0178]},  'Y'
 
         # Latin Extended-A
-        name.gsub! %r{[\u0100-\u0105]}, 'a'
-        name.gsub! %r{[\u0106-\u010d]}, 'c'
-        name.gsub! %r{[\u010e-\u0111]}, 'd'
-        name.gsub! %r{[\u0112-\u011b]}, 'e'
-        name.gsub! %r{[\u011c-\u0123]}, 'g'
-        name.gsub! %r{[\u0124-\u0127]}, 'h'
-        name.gsub! %r{[\u0128-\u0131]}, 'i'
-        name.gsub! %r{[\u0132-\u0133]}, 'ij'
-        name.gsub! %r{[\u0134-\u0135]}, 'j'
-        name.gsub! %r{[\u0136-\u0138]}, 'k'
-        name.gsub! %r{[\u0139-\u0142]}, 'l'
-        name.gsub! %r{[\u0143-\u014b]}, 'n'
-        name.gsub! %r{[\u014C-\u0151]}, 'o'
-        name.gsub! %r{[\u0152-\u0153]}, 'oe'
-        name.gsub! %r{[\u0154-\u0159]}, 'r'
-        name.gsub! %r{[\u015a-\u0162]}, 's'
-        name.gsub! %r{[\u0162-\u0167]}, 't'
-        name.gsub! %r{[\u0168-\u0173]}, 'u'
-        name.gsub! %r{[\u0174-\u0175]}, 'w'
-        name.gsub! %r{[\u0176-\u0178]}, 'y'
-        name.gsub! %r{[\u0179-\u017e]}, 'z'
-
-        # denormalized diacritics
+        name.gsub! %r{[\u0100\u0102\u0104]}, 'A'
+        name.gsub! %r{[\u0101\u0103\u0105]}, 'a'
+        name.gsub! %r{[\u0106\u0108\u010A\u010C]}, 'C'
+        name.gsub! %r{[\u0107\u0109\u010B\u010D]}, 'c'
+        name.gsub! %r{[\u010E\u0110]}, 'D'
+        name.gsub! %r{[\u010F\u0111]}, 'd'
+        name.gsub! %r{[\u0112\u0114\u0116\u0118\u011A]}, 'E'
+        name.gsub! %r{[\u0113\u0115\u0117\u0119\u011B]}, 'e'
+        name.gsub! %r{[\u014A]}, 'ENG'
+        name.gsub! %r{[\u014B]}, 'eng'
+        name.gsub! %r{[\u011C\u011E\u0120\u0122]}, 'G'
+        name.gsub! %r{[\u011D\u011F\u0121\u0123]}, 'g'
+        name.gsub! %r{[\u0124\u0126]}, 'H'
+        name.gsub! %r{[\u0125\u0127]}, 'h'
+        name.gsub! %r{[\u0128\u012A\u012C\u012E\u0130]}, 'I'
+        name.gsub! %r{[\u0129\u012B\u012D\u012F\u0131]}, 'i'
+        name.gsub! %r{[\u0132]}, 'IJ'
+        name.gsub! %r{[\u0133]}, 'ij'
+        name.gsub! %r{[\u0134]}, 'J'
+        name.gsub! %r{[\u0135]}, 'j'
+        name.gsub! %r{[\u0136]}, 'K'
+        name.gsub! %r{[\u0137]}, 'k'
+        name.gsub! %r{[\u0138]}, 'kra'
+        name.gsub! %r{[\u0139\u013B\u013D\u013F\u0141]}, 'L'
+        name.gsub! %r{[\u013A\u013C\u013E\u0140\u0142]}, 'l'
+        name.gsub! %r{[\u0143\u0145\u0147]}, 'N'
+        name.gsub! %r{[\u0144\u0146\u0148\u0149]}, 'n'
+        name.gsub! %r{[\u014C\u014E\u0150]}, 'O'
+        name.gsub! %r{[\u014D\u014F\u0151]}, 'o'
+        name.gsub! %r{[\u0152]}, 'OE'
+        name.gsub! %r{[\u0153]}, 'oe'
+        name.gsub! %r{[\u0154\u0156\u0158]}, 'R'
+        name.gsub! %r{[\u0155\u0157\u0159]}, 'r'
+        name.gsub! %r{[\u015A\u015C\u015E\u0160]}, 'S'
+        name.gsub! %r{[\u015B\u015D\u015F\u0161]}, 's'
+        name.gsub! %r{[\u0162\u0164\u0166]}, 'T'
+        name.gsub! %r{[\u0163\u0165\u0167]}, 't'
+        name.gsub! %r{[\u0168\u016A\u016C\u016E\u0170\u0172]}, 'U'
+        name.gsub! %r{[\u0169\u016B\u016D\u016F\u0171\u0173]}, 'u'
+        name.gsub! %r{[\u0174]}, 'W'
+        name.gsub! %r{[\u0175]}, 'w'
+        name.gsub! %r{[\u0176\u0178]}, 'Y'
+        name.gsub! %r{[\u0177]}, 'y'
+        name.gsub! %r{[\u0179\u017B\u017D]}, 'Z'
+        name.gsub! %r{[\u017A\u017C\u017E]}, 'z'
+
+        # Latin Extended Additional
+        # N.B. Only ones seen in iclas.txt are included here
+        name.gsub! %r{\u1ea0},          'A' # A with combining dot below
+        name.gsub! %r{\u1ea1},          'a' # a with combining dot below
+        name.gsub! %r{\u1ec4},          'E' # E with circumflex and tilde
+        name.gsub! %r{\u1ec5},          'e' # e with circumflex and tilde
+
+        # remove unhandled combining diacritics (some combinations are handled above)
         name.gsub! %r{[\u0300-\u036f]}, ''
       end
 
-      name.strip.gsub %r{[^\w]+}, '-'
+      if nonWord
+        # deal with any remaining non-word characters
+        name.strip.gsub %r{[^\w]+}, nonWord if nonWord
+      else
+        name
+      end
     end
 
     # generational suffixes
@@ -104,7 +157,16 @@ module ASF
       result
     end
 
-    # return name in a sortable order (last name first)
+    # DRAFT
+    # return name suitable for a filename stem
+    # Should normally be applied to the legal name
+    def self.stem_DRAFT(name)
+      # need to split before 
+      name = name.gsub(',', ' ').split(/ +/).map{|n|n.gsub(%r{^(Dr|Jr|Sr|[A-Z])\.$},'\1')}
+      asciize(name.join('-')).downcase
+    end
+
+    # return public name in a sortable order (last name first)
     def sortable_name
       Person.sortable_name(self.public_name)
     end


Mime
View raw message