guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] 01/02: Reimplement 'unidata_to_charset.pl' in Awk.


From: Ludovic Courtès
Subject: [Guile-commits] 01/02: Reimplement 'unidata_to_charset.pl' in Awk.
Date: Thu, 24 Mar 2022 09:31:44 -0400 (EDT)

civodul pushed a commit to branch main
in repository guile.

commit 63886aeda2d11a11fcd2415b7ee41e816a6f2bf0
Author: Timothy Sample <samplet@ngyro.com>
AuthorDate: Wed Mar 16 21:13:45 2022 -0600

    Reimplement 'unidata_to_charset.pl' in Awk.
    
    * libguile/unidata_to_charset.pl: Delete file.
    * libguile/unidata_to_charset.awk: New file.
    * libguile/Makefile.am (EXTRA_DIST): Adjust accordingly.
    
    Signed-off-by: Ludovic Courtès <ludo@gnu.org>
---
 libguile/Makefile.am            |   2 +-
 libguile/unidata_to_charset.awk | 409 ++++++++++++++++++++++++++++++++++++++++
 libguile/unidata_to_charset.pl  | 401 ---------------------------------------
 3 files changed, 410 insertions(+), 402 deletions(-)

diff --git a/libguile/Makefile.am b/libguile/Makefile.am
index 40619d379..b2a7d1c51 100644
--- a/libguile/Makefile.am
+++ b/libguile/Makefile.am
@@ -728,7 +728,7 @@ EXTRA_DIST = ChangeLog-scm ChangeLog-threads                \
     guile-func-name-check                                              \
     cpp-E.syms cpp-E.c cpp-SIG.syms cpp-SIG.c                          \
     c-tokenize.lex                                                     \
-    scmconfig.h.top libgettext.h unidata_to_charset.pl libguile.map    \
+    scmconfig.h.top libgettext.h unidata_to_charset.awk libguile.map   \
     vm-operations.h libguile-@GUILE_EFFECTIVE_VERSION@-gdb.scm         \
     $(lightening_c_files) $(lightening_extra_files)
 #    $(DOT_DOC_FILES) $(EXTRA_DOT_DOC_FILES) \
diff --git a/libguile/unidata_to_charset.awk b/libguile/unidata_to_charset.awk
new file mode 100644
index 000000000..11dfb2686
--- /dev/null
+++ b/libguile/unidata_to_charset.awk
@@ -0,0 +1,409 @@
+# unidata_to_charset.awk --- Compute SRFI-14 charsets from UnicodeData.txt
+#
+# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 3 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+# Utilities
+###########
+
+# Report MESSAGE on standard error, then exit with status STATUS.
+function die(status, message) {
+    exit_status = status;
+    print "unidata_to_charset.awk:", message | "cat 1>&2";
+    exit status;
+}
+
+# Parse the string S as a hexadecimal number (uppercase digits only).
+# R, C, B, and I are local variables that need not be set by callers.
+# Most Awk implementations have an 'strtonum' function that we could
+# use, but it is not part of POSIX.
+function hex(s, r, c, b, i) {
+    if (length(s) == 0) {
+        die(1, "Cannot parse empty string as hexadecimal.");
+    }
+    r = 0;
+    for (i = 1; i <= length(s); i++) {
+        c = substr(s, i, 1);
+        b = 0;
+        if      (c == "0") { b =  0; }
+        else if (c == "1") { b =  1; }
+        else if (c == "2") { b =  2; }
+        else if (c == "3") { b =  3; }
+        else if (c == "4") { b =  4; }
+        else if (c == "5") { b =  5; }
+        else if (c == "6") { b =  6; }
+        else if (c == "7") { b =  7; }
+        else if (c == "8") { b =  8; }
+        else if (c == "9") { b =  9; }
+        else if (c == "A") { b = 10; }
+        else if (c == "B") { b = 11; }
+        else if (c == "C") { b = 12; }
+        else if (c == "D") { b = 13; }
+        else if (c == "E") { b = 14; }
+        else if (c == "F") { b = 15; }
+        else { die(1, "Invalid hexadecimal character: " c); }
+        r *= 16;
+        r += b;
+    }
+    return r;
+}
+
+# Program initialization
+########################
+
+BEGIN {
+    # UnicodeData.txt columns are separated by semicolons.
+    FS = ";";
+    # Set by 'die'; the END block checks it to stop early.
+    exit_status = 0;
+
+    # Names of the charsets, in the order the END block prints them.
+    n = 0;
+    all_charsets[n++] = "lower_case";
+    all_charsets[n++] = "upper_case";
+    all_charsets[n++] = "title_case";
+    all_charsets[n++] = "letter";
+    all_charsets[n++] = "digit";
+    all_charsets[n++] = "hex_digit";
+    all_charsets[n++] = "letter_plus_digit";
+    all_charsets[n++] = "graphic";
+    all_charsets[n++] = "whitespace";
+    all_charsets[n++] = "printing";
+    all_charsets[n++] = "iso_control";
+    all_charsets[n++] = "punctuation";
+    all_charsets[n++] = "symbol";
+    all_charsets[n++] = "blank";
+    all_charsets[n++] = "ascii";
+    all_charsets[n++] = "empty";
+    all_charsets[n++] = "designated";
+    all_charsets_count = n;
+
+    # Give every charset an empty state: no open run, no ranges.
+    for (i = 0; i < all_charsets_count; i++) {
+        cs = all_charsets[i];
+        state[cs, "count"] = 0;
+        state[cs, "start"] = -1;
+        state[cs, "end"] = -1;
+    }
+}
+
+# Record initialization
+#######################
+
+# In this block we give names to each field, and do some basic
+# initialization.
+{
+    codepoint = hex($1);
+    codepoint_end = codepoint;    # Widened later for First/Last ranges.
+    charset_count = 0;            # No charset matched yet for this record.
+
+    name = $2;
+    category = $3;
+    uppercase = $13;
+    lowercase = $14;
+}
+
+# Some pairs of lines in UnicodeData.txt delimit ranges of
+# characters.
+name ~ /First>$/ {
+    getline;    # Read the next record; it must be the "Last>" partner.
+    last_name = name;
+    sub(/First>$/, "Last>", last_name);
+    if (last_name != $2) {
+        # 'die' records the exit status and exits by itself, so no
+        # further statements are needed after the call.
+        die(1, "Invalid range in Unicode data.");
+    }
+    codepoint_end = hex($1);
+}
+
+# Character set predicates
+##########################
+
+## The lower_case character set
+###############################
+
+# For Unicode, we follow Java's specification: a character is
+# lowercase if
+#    * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
+#    * the Unicode attribute table does not give a lowercase mapping
+#      for it, and
+#    * at least one of the following is true:
+#          o the Unicode attribute table gives a mapping to uppercase
+#            for the character, or
+#          o the name for the character in the Unicode attribute table
+#            contains the words "SMALL LETTER" or "SMALL LIGATURE".
+
+lowercase == "" &&
+(uppercase != "" || name ~ /(SMALL LETTER|SMALL LIGATURE)/) &&
+!(codepoint >= 8192 && codepoint <= 12287) {
+    charsets[charset_count++] = "lower_case";
+}
+
+## The upper_case character set
+###############################
+
+# For Unicode, we follow Java's specification: a character is
+# uppercase if
+#    * it is not in the range [U+2000,U+2FFF] ([8192,12287]), and
+#    * the Unicode attribute table does not give an uppercase mapping
+#      for it (this excludes titlecase characters), and
+#    * at least one of the following is true:
+#          o the Unicode attribute table gives a mapping to lowercase
+#            for the character, or
+#          o the name for the character in the Unicode attribute table
+#            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
+
+uppercase == "" &&
+(lowercase != "" || name ~ /(CAPITAL LETTER|CAPITAL LIGATURE)/) &&
+!(codepoint >= 8192 && codepoint <= 12287) {
+    charsets[charset_count++] = "upper_case";
+}
+
+## The title_case character set
+###############################
+
+# A character is titlecase if it has the category Lt in the character
+# attribute database.
+
+{
+    if (category == "Lt") { charsets[charset_count++] = "title_case"; }
+}
+
+## The letter character set
+###########################
+
+# A letter is any character with one of the letter categories (Lu, Ll,
+# Lt, Lm, Lo) in the Unicode character database.
+
+{
+    # Letters also belong to the "letter_plus_digit" union.
+    if (category == "Lu" || category == "Ll" || category == "Lt" ||
+        category == "Lm" || category == "Lo") {
+        charsets[charset_count++] = "letter";
+        charsets[charset_count++] = "letter_plus_digit";
+    }
+}
+
+## The digit character set
+##########################
+
+# A character is a digit if it has the category Nd in the character
+# attribute database. In Latin-1 and ASCII, the only such characters
+# are 0123456789. In Unicode, there are other digit characters in
+# other code blocks, such as Gujarati digits and Tibetan digits.
+
+"Nd" == category {
+    charsets[charset_count++] = "digit";
+    charsets[charset_count++] = "letter_plus_digit";    # union member
+}
+
+## The hex_digit character set
+##############################
+
+# The only hex digits are 0123456789abcdefABCDEF.
+
+(48 <= codepoint && codepoint <= 57) ||
+(65 <= codepoint && codepoint <= 70) ||
+(97 <= codepoint && codepoint <= 102) {
+    charsets[charset_count++] = "hex_digit";
+}
+
+## The graphic character set
+############################
+
+# Characters that would 'use ink' when printed
+
+(category ~ /L|M|N|P|S/) {
+    charsets[charset_count++] = "graphic";
+    charsets[charset_count++] = "printing";    # graphic is part of printing
+}
+
+## The whitespace character set
+###############################
+
+# A whitespace character is either
+#    * a character with one of the space, line, or paragraph separator
+#      categories (Zs, Zl or Zp) of the Unicode character database.
+#    * U+0009 (09) Horizontal tabulation (\t control-I)
+#    * U+000A (10) Line feed (\n control-J)
+#    * U+000B (11) Vertical tabulation (\v control-K)
+#    * U+000C (12) Form feed (\f control-L)
+#    * U+000D (13) Carriage return (\r control-M)
+
+(9 <= codepoint && codepoint <= 13) ||
+category ~ /Zs|Zl|Zp/ {
+    charsets[charset_count++] = "whitespace";
+    charsets[charset_count++] = "printing";    # whitespace is part of printing
+}
+
+## The iso_control character set
+################################
+
+# The ISO control characters are the Unicode/Latin-1 characters in the
+# ranges [U+0000,U+001F] ([0,31]) and [U+007F,U+009F] ([127,159]).
+
+(0 <= codepoint && codepoint <= 31) ||
+(127 <= codepoint && codepoint <= 159) {
+    charsets[charset_count++] = "iso_control";
+}
+
+## The punctuation character set
+################################
+
+# A punctuation character is any character that has one of the
+# punctuation categories in the Unicode character database (Pc, Pd,
+# Ps, Pe, Pi, Pf, or Po.)
+
+# Note that srfi-14 gives conflicting requirements!!  It claims that
+# only the Unicode punctuation is necessary, but, explicitly calls out
+# the soft hyphen character (U+00AD) as punctuation.  Current versions
+# of Unicode consider U+00AD to be a formatting character, not
+# punctuation.
+
+{
+    if (category ~ /P/) { charsets[charset_count++] = "punctuation"; }
+}
+
+## The symbol character set
+###########################
+
+# A symbol is any character that has one of the symbol categories in
+# the Unicode character database (Sm, Sc, Sk, or So).
+
+{
+    if (category ~ /S/) { charsets[charset_count++] = "symbol"; }
+}
+
+## The blank character set
+##########################
+
+# Blank chars are horizontal whitespace.  A blank character is either
+#    * a character with the space separator category (Zs) in the
+#      Unicode character database.
+#    * U+0009 (9) Horizontal tabulation (\t control-I)
+
+codepoint == 9 || category ~ /Zs/ {
+    charsets[charset_count++] = "blank";
+}
+
+## The ascii character set
+##########################
+
+!(codepoint > 127) {
+    charsets[charset_count++] = "ascii";
+}
+
+## The designated character set
+###############################
+
+# Designated -- All characters except for the surrogates
+
+!(category ~ /Cs/) {
+    charsets[charset_count++] = "designated";
+}
+
+## Other character sets
+#######################
+
+# Note that the "letter_plus_digit" and "printing" character sets, which
+# are unions of other character sets, are included in the patterns
+# matching their constituent parts (i.e., the "letter_plus_digit"
+# character set is included as part of the "letter" and "digit"
+# patterns).
+#
+# Also, the "empty" character set is computed by doing precisely nothing!
+
+# Keeping track of state
+########################
+
+# Update the state for each charset.
+{
+    for (k = 0; k < charset_count; k++) {
+        cs = charsets[k];
+        if (state[cs, "start"] != -1 &&
+            state[cs, "end"] + 1 != codepoint) {
+            # The open run cannot be extended: store it and begin a
+            # new run at this record.
+            n = state[cs, "count"]++;
+            state[cs, "ranges", n, 0] = state[cs, "start"];
+            state[cs, "ranges", n, 1] = state[cs, "end"];
+            state[cs, "start"] = codepoint;
+        } else if (state[cs, "start"] == -1) {
+            # First member seen for this charset: open its first run.
+            state[cs, "start"] = codepoint;
+        }
+        state[cs, "end"] = codepoint_end;
+    }
+}
+
+# Printing and error handling
+#############################
+
+END {
+    # Normally, an exit statement runs all the 'END' blocks before
+    # actually exiting.  We use the 'exit_status' variable to short
+    # circuit the rest of the 'END' block by reissuing the exit
+    # statement, so that no output is produced after 'die' was called.
+    if (exit_status != 0) {
+        exit exit_status;
+    }
+
+    # Write a bit of a header.
+    print("/* srfi-14.i.c -- standard SRFI-14 character set data */");
+    print("");
+    print("/* This file is #include'd by srfi-14.c.  */");
+    print("");
+    print("/* This file was generated from");
+    print("   https://unicode.org/Public/UNIDATA/UnicodeData.txt");
+    print("   with the unidata_to_charset.awk script.  */");
+    print("");
+
+    for (i = 0; i < all_charsets_count; i++) {
+        cs = all_charsets[i];
+
+        # The run that is still open was never stored; store it now.
+        if (state[cs, "start"] != -1) {
+            count = state[cs, "count"];
+            state[cs, "count"]++;
+            state[cs, "ranges", count, 0] = state[cs, "start"];
+            state[cs, "ranges", count, 1] = state[cs, "end"];
+        }
+
+        count = state[cs, "count"];
+
+        print("static const scm_t_char_range cs_" cs "_ranges[] = {");
+        for (j = 0; j < count; j++) {
+            rstart = state[cs, "ranges", j, 0];
+            rend = state[cs, "ranges", j, 1];
+            if (j + 1 < count) {
+                printf("  {0x%04x, 0x%04x},\n", rstart, rend);
+            } else {
+                printf("  {0x%04x, 0x%04x}\n", rstart, rend);
+            }
+        }
+        print("};");
+        print("");
+
+        # 'count' still holds the number of ranges printed above.
+        printf("static const size_t cs_%s_len = %d;\n", cs, count);
+        if (i + 1 < all_charsets_count) {
+            print("");
+        }
+    }
+}
+
+# And we're done.
diff --git a/libguile/unidata_to_charset.pl b/libguile/unidata_to_charset.pl
deleted file mode 100755
index 9cd7e6e71..000000000
--- a/libguile/unidata_to_charset.pl
+++ /dev/null
@@ -1,401 +0,0 @@
-#!/usr/bin/perl
-# unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
-#
-# Copyright (C) 2009, 2010, 2022 Free Software Foundation, Inc.
-#
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 3 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-
-open(my $in,  "<",  "UnicodeData.txt")  or die "Can't open UnicodeData.txt: $!";
-open(my $out, ">",  "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!";
-
-# For Unicode, we follow Java's specification: a character is
-# lowercase if
-#    * it is not in the range [U+2000,U+2FFF], and
-#    * the Unicode attribute table does not give a lowercase mapping
-#      for it, and
-#    * at least one of the following is true:
-#          o the Unicode attribute table gives a mapping to uppercase
-#            for the character, or
-#          o the name for the character in the Unicode attribute table
-#            contains the words "SMALL LETTER" or "SMALL LIGATURE".
-
-sub lower_case {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
-        && (!defined($lowercase) || $lowercase eq "")
-        && ((defined($uppercase) && $uppercase ne "")
-            || ($name =~ /(SMALL LETTER|SMALL LIGATURE)/))) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# For Unicode, we follow Java's specification: a character is
-# uppercase if
-#    * it is not in the range [U+2000,U+2FFF], and
-#    * the Unicode attribute table does not give an uppercase mapping
-#      for it (this excludes titlecase characters), and
-#    * at least one of the following is true:
-#          o the Unicode attribute table gives a mapping to lowercase
-#            for the character, or
-#          o the name for the character in the Unicode attribute table
-#            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
-
-sub upper_case {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
-        && (!defined($uppercase) || $uppercase eq "")
-        && ((defined($lowercase) && $lowercase ne "")
-            || ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/))) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# A character is titlecase if it has the category Lt in the character
-# attribute database.
-
-sub title_case {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (defined($category) && $category eq "Lt") {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# A letter is any character with one of the letter categories (Lu, Ll,
-# Lt, Lm, Lo) in the Unicode character database.
-
-sub letter {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (defined($category) && ($category eq "Lu"
-                               || $category eq "Ll"
-                               || $category eq "Lt"
-                               || $category eq "Lm"
-                               || $category eq "Lo")) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# A character is a digit if it has the category Nd in the character
-# attribute database. In Latin-1 and ASCII, the only such characters
-# are 0123456789. In Unicode, there are other digit characters in
-# other code blocks, such as Gujarati digits and Tibetan digits.
-
-sub digit {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (defined($category) && $category eq "Nd") {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# The only hex digits are 0123456789abcdefABCDEF. 
-
-sub hex_digit {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (($codepoint >= 0x30 && $codepoint <= 0x39)
-        || ($codepoint >= 0x41 && $codepoint <= 0x46)
-        || ($codepoint >= 0x61 && $codepoint <= 0x66)) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# The union of char-set:letter and char-set:digit.
-
-sub letter_plus_digit {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (letter($codepoint, $name, $category, $uppercase, $lowercase)
-        || digit($codepoint, $name, $category, $uppercase, $lowercase)) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# Characters that would 'use ink' when printed
-sub graphic {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if ($category =~ (/L|M|N|P|S/)) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# A whitespace character is either
-#    * a character with one of the space, line, or paragraph separator
-#      categories (Zs, Zl or Zp) of the Unicode character database.
-#    * U+0009 Horizontal tabulation (\t control-I)
-#    * U+000A Line feed (\n control-J)
-#    * U+000B Vertical tabulation (\v control-K)
-#    * U+000C Form feed (\f control-L)
-#    * U+000D Carriage return (\r control-M)
-
-sub whitespace {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if ($category =~ (/Zs|Zl|Zp/)
-        || $codepoint == 0x9
-        || $codepoint == 0xA 
-        || $codepoint == 0xB 
-        || $codepoint == 0xC 
-        || $codepoint == 0xD) { 
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# A printing character is one that would occupy space when printed,
-# i.e., a graphic character or a space character. char-set:printing is
-# the union of char-set:whitespace and char-set:graphic.
-
-sub printing {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (whitespace($codepoint, $name, $category, $uppercase, $lowercase)
-        || graphic($codepoint, $name, $category, $uppercase, $lowercase)) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# The ISO control characters are the Unicode/Latin-1 characters in the
-# ranges [U+0000,U+001F] and [U+007F,U+009F].
-
-sub iso_control {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if (($codepoint >= 0x00 && $codepoint <= 0x1F)
-        || ($codepoint >= 0x7F && $codepoint <= 0x9F)) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# A punctuation character is any character that has one of the
-# punctuation categories in the Unicode character database (Pc, Pd,
-# Ps, Pe, Pi, Pf, or Po.)
-
-# Note that srfi-14 gives conflicting requirements!!  It claims that
-# only the Unicode punctuation is necessary, but, explicitly calls out
-# the soft hyphen character (U+00AD) as punctution.  Current versions
-# of Unicode consider U+00AD to be a formatting character, not
-# punctuation.
-
-sub punctuation {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if ($category =~ (/P/)) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-        
-# A symbol is any character that has one of the symbol categories in
-# the Unicode character database (Sm, Sc, Sk, or So).
-
-sub symbol {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if ($category =~ (/S/)) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-        
-# Blank chars are horizontal whitespace.  A blank character is either
-#    * a character with the space separator category (Zs) in the
-#      Unicode character database.
-#    * U+0009 Horizontal tabulation (\t control-I) 
-sub blank {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if ($category =~ (/Zs/)
-        || $codepoint == 0x9) { 
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# ASCII
-sub ascii {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if ($codepoint <= 0x7F) {
-        return 1;
-    } else {
-        return 0;
-    }
-}
-
-# Empty
-sub empty {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    return 0;
-}
-
-# Designated -- All characters except for the surrogates
-sub designated {
-    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
-    if ($category =~ (/Cs/)) {
-        return 0;
-    } else {
-        return 1;
-    }
-}
-
-
-# The procedure generates the two C structures necessary to describe a
-# given category.
-sub compute {
-    my($f) = @_;
-    my $start = -1;
-    my $end = -1;
-    my $len = 0;
-    my @rstart = (-1);
-    my @rend = (-1);
-
-    seek($in, 0, 0) or die "Can't seek to beginning of file: $!";
-
-    print "$f\n";
-
-    while (<$in>) {
-        # Parse the 14 column, semicolon-delimited UnicodeData.txt
-        # file
-        chomp;
-        my(@fields) = split(/;/);
-
-        # The codepoint: an integer
-        my $codepoint = hex($fields[0]); 
-
-        # If this is a character range, the last character in this
-        # range
-        my $codepoint_end = $codepoint;  
-
-        # The name of the character
-        my $name = $fields[1];    
-
-        # A two-character category code, such as Ll (lower-case
-        # letter)
-        my $category = $fields[2];       
-
-        # The codepoint of the uppercase version of this char
-        my $uppercase = $fields[12];   
-
-        # The codepoint of the lowercase version of this char
-        my $lowercase = $fields[13];    
-
-        my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase);
-        if ($pass == 1) {
-
-            # Some pairs of lines in UnicodeData.txt delimit ranges of
-            # characters.
-            if ($name =~ /First/) {
-                $line = <$in>;
-                die $! if $!;
-                $codepoint_end = hex( (split(/;/, $line))[0] );
-            }                 
-
-            # Compute ranges of characters [start:end] that meet the
-            # criteria.  Store the ranges.
-            if ($start == -1) {
-                $start = $codepoint;
-                $end = $codepoint_end;
-            } elsif ($end + 1 == $codepoint) {
-                $end = $codepoint_end;
-            } else {
-                $rstart[$len] = $start;
-                $rend[$len] = $end;
-                $len++;
-                $start = $codepoint;
-                $end = $codepoint_end;
-            }
-        }
-    }
-
-    # Extra logic to ensure that the last range is included
-    if ($start != -1) {
-        if ($len > 0 && $rstart[@rstart-1] != $start) {
-            $rstart[$len] = $start;
-            $rend[$len] = $end;
-            $len++;
-        } elsif ($len == 0) {
-           $rstart[0] = $start;
-           $rend[0] = $end;
-           $len++;
-        }
-    }
-
-    # Print the C struct that contains the range list.
-    print $out "static const scm_t_char_range cs_" . $f . "_ranges[] = {\n";
-    if ($rstart[0] != -1) {
-        for (my $i=0; $i<@rstart-1; $i++) {
-            printf $out "  {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i];
-        }
-        printf $out "  {0x%04x, 0x%04x}\n", $rstart[@rstart-1], $rend[@rstart-1];
-    }
-    print $out "};\n\n";
-
-    # Print the C struct that contains the range list length and
-    # pointer to the range list.
-    print $out "static const size_t cs_${f}_len = $len;\n\n";
-}
-
-# Write a bit of a header
-print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
-print $out "/* This file is #include'd by srfi-14.c.  */\n\n";
-print $out "/* This file was generated from\n";
-print $out "   http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";;
-print $out "   with the unidata_to_charset.pl script.  */\n\n";
-
-# Write the C structs for each SRFI-14 charset
-compute "lower_case";
-compute "upper_case";
-compute "title_case";
-compute "letter";
-compute "digit";
-compute "hex_digit";
-compute "letter_plus_digit";
-compute "graphic";
-compute "whitespace";
-compute "printing";
-compute "iso_control";
-compute "punctuation";
-compute "symbol";
-compute "blank";
-compute "ascii";
-compute "empty";
-compute "designated";
-
-close $in;
-close $out;
-
-exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
-
-# And we're done.
-
-
-
-
-
-



reply via email to

[Prev in Thread] Current Thread [Next in Thread]