guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] GNU Guile branch, master, updated. release_1-9-2-123-gd5


From: Michael Gran
Subject: [Guile-commits] GNU Guile branch, master, updated. release_1-9-2-123-gd5ecf57
Date: Fri, 28 Aug 2009 01:53:50 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".

http://git.savannah.gnu.org/cgit/guile.git/commit/?id=d5ecf5797dade3882db816bd0a325568e3610ade

The branch, master has been updated
       via  d5ecf5797dade3882db816bd0a325568e3610ade (commit)
       via  d0434ddf2522265b7e023ca84c8b3b4773f2459c (commit)
      from  a27173cf78a03758317dfe5571380ed3347fa242 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit d5ecf5797dade3882db816bd0a325568e3610ade
Author: Michael Gran <address@hidden>
Date:   Thu Aug 27 18:52:53 2009 -0700

    Fix FUNC_NAME definitions and #endif in srfi-14.[ch]
    
    * libguile/srfi-14.c: whitespace and FUNC_NAME fixes
    
    * libguile/srfi-14.h: #endif comment

commit d0434ddf2522265b7e023ca84c8b3b4773f2459c
Author: Michael Gran <address@hidden>
Date:   Thu Aug 27 18:23:46 2009 -0700

    Script to generate srfi-14 charsets from UnicodeData.txt
    
    This script was used to generate srfi-14.i.c from the UnicodeData.txt
    file supplied by ftp://www.unicode.org/Public/UNIDATA/
    
    * libguile/unidata_to_charset.pl

-----------------------------------------------------------------------

Summary of changes:
 libguile/srfi-14.c             |   25 +---
 libguile/srfi-14.h             |    2 +-
 libguile/unidata_to_charset.pl |  399 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 403 insertions(+), 23 deletions(-)
 create mode 100755 libguile/unidata_to_charset.pl

diff --git a/libguile/srfi-14.c b/libguile/srfi-14.c
index fe56270..7ab65ac 100644
--- a/libguile/srfi-14.c
+++ b/libguile/srfi-14.c
@@ -704,7 +704,6 @@ SCM_DEFINE (scm_char_set_cursor, "char-set-cursor", 1, 0, 0,
     }
   SCM_RETURN_NEWSMOB (scm_tc16_charset_cursor, cur_data);
 }
-
 #undef FUNC_NAME
 
 
@@ -734,7 +733,6 @@ SCM_DEFINE (scm_char_set_ref, "char-set-ref", 2, 0, 0,
     SCM_MISC_ERROR ("invalid character set cursor: ~A", scm_list_1 (cursor));
   return SCM_MAKE_CHAR (cur_data->n);
 }
-
 #undef FUNC_NAME
 
 
@@ -784,7 +782,6 @@ SCM_DEFINE (scm_char_set_cursor_next, 
"char-set-cursor-next", 2, 0, 0,
 
   return cursor;
 }
-
 #undef FUNC_NAME
 
 
@@ -803,7 +800,6 @@ SCM_DEFINE (scm_end_of_char_set_p, "end-of-char-set?", 1, 
0, 0,
 
   return SCM_BOOL_F;
 }
-
 #undef FUNC_NAME
 
 
@@ -943,7 +939,6 @@ SCM_DEFINE (scm_char_set_for_each, "char-set-for-each", 2, 
0, 0,
 
   return SCM_UNSPECIFIED;
 }
-
 #undef FUNC_NAME
 
 
@@ -1007,7 +1002,6 @@ SCM_DEFINE (scm_char_set_copy, "char-set-copy", 1, 0, 0,
 
   return ret;
 }
-
 #undef FUNC_NAME
 
 
@@ -1182,7 +1176,6 @@ SCM_DEFINE (scm_char_set_filter, "char-set-filter", 2, 1, 
0,
       }
   return ret;
 }
-
 #undef FUNC_NAME
 
 
@@ -1214,7 +1207,6 @@ SCM_DEFINE (scm_char_set_filter_x, "char-set-filter!", 3, 
0, 0,
       }
   return base_cs;
 }
-
 #undef FUNC_NAME
 
 
@@ -1347,7 +1339,6 @@ SCM_DEFINE (scm_char_set_size, "char-set-size", 1, 0, 0,
 
   return scm_from_int (count);
 }
-
 #undef FUNC_NAME
 
 
@@ -1400,7 +1391,6 @@ SCM_DEFINE (scm_char_set_to_list, "char-set->list", 1, 0, 
0,
       result = scm_cons (SCM_MAKE_CHAR (n), result);
   return result;
 }
-
 #undef FUNC_NAME
 
 
@@ -1445,7 +1435,6 @@ SCM_DEFINE (scm_char_set_to_string, "char-set->string", 
1, 0, 0,
       }
   return result;
 }
-
 #undef FUNC_NAME
 
 
@@ -1489,7 +1478,6 @@ SCM_DEFINE (scm_char_set_every, "char-set-every", 2, 0, 0,
       }
   return SCM_BOOL_T;
 }
-
 #undef FUNC_NAME
 
 
@@ -1634,7 +1622,6 @@ SCM_DEFINE (scm_char_set_complement, 
"char-set-complement", 1, 0, 0,
   charsets_complement (p, q);
   return res;
 }
-
 #undef FUNC_NAME
 
 
@@ -1816,7 +1803,6 @@ SCM_DEFINE (scm_char_set_complement_x, 
"char-set-complement!", 1, 0, 0,
   cs = scm_char_set_complement (cs);
   return cs;
 }
-
 #undef FUNC_NAME
 
 
@@ -1831,7 +1817,6 @@ SCM_DEFINE (scm_char_set_union_x, "char-set-union!", 1, 
0, 1,
   cs1 = scm_char_set_union (scm_cons (cs1, rest));
   return cs1;
 }
-
 #undef FUNC_NAME
 
 
@@ -1846,7 +1831,6 @@ SCM_DEFINE (scm_char_set_intersection_x, 
"char-set-intersection!", 1, 0, 1,
   cs1 = scm_char_set_intersection (scm_cons (cs1, rest));
   return cs1;
 }
-
 #undef FUNC_NAME
 
 
@@ -1861,7 +1845,6 @@ SCM_DEFINE (scm_char_set_difference_x, 
"char-set-difference!", 1, 0, 1,
   cs1 = scm_char_set_difference (cs1, rest);
   return cs1;
 }
-
 #undef FUNC_NAME
 
 
@@ -1877,7 +1860,6 @@ SCM_DEFINE (scm_char_set_xor_x, "char-set-xor!", 1, 0, 1,
    */
   return scm_char_set_xor (scm_cons (cs1, rest));
 }
-
 #undef FUNC_NAME
 
 
@@ -1897,8 +1879,8 @@ SCM_DEFINE (scm_char_set_diff_plus_intersection_x,
   cs2 = intersect;
   return scm_values (scm_list_2 (cs1, cs2));
 }
-
 #undef FUNC_NAME
+
 
 
 /* Standard character sets.  */
@@ -1937,7 +1919,7 @@ define_charset (const char *name, const scm_t_char_set *p)
 SCM_DEFINE (scm_debug_char_set, "debug-char-set", 1, 0, 0,
             (SCM charset),
             "Print out the internal C structure of @var{charset}.\n")
-#define FUNC_NAME s_debug_char_set
+#define FUNC_NAME s_scm_debug_char_set
 {
   int i;
   scm_t_char_set *cs = SCM_CHARSET_DATA (charset);
@@ -1956,9 +1938,8 @@ SCM_DEFINE (scm_debug_char_set, "debug-char-set", 1, 0, 0,
   printf ("\n");
   return SCM_UNSPECIFIED;
 }
-
 #undef FUNC_NAME
-#endif
+#endif /* SCM_CHARSET_DEBUG */
 
 
 
diff --git a/libguile/srfi-14.h b/libguile/srfi-14.h
index bdcdd08..1b9c295 100644
--- a/libguile/srfi-14.h
+++ b/libguile/srfi-14.h
@@ -102,7 +102,7 @@ SCM_API SCM scm_char_set_xor_x (SCM cs1, SCM rest);
 SCM_API SCM scm_char_set_diff_plus_intersection_x (SCM cs1, SCM cs2, SCM rest);
 #if SCM_CHARSET_DEBUG
 SCM_API SCM scm_debug_char_set (SCM cs);
-#endif
+#endif /* SCM_CHARSET_DEBUG */
 
 SCM_API SCM scm_char_set_lower_case;
 SCM_API SCM scm_char_set_upper_case;
diff --git a/libguile/unidata_to_charset.pl b/libguile/unidata_to_charset.pl
new file mode 100755
index 0000000..6871e67
--- /dev/null
+++ b/libguile/unidata_to_charset.pl
@@ -0,0 +1,399 @@
+#!/usr/bin/perl
+# unidata_to_charset.pl --- Compute SRFI-14 charsets from UnicodeData.txt
+#
+# Copyright (C) 2009 Free Software Foundation, Inc.
+# 
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 3 of the License, or (at your option) any later version.
+# 
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+# 
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+
+open(my $in,  "<",  "UnicodeData.txt")  or die "Can't open UnicodeData.txt: 
$!";           
+open(my $out, ">",  "srfi-14.i.c") or die "Can't open srfi-14.i.c: $!";
+
+# For Unicode, we follow Java's specification: a character is
+# lowercase if
+#    * it is not in the range [U+2000,U+2FFF], and
+#    * the Unicode attribute table does not give a lowercase mapping
+#      for it, and
+#    * at least one of the following is true:
+#          o the Unicode attribute table gives a mapping to uppercase
+#            for the character, or
+#          o the name for the character in the Unicode attribute table
+#            contains the words "SMALL LETTER" or "SMALL LIGATURE".
+
+sub lower_case {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
+        && (!defined($lowercase) || $lowercase eq "")
+        && ((defined($uppercase) && $uppercase ne "")
+            || ($name =~ /(SMALL LETTER|SMALL LIGATURE)/))) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# For Unicode, we follow Java's specification: a character is
+# uppercase if
+#    * it is not in the range [U+2000,U+2FFF], and
+#    * the Unicode attribute table does not give an uppercase mapping
+#      for it (this excludes titlecase characters), and
+#    * at least one of the following is true:
+#          o the Unicode attribute table gives a mapping to lowercase
+#            for the character, or
+#          o the name for the character in the Unicode attribute table
+#            contains the words "CAPITAL LETTER" or "CAPITAL LIGATURE".
+
+sub upper_case {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (($codepoint < 0x2000 || $codepoint > 0x2FFF)
+        && (!defined($uppercase) || $uppercase eq "")
+        && ((defined($lowercase) && $lowercase ne "")
+            || ($name =~ /(CAPITAL LETTER|CAPITAL LIGATURE)/))) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# A character is titlecase if it has the category Lt in the character
+# attribute database.
+
+sub title_case {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (defined($category) && $category eq "Lt") {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# A letter is any character with one of the letter categories (Lu, Ll,
+# Lt, Lm, Lo) in the Unicode character database.
+
+sub letter {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (defined($category) && ($category eq "Lu"
+                               || $category eq "Ll"
+                               || $category eq "Lt"
+                               || $category eq "Lm"
+                               || $category eq "Lo")) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# A character is a digit if it has the category Nd in the character
+# attribute database. In Latin-1 and ASCII, the only such characters
+# are 0123456789. In Unicode, there are other digit characters in
+# other code blocks, such as Gujarati digits and Tibetan digits.
+
+sub digit {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (defined($category) && $category eq "Nd") {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# The only hex digits are 0123456789abcdefABCDEF. 
+
+sub hex_digit {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (($codepoint >= 0x30 && $codepoint <= 0x39)
+        || ($codepoint >= 0x41 && $codepoint <= 0x46)
+        || ($codepoint >= 0x61 && $codepoint <= 0x66)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# The union of char-set:letter and char-set:digit.
+
+sub letter_plus_digit {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (letter($codepoint, $name, $category, $uppercase, $lowercase)
+        || digit($codepoint, $name, $category, $uppercase, $lowercase)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# Characters that would 'use ink' when printed
+sub graphic {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if ($category =~ (/L|M|N|P|S/)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# A whitespace character is either
+#    * a character with one of the space, line, or paragraph separator
+#      categories (Zs, Zl or Zp) of the Unicode character database.
+#    * U+0009 Horizontal tabulation (\t control-I)
+#    * U+000A Line feed (\n control-J)
+#    * U+000B Vertical tabulation (\v control-K)
+#    * U+000C Form feed (\f control-L)
+#    * U+000D Carriage return (\r control-M)
+
+sub whitespace {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if ($category =~ (/Zs|Zl|Zp/)
+        || $codepoint == 0x9
+        || $codepoint == 0xA 
+        || $codepoint == 0xB 
+        || $codepoint == 0xC 
+        || $codepoint == 0xD) { 
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# A printing character is one that would occupy space when printed,
+# i.e., a graphic character or a space character. char-set:printing is
+# the union of char-set:whitespace and char-set:graphic.
+
+sub printing {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (whitespace($codepoint, $name, $category, $uppercase, $lowercase)
+        || graphic($codepoint, $name, $category, $uppercase, $lowercase)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# The ISO control characters are the Unicode/Latin-1 characters in the
+# ranges [U+0000,U+001F] and [U+007F,U+009F].
+
+sub iso_control {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if (($codepoint >= 0x00 && $codepoint <= 0x1F)
+        || ($codepoint >= 0x7F && $codepoint <= 0x9F)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# A punctuation character is any character that has one of the
+# punctuation categories in the Unicode character database (Pc, Pd,
+# Ps, Pe, Pi, Pf, or Po.)
+
+# Note that srfi-14 gives conflicting requirements!!  It claims that
+# only the Unicode punctuation is necessary, but, explicitly calls out
+# the soft hyphen character (U+00AD) as punctution.  Current versions
+# of Unicode consider U+00AD to be a formatting character, not
+# punctuation.
+
+sub punctuation {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if ($category =~ (/P/)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+        
+# A symbol is any character that has one of the symbol categories in
+# the Unicode character database (Sm, Sc, Sk, or So).
+
+sub symbol {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if ($category =~ (/S/)) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+        
+# Blank chars are horizontal whitespace.  A blank character is either
+#    * a character with the space separator category (Zs) in the
+#      Unicode character database.
+#    * U+0009 Horizontal tabulation (\t control-I) 
+sub blank {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if ($category =~ (/Zs/)
+        || $codepoint == 0x9) { 
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# ASCII
+sub ascii {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    if ($codepoint <= 0x7F) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+# Empty
+sub empty {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    return 0;
+}
+
+# Full -- All characters.  
+sub full {
+    my($codepoint, $name, $category, $uppercase, $lowercase)= @_;
+    return 1;
+}
+
+
+# The procedure generates the two C structures necessary to describe a
+# given category.
+sub compute {
+    my($f) = @_;
+    my $start = -1;
+    my $end = -1;
+    my $len = 0;
+    my @rstart = (-1);
+    my @rend = (-1);
+
+    seek($in, 0, 0) or die "Can't seek to beginning of file: $!";
+
+    print "$f\n";
+
+    while (<$in>) {
+        # Parse the 14 column, semicolon-delimited UnicodeData.txt
+        # file
+        chomp;
+        my(@fields) = split(/;/);
+
+        # The codepoint: an integer
+        my $codepoint = hex($fields[0]); 
+
+        # If this is a character range, the last character in this
+        # range
+        my $codepoint_end = $codepoint;  
+
+        # The name of the character
+        my $name = $fields[1];    
+
+        # A two-character category code, such as Ll (lower-case
+        # letter)
+        my $category = $fields[2];       
+
+        # The codepoint of the uppercase version of this char
+        my $uppercase = $fields[12];   
+
+        # The codepoint of the lowercase version of this char
+        my $lowercase = $fields[13];    
+
+        my $pass = &$f($codepoint,$name,$category,$uppercase,$lowercase);
+        if ($pass == 1) {
+
+            # Some pairs of lines in UnicodeData.txt delimit ranges of
+            # characters.
+            if ($name =~ /First/) {
+                $line = <$in>;
+                die $! if $!;
+                $codepoint_end = hex( (split(/;/, $line))[0] );
+            }                 
+
+            # Compute ranges of characters [start:end] that meet the
+            # criteria.  Store the ranges.
+            if ($start == -1) {
+                $start = $codepoint;
+                $end = $codepoint_end;
+            } elsif ($end + 1 == $codepoint) {
+                $end = $codepoint_end;
+            } else {
+                $rstart[$len] = $start;
+                $rend[$len] = $end;
+                $len++;
+                $start = $codepoint;
+                $end = $codepoint_end;
+            }
+        }
+    }
+
+    # Extra logic to ensure that the last range is included
+    if ($start != -1) {
+        if ($len > 0 && address@hidden != $start) {
+            $rstart[$len] = $start;
+            $rend[$len] = $end;
+            $len++;
+        } elsif ($len == 0) {
+            $rstart[0] = $start;
+            $rend[0] = $end;
+        }
+    }
+
+    # Print the C struct that contains the range list.
+    print $out "scm_t_char_range cs_" . $f . "_ranges[] = {\n";
+    if ($rstart[0] != -1) {
+        for (my $i=0; $i<@rstart-1; $i++) {
+            printf $out "  {0x%04x, 0x%04x},\n", $rstart[$i], $rend[$i];
+        }
+        printf $out "  {0x%04x, 0x%04x}\n", address@hidden, address@hidden;
+    }
+    print $out "};\n\n";
+
+    # Print the C struct that contains the range list length and
+    # pointer to the range list.
+    print $out "scm_t_char_set cs_${f} = {\n";
+    print $out "  $len,\n";
+    print $out "  cs_" . $f . "_ranges\n";
+    print $out "};\n\n";
+}
+
+# Write a bit of a header
+print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n";
+print $out "/* This file is #include'd by srfi-14.c.  */\n\n";
+print $out "/* This file was generated from\n"
+print $out "   http://unicode.org/Public/UNIDATA/UnicodeData.txt\n";;
+print $out "   with the unidata_to_charset.pl script.  */\n\n";
+
+# Write the C structs for each SRFI-14 charset
+compute "lower_case";
+compute "upper_case";
+compute "title_case";
+compute "letter";
+compute "digit";
+compute "hex_digit";
+compute "letter_plus_digit";
+compute "graphic";
+compute "whitespace";
+compute "printing";
+compute "iso_control";
+compute "punctuation";
+compute "symbol";
+compute "blank";
+compute "ascii";
+compute "empty";
+compute "full";
+
+close $in;
+close $out;
+
+exec ('indent srfi-14.i.c') or print STDERR "call to 'indent' failed: $!";
+
+# And we're done.
+
+
+
+
+
+


hooks/post-receive
-- 
GNU Guile




reply via email to

[Prev in Thread] Current Thread [Next in Thread]