guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] GNU Guile branch, master, updated. release_1-9-13-98-g644c516


From: Andy Wingo
Subject: [Guile-commits] GNU Guile branch, master, updated. release_1-9-13-98-g644c516
Date: Tue, 23 Nov 2010 21:36:23 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".

http://git.savannah.gnu.org/cgit/guile.git/commit/?id=644c5165ee449a3beccadeb969e02746954703ee

The branch, master has been updated
       via  644c5165ee449a3beccadeb969e02746954703ee (commit)
      from  612aa5bee87bf85b908ed26e73d496af6f0d8520 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 644c5165ee449a3beccadeb969e02746954703ee
Author: Andy Wingo <address@hidden>
Date:   Tue Nov 23 22:39:06 2010 +0100

    fix regexp matches to refer to chars, not bytes
    
    * libguile/regex-posix.c (fixup_multibyte_match): Fixup the match
      structure to refer to character offsets, not byte offsets. Fixes bug
      31650.
    
    * test-suite/tests/regexp.test: Add a test.

-----------------------------------------------------------------------

Summary of changes:
 libguile/regex-posix.c       |   52 ++++++++++++++++++++++++++++++++++++++++-
 test-suite/tests/regexp.test |    9 ++++++-
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/libguile/regex-posix.c b/libguile/regex-posix.c
index 6259f28..4c03577 100644
--- a/libguile/regex-posix.c
+++ b/libguile/regex-posix.c
@@ -1,4 +1,4 @@
-/*     Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004, 2006, 2007 Free Software Foundation, Inc.
+/*     Copyright (C) 1997, 1998, 1999, 2000, 2001, 2004, 2006, 2007, 2010 Free Software Foundation, Inc.
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public License
@@ -53,6 +53,10 @@
 #endif
 #endif
 
+#ifdef HAVE_WCHAR_H
+#include <wchar.h>
+#endif
+
 #include "libguile/async.h"
 #include "libguile/smob.h"
 #include "libguile/symbols.h"
@@ -196,6 +200,43 @@ SCM_DEFINE (scm_make_regexp, "make-regexp", 1, 0, 1,
 }
 #undef FUNC_NAME
 
+#ifdef HAVE_WCHAR_H
+/*
+ * While regexec does respect the current locale, it returns byte
+ * offsets instead of character offsets. This routine fixes up the
+ * regmatch_t structures to refer to characters instead. See "Converting
+ * a Character" in the libc manual, for more details.
+ */
+static void
+fixup_multibyte_match (regmatch_t *matches, int nmatches, char *str)
+{
+  mbstate_t state;
+  int i;
+  size_t char_idx, byte_idx;
+  size_t nbytes = 1; /* just to kick off the for loop */
+
+  memset (&state, '\0', sizeof (state));
+
+  for (char_idx = byte_idx = 0; nbytes > 0; char_idx++, byte_idx += nbytes)
+    {
+      for (i = 0; i < nmatches; ++i)
+        {
+          if (matches[i].rm_so == byte_idx)
+            matches[i].rm_so = char_idx;
+          if (matches[i].rm_eo == byte_idx)
+            matches[i].rm_eo = char_idx;
+        }
+
+      nbytes = mbrlen (str + byte_idx, MB_LEN_MAX, &state);
+    }
+
+  if (nbytes >= (size_t) -2)
+    /* Something is wrong. Shouldn't be possible, as the regex match
+       succeeded.  */
+    abort ();
+}
+#endif
+
 SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0,
             (SCM rx, SCM str, SCM start, SCM flags),
            "Match the compiled regular expression @var{rx} against\n"
@@ -256,11 +297,18 @@ SCM_DEFINE (scm_regexp_exec, "regexp-exec", 2, 2, 0,
   /* re_nsub doesn't account for the `subexpression' representing the
      whole regexp, so add 1 to nmatches. */
 
+  c_str = scm_to_locale_string (substr);
+
   nmatches = SCM_RGX(rx)->re_nsub + 1;
   matches = scm_malloc (sizeof (regmatch_t) * nmatches);
-  c_str = scm_to_locale_string (substr);
   status = regexec (SCM_RGX (rx), c_str, nmatches, matches,
                    scm_to_int (flags));
+
+#ifdef HAVE_WCHAR_H
+  if (!status)
+    fixup_multibyte_match (matches, nmatches, c_str);
+#endif
+
   free (c_str);
 
   if (!status)
diff --git a/test-suite/tests/regexp.test b/test-suite/tests/regexp.test
index efa0e7e..1b58789 100644
--- a/test-suite/tests/regexp.test
+++ b/test-suite/tests/regexp.test
@@ -1,4 +1,4 @@
-;;;; regexp.test --- test Guile's regular expression functions -*- scheme -*-
+;;;; regexp.test ---  test Guile's regexps   -*- coding: utf-8; mode: scheme -*-
 ;;;; Jim Blandy <address@hidden> --- September 1999
 ;;;;
 ;;;;   Copyright (C) 1999, 2004, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
@@ -265,3 +265,10 @@
   ;; Jan Nieuwenhuizen's bug, 2 Sep 1999
   (try "" "_" (make-string 500 #\_)
        'post))
+
+(with-test-prefix "nonascii locales"
+  (with-locale "en_US.utf8"
+    ;; bug 31650
+    (pass-if "match structures refer to char offsets"
+      (equal? (match:substring (string-match ".*" "calçot") 0)
+              "calçot"))))


hooks/post-receive
-- 
GNU Guile



reply via email to

[Prev in Thread] Current Thread [Next in Thread]