bug-gettext
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[bug #66378] cldr-plurals fails to parse recent CLDR data: 1c6, ...


From: Michele Locati
Subject: [bug #66378] cldr-plurals fails to parse recent CLDR data: 1c6, ...
Date: Mon, 28 Oct 2024 07:27:41 -0400 (EDT)

Follow-up Comment #1, bug #66378 (group gettext):

I managed to get cldr-plurals to work with the following patch:

From fadb2220b6cd04c50d6d2bb5c6c6488c119c5484 Mon Sep 17 00:00:00 2001
From: Michele Locati <michele@locati.it>
Date: Mon, 28 Oct 2024 12:15:15 +0100
Subject: [PATCH] Accept CLDR rules with XcY samples and c and e vars

---
 gettext-tools/src/cldr-plural-exp.c | 128 ++++++++++++++++++++++++++--
 gettext-tools/src/cldr-plural.y     |   4 +-
 2 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/gettext-tools/src/cldr-plural-exp.c b/gettext-tools/src/cldr-plural-exp.c
index 60d3d0baf..dbce9627b 100644
--- a/gettext-tools/src/cldr-plural-exp.c
+++ b/gettext-tools/src/cldr-plural-exp.c
@@ -24,6 +24,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include "string-buffer.h"
 #include "unistr.h"
 #include "xalloc.h"
 
@@ -101,20 +102,136 @@ cldr_plural_rule_list_free (struct cldr_plural_rule_list_ty *rules)
   free (rules);
 }
 
+static
+const char * get_XcY_end(const char *str)
+{
+  bool found_c = false;
+  if (str[0] < '0' || str[0] > '9')
+    return NULL;
+  str++;
+  while (str[0] != '\0')
+    {
+      if (str[0] == 'c')
+        {
+          if (found_c || str[1] < '0' || str[1] > '9')
+            return NULL;
+          found_c = true;
+        }
+      else if ((str[0] < '0' || str[0] > '9') && str[0] != '.')
+        break;
+      str++;
+    }
+  if (!found_c)
+    return NULL;
+  while(str[0] == ' ')
+    str++;
+  if (str[0] == ',')
+    {
+      str++;
+      while(str[0] == ' ')
+        str++;
+    }
+  return str;
+}
+
+static
+char * remove_XcY(const char *input)
+{
+  struct string_buffer buffer;
+  const char *p;
+  const char *p_next;
+  const char *p_next1;
+  const char *p_next2;
+
+  sb_init (&buffer);
+  p = (char*) input;
+  for (;;) {
+    int comma_and_spaces;
+    p_next1 = strstr(p, "@integer ");
+    p_next2 = strstr(p, "@decimal ");
+    if (p_next1 == NULL && p_next2 == NULL)
+      {
+        sb_append_c(&buffer, p);
+        break;
+      }
+    if (p_next1 != NULL && (p_next2 == NULL || p_next1 < p_next2))
+      p_next = p_next1 + /* strlen("@integer ")*/ 9;
+    else
+      p_next = p_next2 + /* strlen("@decimal ")*/ 9;
+    while (p < p_next)
+      sb_append1(&buffer, *p++);
+    while (p[0] == ' ')
+      sb_append1(&buffer, *p++);
+    comma_and_spaces = -1;
+    for (;;)
+      {
+        const char * XcY_end;
+        if (p[0] < '0' || p[0] > '9')
+          break;
+        XcY_end = get_XcY_end(p);
+        if (XcY_end != NULL)
+          {
+            p = XcY_end;
+            continue;
+          }
+        if (comma_and_spaces >= 0)
+          {
+            sb_append1(&buffer, ',');
+            while (comma_and_spaces > 0)
+              {
+                sb_append1(&buffer, ' ');
+                comma_and_spaces--;
+              }
+          }
+        while ((p[0] >= '0' && p[0] <= '9') || p[0] == '.' || p[0] == '~')
+          {
+            sb_append1(&buffer, p[0]);
+            p++;
+          }
+        if (p[0] != ',')
+          break;
+        comma_and_spaces = 0;
+        p++;
+        while (p[0] == ' ')
+          {
+            comma_and_spaces++;
+            p++;
+          }
+      }
+      if (comma_and_spaces > 0 && p[0] == '\xE2' && p[1] == '\x80' && p[2] ==
'\xA6')
+        {
+          sb_append1(&buffer, ',');
+          while (comma_and_spaces > 0)
+            {
+              sb_append1(&buffer, ' ');
+              comma_and_spaces--;
+            }
+        }
+  }
+  return sb_dupfree_c(&buffer);
+}
+
 struct cldr_plural_rule_list_ty *
 cldr_plural_parse (const char *input)
 {
   struct cldr_plural_parse_args arg;
+  char *input_without_XcY;
 
   memset (&arg, 0, sizeof (struct cldr_plural_parse_args));
-  arg.cp = input;
-  arg.cp_end = input + strlen (input);
+  input_without_XcY = remove_XcY(input);
+  if (input_without_XcY == NULL)
+    return NULL;
+  arg.cp = input_without_XcY;
+  arg.cp_end = input_without_XcY + strlen(input_without_XcY);;
   arg.result = XMALLOC (struct cldr_plural_rule_list_ty);
   memset (arg.result, 0, sizeof (struct cldr_plural_rule_list_ty));
 
   if (yyparse (&arg) != 0)
-    return NULL;
-
+    {
+      free(input_without_XcY);
+      return NULL;
+    }
+  free(input_without_XcY);
   return arg.result;
 }
 
@@ -156,10 +273,11 @@ eval_relation (struct cldr_plural_relation_ty *relation)
       break;
     case 'f': case 't':
     case 'v': case 'w':
+    case 'c': case 'e':
       {
         /* Since plural expression in gettext only supports unsigned
            integer, turn relations whose operand is either 'f', 't',
-           'v', or 'w' into a constant truth value.  */
+           'v', 'w', 'c', or 'e' into a constant truth value.  */
         /* FIXME: check mod?  */
         size_t i;
         for (i = 0; i < relation->ranges->nitems; i++)
diff --git a/gettext-tools/src/cldr-plural.y b/gettext-tools/src/cldr-plural.y
index 05c1b56ec..3e28d224b 100644
--- a/gettext-tools/src/cldr-plural.y
+++ b/gettext-tools/src/cldr-plural.y
@@ -263,6 +263,7 @@ at_decimal: %empty
         ;
 
 sample_list: sample_list1 sample_ellipsis
+        | ELLIPSIS
         ;
 sample_list1: sample_range
         | sample_list1 ',' sample_range
@@ -413,7 +414,8 @@ yylex (YYSTYPE *lval, struct cldr_plural_parse_args *arg)
           {
             switch (ident[0])
               {
-              case 'n': case 'i': case 'f': case 't': case 'v': case 'w':
+              // See
https://unicode.org/reports/tr35/tr35-numbers.html#table-plural-operand-meanings
+              case 'n': case 'i': case 'f': case 't': case 'v': case 'w':
case 'c': case 'e':
                 arg->cp = exp;
                 lval->ival = ident[0];
                 sb_free (&buffer);


This patch basically:

1. accepts the "c" and "e" operands described in
https://unicode.org/reports/tr35/tr35-numbers.html#table-plural-operand-meanings
(assuming they always have a value of zero)

2. strips out the XcY samples (e.g. 1c6, 1.0000001c6), so that the parser
effectively works on, for example,

one: i = 0,1 @integer 0, 1 @decimal 0.0~1.5; many: e = 0 and i != 0 and i %
1000000 = 0 and v = 0 or e != 0..5 @integer 1000000, … @decimal …; other: 
@integer 2~17, 100, 1000, 10000, 100000, … @decimal 2.0~3.5, 10.0, 100.0,
1000.0, 10000.0, 100000.0, 1000000.0, …

instead of

one: i = 0,1 @integer 0, 1 @decimal 0.0~1.5; many: e = 0 and i != 0 and i %
1000000 = 0 and v = 0 or e != 0..5 @integer 1000000, 1c6, 2c6, 3c6, 4c6, 5c6,
6c6, … @decimal 1.0000001c6, 1.1c6, 2.0000001c6, 2.1c6, 3.0000001c6, 3.1c6,
…; other:  @integer 2~17, 100, 1000, 10000, 100000, 1c3, 2c3, 3c3, 4c3, 5c3,
6c3, … @decimal 2.0~3.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0,
1.0001c3, 1.1c3, 2.0001c3, 2.1c3, 3.0001c3, 3.1c3, …


This generates the following output:

nplurals=3; plural=(n==0 || n==1 ? 0 : n!=0 && n%1000000==0 ? 1 : 2);




    _______________________________________________________

Reply to this item at:

  <https://savannah.gnu.org/bugs/?66378>

_______________________________________________
Message sent via Savannah
https://savannah.gnu.org/

Attachment: signature.asc
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]