branch master updated: * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded Unicode characters for source marks locations.

texinfo-commits
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: * tp/Texinfo/XS/parsetexi/def.c (split_def_args):

From:	Patrice Dumas
Subject:	branch master updated: * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded Unicode characters for source marks locations.
Date:	Mon, 30 Jan 2023 17:27:15 -0500
This is an automated email from the git hooks/post-receive script.

pertusus pushed a commit to branch master
in repository texinfo.

The following commit(s) were added to refs/heads/master by this push:
     new 282701e238 * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded Unicode characters for source marks locations.
282701e238 is described below

commit 282701e238e01fce8b36d078f8abf488369e58e4
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Mon Jan 30 23:27:05 2023 +0100

    * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded
    Unicode characters for source marks locations.
    
    * tp/t/19def.t: do not skip end_of_lines_protected_non_ascii test.
    
    * tp/Texinfo/XS/parsetexi/source_marks.c,
    tp/Texinfo/XS/parsetexi/parser.c (count_convert_u8): move
    count_convert_u8 to parser.c.
---
 ChangeLog                              | 11 +++++++++++
 tp/Texinfo/XS/parsetexi/def.c          | 28 +++++++++++++++++++++++++---
 tp/Texinfo/XS/parsetexi/parser.c       | 17 +++++++++++++++++
 tp/Texinfo/XS/parsetexi/parser.h       |  1 +
 tp/Texinfo/XS/parsetexi/source_marks.c | 13 -------------
 tp/t/19def.t                           |  2 +-
 6 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 554ccfaced..c3782056ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2023-01-30  Patrice Dumas  <pertusus@free.fr>
+
+       * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded
+       Unicode characters for source marks locations.
+
+       * tp/t/19def.t: do not skip end_of_lines_protected_non_ascii test.
+
+       * tp/Texinfo/XS/parsetexi/source_marks.c,
+       tp/Texinfo/XS/parsetexi/parser.c (count_convert_u8): move
+       count_convert_u8 to parser.c.
+
 2023-01-30  Patrice Dumas  <pertusus@free.fr>
 
        * tp/Texinfo/XS/parsetexi/source_marks.c (count_convert_u8)
diff --git a/tp/Texinfo/XS/parsetexi/def.c b/tp/Texinfo/XS/parsetexi/def.c
index 52c1ffaac5..306d533fe8 100644
--- a/tp/Texinfo/XS/parsetexi/def.c
+++ b/tp/Texinfo/XS/parsetexi/def.c
@@ -15,6 +15,9 @@
 
 #include <config.h>
 #include <string.h>
+#include <stdbool.h>
+#include "uniconv.h"
+#include "unistr.h"
 
 #include "parser.h"
 #include "text.h"
@@ -201,20 +204,32 @@ split_def_args (ELEMENT *current, int starting_idx)
       char *p;
       ELEMENT *new;
       int len;
-      int current_position = 0;
-      int previous_position = 0;
+      /* count UTF-8 encoded Unicode characters for source marks locations */
+      size_t current_position = 0;
+      size_t previous_position = 0;
+      uint8_t *u8_text = 0;
+      uint8_t *u8_p;
+
       if (e->type == ET_bracketed)
         {
           isolate_last_space (e);
           e->type = ET_bracketed_def_content;
           continue;
         }
+
       if (e->text.end == 0)
         continue;
+
       p = e->text.text;
 
+      if (e->source_mark_list.number)
+        u8_text = u8_strconv_from_encoding (p, "UTF-8",
+                                            iconveh_question_mark);
+      u8_p = u8_text;
+
       while (1)
         {
+          size_t u8_len = 0;
           len = strspn (p, whitespace_chars);
           if (len)
             {
@@ -226,7 +241,13 @@ split_def_args (ELEMENT *current, int starting_idx)
               len = strcspn (p, whitespace_chars);
               new = new_element (ET_NONE);
             }
-          current_position += len;
+          if (u8_text)
+            {
+              u8_len = u8_mbsnlen (u8_p, len);
+              u8_p += u8_len;
+              current_position += u8_len;
+            }
+
           while (e->source_mark_list.number)
             {
               SOURCE_MARK *source_mark
@@ -251,6 +272,7 @@ split_def_args (ELEMENT *current, int starting_idx)
           previous_position = current_position;
         }
       destroy_element (remove_from_contents (current, i--));
+      free (u8_text);
     }
 }
 
diff --git a/tp/Texinfo/XS/parsetexi/parser.c b/tp/Texinfo/XS/parsetexi/parser.c
index 0c369ffc9c..0a51799594 100644
--- a/tp/Texinfo/XS/parsetexi/parser.c
+++ b/tp/Texinfo/XS/parsetexi/parser.c
@@ -18,6 +18,9 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <ctype.h>
+#include <stdbool.h>
+#include "uniconv.h"
+#include "unistr.h"
 
 #include "parser.h"
 #include "text.h"
@@ -31,6 +34,20 @@ const char *digit_chars = "0123456789";
 // [^\S\r\n] in Perl
 const char *whitespace_chars_except_newline = " \t\v\f";
 
+/* count characters, not bytes. */
+size_t
+count_convert_u8 (char *text)
+{
+  /* FIXME error checking? */
+  uint8_t *resultbuf = u8_strconv_from_encoding (text, "UTF-8",
+                                                 iconveh_question_mark);
+  size_t result = u8_mbsnlen (resultbuf, u8_strlen (resultbuf));
+
+  free (resultbuf);
+
+  return result;
+}
+
 /* Check if the contents of S2 appear at S1). */
 int
 looking_at (char *s1, char *s2)
diff --git a/tp/Texinfo/XS/parsetexi/parser.h b/tp/Texinfo/XS/parsetexi/parser.h
index fc0c991dd6..4bf185908d 100644
--- a/tp/Texinfo/XS/parsetexi/parser.h
+++ b/tp/Texinfo/XS/parsetexi/parser.h
@@ -152,6 +152,7 @@ ELEMENT *handle_separator (ELEMENT *current, char separator,
                            char **line_inout);
 
 /* In parser.c */
+size_t count_convert_u8 (char *text);
 ELEMENT *parse_texi (ELEMENT *root_elt, ELEMENT *current_elt);
 void push_conditional_stack (enum command_id cond);
 enum command_id pop_conditional_stack (void);
diff --git a/tp/Texinfo/XS/parsetexi/source_marks.c b/tp/Texinfo/XS/parsetexi/source_marks.c
index 8d23fd727f..51ef605816 100644
--- a/tp/Texinfo/XS/parsetexi/source_marks.c
+++ b/tp/Texinfo/XS/parsetexi/source_marks.c
@@ -14,14 +14,10 @@
    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
 
 #include <string.h>
-#include <stdbool.h>
-#include "uniconv.h"
-#include "unistr.h"
 
 #include "source_marks.h"
 #include "tree.h"
 #include "errors.h"
-/* for debugging only */
 #include "parser.h"
 
 int include_counter = 0;
@@ -72,15 +68,6 @@ add_source_marks (SOURCE_MARK_LIST *source_mark_list, ELEMENT *e)
     }
 }
 
-/* count characters, not bytes. */
-size_t
-count_convert_u8 (char *text)
-{
-  uint8_t *resultbuf = u8_strconv_from_encoding (text, "UTF-8",
-                                                 iconveh_question_mark);
-  return u8_mbsnlen (resultbuf, u8_strlen (resultbuf));
-}
-
 /* ELEMENT should be the parent container.
    The source mark is put in the last content if it is text
    or registered in the parent container. */
diff --git a/tp/t/19def.t b/tp/t/19def.t
index ef08c9695b..3a1b4c0876 100644
--- a/tp/t/19def.t
+++ b/tp/t/19def.t
@@ -124,7 +124,7 @@ deffn
 @end deffn
 '],
 ['end_of_lines_protected_non_ascii',
-undef, {'test_file' => 'end_of_lines_protected_non_ascii.texi',},# 'skip' => 'XS counts bytes not characters' },
+undef, {'test_file' => 'end_of_lines_protected_non_ascii.texi',},
 ],
 ['empty_def_command',
 '@deffn empty deffn
[Prev in Thread]
Current Thread
[Next in Thread]
branch master updated: * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded Unicode characters for source marks locations., Patrice Dumas <=