poke-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH v2 3/4] std.pk: Implement strtok


From: Arsen Arsenović
Subject: [PATCH v2 3/4] std.pk: Implement strtok
Date: Sun, 29 Jan 2023 21:04:37 +0100

* doc/poke.texi (strtok): Document the strtok API.
(String Functions): Add strtok to menu.
(Concept Index): Merge fn -> cp, as there's not enough use of the
function index to justify separate section currently.
* libpoke/std.pk (String_Tokenizer): New type.  Holds the persistent
state of the strtok-like operation.
(strtok): New function.  Returns a new String_Tokenizer.
(String_Tokenizer): New type.  Holds tokenization state.
* testsuite/poke.std/std-test.pk: Add strtok test.
---
 ChangeLog                      |  13 +++++
 doc/poke.texi                  |  95 ++++++++++++++++++++++++++++++
 libpoke/std.pk                 | 104 +++++++++++++++++++++++++++++++++
 testsuite/poke.std/std-test.pk |  42 +++++++++++++
 4 files changed, 254 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index 7f90ba64..ab6c47f0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+2023-01-29  Arsen Arsenović  <arsen@aarsen.me>
+
+       std.pk: Implement strtok
+       * doc/poke.texi (strtok): Document the strtok API.
+       (String Functions): Add strtok to menu.
+       (Concept Index): Merge fn -> cp, as there's not enough use of the
+       function index to justify separate section currently.
+       * libpoke/std.pk (String_Tokenizer): New type.  Holds the persistent
+       state of the strtok-like operation.
+       (strtok): New function.  Returns a new String_Tokenizer.
+       (String_Tokenizer): New type.  Holds tokenization state.
+       * testsuite/poke.std/std-test.pk: Add strtok test.
+
 2023-01-29  Arsen Arsenović  <arsen@aarsen.me>
 
        std.pk: Refactor atoi into strtoi
diff --git a/doc/poke.texi b/doc/poke.texi
index b1997f17..2ae005e2 100644
--- a/doc/poke.texi
+++ b/doc/poke.texi
@@ -15720,6 +15720,7 @@ work on strings:
 * rtrim::              Remove trailing characters.
 * strchr::             Locate a character in a string, from the beginning.
 * strrchr::            Locate a character in a string, from the end.
+* strtok::             String tokenization.
 @end menu
 
 @node ltrim
@@ -15776,6 +15777,97 @@ It returns the index of the last occurrence of the 
character @var{c}
 in the string @var{s}.  If the character is not found in the string,
 this function returns -1.
 
+@node strtok
+@subsection @code{strtok}
+@cindex @code{strtok}
+@cindex string, tokenizing
+
+@code{strtok} is a helper for tokenizing strings.  The synopsis of
+this API is:
+
+@example
+type String_Tokenizer =
+  struct
+  @{
+    uint<64> i;
+    string str;
+    computed uint<32> more;
+
+    method get_more = uint<32>: @{ @dots{} @}
+
+    method peek = char: @{ @dots{} @}
+    method pop = char: @{ @dots{} @}
+
+    method pop_number = (int<32> @var{base} = 10) int<64>: @{ @dots{} @}
+
+    method popdelim = (string @var{delimiters}) string: @{ @dots{} @}
+    method poprdelim = (string @var{delimiters}) string: @{ @dots{} @}
+  @}
+
+fun strtok = (string @var{a}) String_Tokenizer: @{ @dots{} @}
+@end example
+
+@deftypefun String_Tokenizer strtok (string @var{a})
+Creates a new tokenizer for the string @var{a}, initially on the zero
+position.
+@end deftypefun
+
+@cindex @code{String_Tokenizer}
+The members of the @code{String_Tokenizer} class are:
+
+@deftypeivar String_Tokenizer uint<64> i
+Offset to the next character to be tokenized, i.e. to the first
+character that has not already been consumed.
+@end deftypeivar
+
+@deftypeivar String_Tokenizer string str
+The string being tokenized.  This string is never modified.
+@end deftypeivar
+
+@deftypeivar String_Tokenizer uint<32> more
+A read-only computed property whose value is truthy if there are more
+characters to consume and falsey otherwise.
+@end deftypeivar
+
+@deftypemethod String_Tokenizer char peek ()
+Returns the first unread character of the string, but does not advance
+the @var{i} offset.
+
+Raises @code{E_out_of_bounds} if at the end of the string.
+@end deftypemethod
+
+@deftypemethod String_Tokenizer char pop ()
+Returns the first unread character of the string, and advances the
+tokenizer.
+
+Raises @code{E_out_of_bounds} if at the end of the string.
+@end deftypemethod
+
+@deftypemethod String_Tokenizer int<64> pop_number (int<32> @var{base} = 10)
+Parses the number at the current position in the given @var{base},
+returns it, and advances the tokenizer past it.
+@xref{strtoi}, for a list of supported bases.
+
+Raises @code{E_out_of_bounds} if at the end of the string, and
+@code{E_inval} if no number could be parsed at the current position.
+@end deftypemethod
+
+@deftypemethod String_Tokenizer string popdelim (string @var{delim})
+Returns the substring up to the first character also present in
+@var{delim} and consumes that delimiter.  If no delimiter is found,
+the rest of the string is returned and the tokenizer ends up at its end.
+Raises @code{E_inval} if @var{delim} is empty, and
+@code{E_out_of_bounds} if at the end of the string.
+@end deftypemethod
+
+@deftypemethod String_Tokenizer string poprdelim (string @var{delim})
+Returns the substring up to the last character also present in
+@var{delim} and consumes that delimiter.  If no delimiter is found,
+the rest of the string is returned and the tokenizer ends up at its end.
+Raises @code{E_inval} if @var{delim} is empty, and
+@code{E_out_of_bounds} if at the end of the string.
+@end deftypemethod
+
 @node Character Functions
 @section Character Functions
 The Poke standard library provides the following functions to deal
@@ -16263,6 +16355,9 @@ It returns an offset that is the result of aligning the 
given
 @node Concept Index
 @appendixsec Concept Index
 
+@c Merge the findex into the cpindex.
+@syncodeindex fn cp
+
 @printindex cp
 
 @node PVM Instruction Index
diff --git a/libpoke/std.pk b/libpoke/std.pk
index a9ed3305..45b27f90 100644
--- a/libpoke/std.pk
+++ b/libpoke/std.pk
@@ -546,3 +546,107 @@ fun isxdigit = (uint<8> c) int<32>:
 {
   return (c - '0' < 10UB) || ((c | 0x20UB) - 'a' < 6UB);
 }
+
+/* A helper object to tokenize a string: a cursor (I) over STR.  */
+
+type String_Tokenizer =
+  struct
+  {
+    uint<64> i;                 /* Offset of the next unread character.  */
+    string str;                 /* The string being tokenized.  */
+    computed uint<32> more;     /* Nonzero while I < STR length; see get_more.  */
+
+    method get_more = uint<32>: /* Getter for the computed field MORE.  */
+    {
+      return i < str'length;
+    }
+
+    method peek = uint<8>:      /* Return the character at I, leaving I alone.  */
+    {
+      if (!get_more ())
+        raise E_out_of_bounds;
+
+      return str[i];
+    }
+
+    method pop = uint<8>:       /* Return the character at I, then advance I.  */
+    {
+      if (!get_more ())
+        raise E_out_of_bounds;
+      return str[i++];
+    }
+
+    method pop_number = (int<32> base = 10) int<64>: /* Parse a number in BASE at I.  */
+    {
+      if (!get_more ())
+        raise E_out_of_bounds;
+
+      var res = strtoi (str, base, i);
+      if (res.off == i)         /* Offset did not move: zero-width parse.  */
+        raise E_inval;
+
+      i = res.off;
+      return res.val;
+    }
+
+    method popdelim = (string delimiters) string: /* Split at the first delimiter at or after I.  */
+    {
+      /* TODO(arsen): Replace with some set-based thing at some point.  */
+      if (delimiters'length == 0)
+        raise E_inval;
+
+      if (!get_more ())
+        raise E_out_of_bounds;
+
+      var j = i;
+      for (; j < str'length; j++) /* Scan forward from the cursor.  */
+        {
+          var c = str[j];
+          for (d in delimiters)
+            {
+              if (c != d)
+                continue;
+
+              var s = str[i:j]; /* Token excludes the delimiter.  */
+              i = j + 1;        /* Resume just past the delimiter.  */
+              return s;
+            }
+        }
+      var s = str[i:];          /* No delimiter found: take the remainder.  */
+      i = str'length;
+      return s;
+    }
+
+    method poprdelim = (string delimiters) string: /* Split at the last delimiter at or after I.  */
+    {
+      /* TODO(arsen): Replace with some set-based thing at some point.  */
+      if (delimiters'length == 0)
+        raise E_inval;
+
+      if (!get_more ())
+        raise E_out_of_bounds;
+
+      var j = str'length - 1;
+      for (; j >= i && j < str'length; j--) /* NOTE(review): j < length presumably guards wrap-around of j-- below 0; confirm.  */
+        {
+          var c = str[j];
+          for (d in delimiters)
+            {
+              if (c != d)
+                continue;
+
+              var s = str[i:j]; /* Token excludes the delimiter.  */
+              i = j + 1;        /* Resume just past the delimiter.  */
+              return s;
+            }
+        }
+      var s = str[i:];          /* No delimiter found: take the remainder.  */
+      i = str'length;
+      return s;
+    }
+  };
+
+fun strtok = (string a) String_Tokenizer: /* Return a new tokenizer over A.  */
+{
+  return String_Tokenizer { str = a, i = 0 }; /* Cursor starts at offset 0.  */
+}
diff --git a/testsuite/poke.std/std-test.pk b/testsuite/poke.std/std-test.pk
index 924767f5..5b8de358 100644
--- a/testsuite/poke.std/std-test.pk
+++ b/testsuite/poke.std/std-test.pk
@@ -382,6 +382,48 @@ var tests = [
         assert (x.val == 1);
       },
   },
+  PkTest {
+    name = "strtok",
+    func = lambda (string name) void:
+      {
+        var x = strtok ("abc123def-foo-bar-baz-xy"); /* Letters, digits, then dash-separated words.  */
+        try
+          {
+            /* Test whether we emit E_inval for zero-width number parses.  */
+            x.pop_number ();
+            assert (0, "unreachable reached!");
+          }
+        catch if E_inval
+          {
+            assert (1, "expects exception");
+          }
+        assert (x.pop () == 'a'); /* The failed parse above must not have moved the cursor.  */
+        assert (x.more);
+        assert (x.pop () == 'b');
+        assert (x.more);
+        assert (x.pop () == 'c');
+        assert (x.more);
+        assert (x.pop_number (8) == 0o123); /* Digits 123 read in base 8.  */
+        assert (x.more);
+        assert (x.popdelim ("-") == "def"); /* Stops at the first dash.  */
+        assert (x.more);
+        assert (x.peek () == 'f'); /* The dash itself was consumed.  */
+        assert (x.more);
+        assert (x.poprdelim ("-") == "foo-bar-baz"); /* Splits at the last dash.  */
+        assert (x.more);
+        assert (x.popdelim ("-") == "xy"); /* No dash left: the remainder is returned.  */
+        assert (!x.more);
+        try
+          {
+            x.pop (); /* Input is exhausted, so this must raise.  */
+            assert (0, "unreachable reached!");
+          }
+        catch if E_out_of_bounds
+          {
+            assert (1, "expects exception");
+          }
+      },
+  },
 ];
 
 exit (pktest_run (tests) ? 0 : 1);
-- 
2.39.1




reply via email to

[Prev in Thread] Current Thread [Next in Thread]