[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 3/4] std.pk: Implement strtok
From: |
Jose E. Marchesi |
Subject: |
Re: [PATCH 3/4] std.pk: Implement strtok |
Date: |
Sun, 29 Jan 2023 17:51:03 +0100 |
User-agent: |
Gnus/5.13 (Gnus v5.13) |
Hi Arsen.
Thanks for the patch.
> diff --git a/doc/poke.texi b/doc/poke.texi
> index 54a369c8..ff498220 100644
> --- a/doc/poke.texi
> +++ b/doc/poke.texi
> @@ -15720,6 +15720,7 @@ work on strings:
> * rtrim:: Remove trailing characters.
> * strchr:: Locate a character in a string, from the beginning.
> * strrchr:: Locate a character in a string, from the end.
> +* strtok:: String tokenization.
> @end menu
Again the weird indentation in the menu.
> @node ltrim
> @@ -15776,6 +15777,97 @@ It returns the index of the last occurrence of the
> character @var{c}
> in the string @var{s}. If the character is not found in the string,
> this function returns -1.
>
> +@node strtok
> +@subsection @code{strtok}
> +@cindex @code{strtok}
> +@cindex string, tokenizing
> +
> +@code{strtok} is a helper for tokenizing strings. The synopsis of
> +this API is:
> +
> +@example
> +type Tokenizer =
> + struct
> + @{
> + uint<64> i;
> + string str;
> + computed uint<32> more;
> +
> + method get_more = uint<32>: @{ @dots{} @}
> +
> + method peek = char: @{ @dots{} @}
> + method pop = char: @{ @dots{} @}
> +
> + method pop_number = (int<32> @var{base} = 10) int<64>: @{ @dots{} @}
> +
> + method popdelim = (string @var{delimiters}) string: @{ @dots{} @}
> + method poprdelim = (string @var{delimiters}) string: @{ @dots{} @}
> + @}
> +
> +fun strtok = (string @var{a}) Tokenizer: @{ @dots{} @}
> +@end example
What about calling the type String_Tokenizer instead of Tokenizer?  It is
less generic.
> +@deftypefun Tokenizer strtok (string @var{a})
> +Creates a new tokenizer for the string @var{a}, initially on the zero
> +position.
> +@end deftypefun
> +
> +@cindex @code{Tokenizer}
> +The members of the @code{Tokenizer} class are:
> +
> +@deftypeivar Tokenizer uint<64> i
> +Offset to the next character to be tokenized, i.e. to the first
> +character that has not already been consumed.
> +@end deftypeivar
> +
> +@deftypeivar Tokenizer string str
> +The string being tokenized. This string is never modified.
> +@end deftypeivar
> +
> +@deftypeivar Tokenizer uint<32> more
> +A read-only computed property whose value is truthy if there are more
> +characters and falsey otherwise.
> +@end deftypeivar
> +
> +@deftypemethod Tokenizer char peek ()
> +Returns the first unread character of the string, but does not advance
> +the @var{i} offset.
> +
> +Raises @code{E_out_of_bounds} if at the end of the string.
> +@end deftypemethod
> +
> +@deftypemethod Tokenizer char pop ()
> +Returns the first unread character of the string, and advances the
> +tokenizer.
> +
> +Raises @code{E_out_of_bounds} if at the end of the string.
> +@end deftypemethod
> +
> +@deftypemethod Tokenizer int<64> pop_number (int<32> @var{base} = 10)
> +Returns the number found at the current position, parsed in the given
> +@var{base}, and advances the tokenizer past it. The bases that are
> +supported are the same as for @code{strtoi}.
> +@xref{strtoi} for a list of supported bases.
> +
> +Raises @code{E_out_of_bounds} if at the end of the string.
> +@end deftypemethod
> +
> +@deftypemethod Tokenizer string popdelim (string @var{delim})
> +Returns the substring up to the first character also present in the
> +string @var{delim}. Advances the tokenizer to after the delimiter
> +character (i.e. it consumes the delimiter character).
> +
> +Raises @code{E_out_of_bounds} if at the end of the string.
> +@end deftypemethod
> +
> +@deftypemethod Tokenizer string poprdelim (string @var{delim})
> +Returns the substring up to the last character also present in the
> +string @var{delim}. Advances the tokenizer to after the delimiter
> +character (i.e. it consumes the delimiter character).
> +
> +Raises @code{E_out_of_bounds} if at the end of the string.
> +@end deftypemethod
> +
> @node Character Functions
> @section Character Functions
> The Poke standard library provides the following functions to deal
> @@ -16263,6 +16355,9 @@ It returns an offset that is the result of aligning
> the given
> @node Concept Index
> @appendixsec Concept Index
>
> +@c Merge the findex into the cpindex.
> +@syncodeindex fn cp
> +
> @printindex cp
>
> @node PVM Instruction Index
> diff --git a/libpoke/std.pk b/libpoke/std.pk
> index a9ed3305..02776052 100644
> --- a/libpoke/std.pk
> +++ b/libpoke/std.pk
> @@ -546,3 +546,107 @@ fun isxdigit = (uint<8> c) int<32>:
> {
> return (c - '0' < 10UB) || ((c | 0x20UB) - 'a' < 6UB);
> }
> +
> +/* A helper object to tokenize a string. */
> +
> +type Tokenizer =
> + struct
> + {
> + uint<64> i;
> + string str;
> + computed uint<32> more;
> +
> + method get_more = uint<32>:
> + {
> + return i < str'length;
> + }
> +
> + method peek = uint<8>:
> + {
> + if (!get_more ())
> + raise E_out_of_bounds;
> +
> + return str[i];
> + }
> +
> + method pop = uint<8>:
> + {
> + if (!get_more ())
> + raise E_out_of_bounds;
> + return str[i++];
> + }
> +
> + method pop_number = (int<32> base = 10) int<64>:
> + {
> + if (!get_more ())
> + raise E_out_of_bounds;
> +
> + var res = strtoi (str, base, i);
> + if (res.off == i)
> + raise E_inval;
> +
> + i = res.off;
> + return res.val;
> + }
> +
> + method popdelim = (string delimiters) string:
> + {
> + /* TODO(arsen): Replace with some set-based thing at some point. */
> + if (delimiters'length == 0)
> + raise E_inval;
> +
> + if (!get_more ())
> + raise E_out_of_bounds;
> +
> + var j = i;
> + for (; j < str'length; j++)
> + {
> + var c = str[j];
> + for (d in delimiters)
> + {
> + if (c != d)
> + continue;
> +
> + var s = str[i:j];
> + i = j + 1;
> + return s;
> + }
> + }
> + var s = str[i:];
> + i = str'length;
> + return s;
> + }
> +
> + method poprdelim = (string delimiters) string:
> + {
> + /* TODO(arsen): Replace with some set-based thing at some point. */
> + if (delimiters'length == 0)
> + raise E_inval;
> +
> + if (!get_more ())
> + raise E_out_of_bounds;
> +
> + var j = str'length - 1;
> + for (; j >= i && j < str'length; j--)
> + {
> + var c = str[j];
> + for (d in delimiters)
> + {
> + if (c != d)
> + continue;
> +
> + var s = str[i:j];
> + i = j + 1;
> + return s;
> + }
> + }
> + var s = str[i:];
> + i = str'length;
> + return s;
> + }
> + };
> +
> +fun strtok = (string a) Tokenizer:
> +{
> + return Tokenizer { str = a, i = 0 };
> +}
> diff --git a/testsuite/poke.std/std-test.pk b/testsuite/poke.std/std-test.pk
> index 924767f5..5b8de358 100644
> --- a/testsuite/poke.std/std-test.pk
> +++ b/testsuite/poke.std/std-test.pk
> @@ -382,6 +382,48 @@ var tests = [
> assert (x.val == 1);
> },
> },
> + PkTest {
> + name = "strtok",
> + func = lambda (string name) void:
> + {
> + var x = strtok ("abc123def-foo-bar-baz-xy");
> + try
> + {
> + /* Test whether we emit E_inval for zero-width number parses. */
> + x.pop_number ();
> + assert (0, "unreachable reached!");
> + }
> + catch if E_inval
> + {
> + assert (1, "expects exception");
> + }
> + assert (x.pop () == 'a');
> + assert (x.more);
> + assert (x.pop () == 'b');
> + assert (x.more);
> + assert (x.pop () == 'c');
> + assert (x.more);
> + assert (x.pop_number (8) == 0o123);
> + assert (x.more);
> + assert (x.popdelim ("-") == "def");
> + assert (x.more);
> + assert (x.peek () == 'f');
> + assert (x.more);
> + assert (x.poprdelim ("-") == "foo-bar-baz");
> + assert (x.more);
> + assert (x.popdelim ("-") == "xy");
> + assert (!x.more);
> + try
> + {
> + x.pop ();
> + assert (0, "unreachable reached!");
> + }
> + catch if E_out_of_bounds
> + {
> + assert (1, "expects exception");
> + }
> + },
> + },
> ];
>
> exit (pktest_run (tests) ? 0 : 1);