bug-datamash
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] vnlog support


From: Dima Kogan
Subject: Re: [PATCH] vnlog support
Date: Sat, 14 May 2022 13:18:30 -0700
User-agent: mu4e 1.6.10; emacs 29.0.50

Since we're talking about working on this again, and making a new
release, I'd like to ping this feature request. I exchanged a few emails
about it with Assaf right before he disappeared, and it sounded like he
was going to add this feature. I've no idea what, if anything, he wanted
to change about the patch.

vnlog support would make both projects much more useful. The original
mailing list post (quoted in full below) contains a demo and a patch.
The patch needs to be updated such that -v implies -W. If I can get an
ACK from whoever is intending to take over datamash, I can re-test the
patch, finalize things, add tests, and so on.




Dima Kogan <datamash@dima.secretsauce.net> writes:

> Hi.
>
> I maintain vnlog, a toolkit for manipulating tabular ascii data:
>
>   https://github.com/dkogan/vnlog
>
> The cmdline tools are largely thin frontends around awk and GNU
> coreutils. The capabilities are complementary with datamash, and it'd be
> nice if datamash supported vnlog's data format. It already does 99% of
> it, and I'm attaching a prototype patch (to the 1.4 stable release) that
> adds the rest. The vnlog format:
>
> - A whitespace-separated table of text
>
> - Lines beginning with # are comments
>
> - The first line that begins with a single # (not ## or #!) is a legend,
>   naming each column
>
> - Empty fields are reported as -
>
> As you can see, this is very close to what datamash does already. Would
> you be interested in adding this support to datamash? I imagine you would
> want to rework the patch somewhat, but it's not intrusive, so that
> shouldn't take much effort. Trivial demo:
>
>
>   $ (echo '## comment'; echo '# x y'; seq 5 | awk '{print $1, $1*$1}') |
>       ./datamash -v sum y mean x
>   # sum(y) mean(x)
>   55 3
>
>
> The patch handles almost everything, except the "whitespace-delimited"
> part. As far as I can tell there currently isn't a way to make datamash
> work with \s+ as a field separator: -t' ' treats a sequence of N spaces
> as N field separators. I can write a patch for that, if you're
> interested. It probably would be good to support that regardless, since
> that would match how awk does things.
>
> Thanks.
>
> [2. text/x-diff; datamash-initial-vnlog-support.patch]
> diff --git a/src/datamash.c b/src/datamash.c
> index 4219c26..e2dbf2f 100644
> --- a/src/datamash.c
> +++ b/src/datamash.c
> @@ -114,7 +114,7 @@ enum
>    UNDOC_RMDUP_TEST
>  };
>  
> -static char const short_options[] = "sfF:izg:t:HWR:C";
> +static char const short_options[] = "sfF:izg:t:HWR:Cv";
>  
>  static struct option const long_options[] =
>  {
> @@ -127,6 +127,7 @@ static struct option const long_options[] =
>    {"header-in", no_argument, NULL, INPUT_HEADER_OPTION},
>    {"header-out", no_argument, NULL, OUTPUT_HEADER_OPTION},
>    {"headers", no_argument, NULL, 'H'},
> +  {"vnlog", no_argument, NULL, 'v'},
>    {"full", no_argument, NULL, 'f'},
>    {"filler", required_argument, NULL, 'F'},
>    {"format", required_argument, NULL, CUSTOM_FORMAT_OPTION},
> @@ -433,6 +434,9 @@ print_input_line (const struct line_record_t* lb)
>  static void
>  print_column_headers ()
>  {
> +  if ( vnlog )
> +      printf ("# ");
> +
>    if (print_full_line)
>      {
>        /* Print the headers of all the input fields */
> @@ -514,7 +518,9 @@ process_input_header (FILE *stream)
>    struct line_record_t lr;
>  
>    line_record_init (&lr);
> -  if (line_record_fread (&lr, stream, eolchar, skip_comments))
> +
> +  if ( (!vnlog && line_record_fread (&lr, stream, eolchar, skip_comments )) 
> ||
> +       ( vnlog && line_record_fread_vnlog_prologue (&lr, stream, eolchar )) )
>      {
>        build_input_line_headers (&lr, true);
>        line_number++;
> @@ -971,7 +977,8 @@ remove_dups_in_file ()
>  
>    if (input_header)
>      {
> -      if (line_record_fread (thisline, input_stream, eolchar, skip_comments))
> +      if ( (!vnlog && line_record_fread (thisline, input_stream, eolchar, 
> skip_comments )) ||
> +           ( vnlog && line_record_fread_vnlog_prologue (thisline, 
> input_stream, eolchar )) )
>          {
>            line_number++;
>  
> @@ -1157,6 +1164,15 @@ int main (int argc, char* argv[])
>      {
>        switch (optc)
>          {
> +        case 'v':
> +          skip_comments        = true;
> +          input_header         = output_header = true;
> +          missing_field_filler = "-";
> +          in_tab               = ' ';
> +          out_tab              = ' ';
> +          vnlog                = true;
> +          break;
> +
>       case 'C':
>         skip_comments = true;
>         break;
> diff --git a/src/text-lines.c b/src/text-lines.c
> index dc9ea5f..c7aa2eb 100644
> --- a/src/text-lines.c
> +++ b/src/text-lines.c
> @@ -34,6 +34,7 @@
>  
>  #include "text-options.h"
>  #include "text-lines.h"
> +#include "die.h"
>  
>  void
>  line_record_init (struct line_record_t* lr)
> @@ -91,12 +92,19 @@ line_record_reserve_fields (struct line_record_t* lr, 
> const size_t n)
>  }
>  
>  static void
> -line_record_parse_fields (struct line_record_t *lr, int field_delim)
> +line_record_parse_fields (/* The buffer. May or may not be the one in the
> +                             following argument */
> +                          const struct linebuffer* lbuf,
> +
> +                          /* Used ONLY for the fields. The buffer is picked 
> up
> +                             from the above argument */
> +                          struct line_record_t *lr,
> +                          int field_delim)
>  {
>    size_t num_fields = 0;
>    size_t pos = 0;
> -  const size_t buflen = line_record_length (lr);
> -  const char* fptr = line_record_buffer (lr);
> +  const size_t buflen = lbuf->length;
> +  const char*  fptr   = lbuf->buffer;
>  
>    /* Move 'fptr' to point to the beginning of 'field' */
>    if (field_delim != TAB_WHITESPACE)
> @@ -157,33 +165,91 @@ line_record_parse_fields (struct line_record_t *lr, int 
> field_delim)
>  }
>  
>  
> -static bool
> -line_record_is_comment (const struct line_record_t* lr)
> +// returns 0 if not a comment, 1 if a single comment, 2 if a double comment
> +static int
> +line_comment_count (const struct line_record_t* lr)
>  {
>    const char* pch = line_record_buffer (lr);
>  
>    /* Skip white space at beginning of line */
>    size_t s = strspn (pch, " \t");
>    /* First non-whitespace character */
> -  char c = pch[s];
> -  return (c=='#' || c==';');
> +  const char* c = &pch[s];
> +  if (!(c[0]=='#' || c[0]==';'))
> +      // not any comment
> +      return 0;
> +  if(c[0] == '\0')
> +      return 1;
> +  if( c[0] == '#' && (c[1] == '#' || c[1] == '!') )
> +      return 2;
> +  if( c[0] == ';' && c[1] == ';')
> +      return 2;
> +  return 1;
>  }
>  
> -bool
> -line_record_fread (struct /* in/out */ line_record_t* lr,
> -                  FILE *stream, char delimiter, bool skip_comments)
> +static bool
> +_line_record_fread (struct /* in/out */ line_record_t* lr,
> +                    FILE *stream, char delimiter,
> +                    bool skip_single_comments,
> +                    bool vnlog_prologue)
>  {
> -  do {
> +  while(1) {
>      if (readlinebuffer_delim (&lr->lbuf, stream, delimiter) == 0)
>        return false;
>      linebuffer_nullify (&lr->lbuf);
> -  } while (skip_comments && line_record_is_comment (lr));
> +    int comment_count = line_comment_count (lr);
> +    if( skip_single_comments && comment_count>=1)
> +        continue;
> +    if( vnlog_prologue )
> +    {
> +        // I skip double-comments
> +        //
> +        // I read single-commented lines that have anything following the 
> single
> +        // comment character. And I strip out the comment character
> +        //
> +        // I barf on anything else. No data before the header allowed
> +        if( comment_count >= 2 )
> +            continue;
> +        if( comment_count == 1 )
> +        {
> +            // one comment. I need to strip the comment characters. Skip 
> leading
> +            // regex '^\s*#\s*'
> +            const char* pch = line_record_buffer (lr);
> +            size_t s = strspn (pch, " \t#");
> +            struct linebuffer lbuf = lr->lbuf;
> +            lbuf.buffer += s;
> +            lbuf.length -= s;
> +            if(lbuf.buffer[0] == '\0')
> +                // empty comment line. ignore.
> +                continue;
> +            line_record_parse_fields (&lbuf, lr, in_tab);
> +            return true;
> +        }
> +        // No comment. This is an illegal data line. Barf.
> +        die (EXIT_FAILURE, 0, _("invalid vnlog data: received data line 
> prior to the header: '%s'"),
> +             line_record_buffer (lr));
>  
> +    }
> +    break;
> +  }
>  
> -  line_record_parse_fields (lr, in_tab);
> +  line_record_parse_fields (&lr->lbuf, lr, in_tab);
>    return true;
>  }
>  
> +bool
> +line_record_fread (struct /* in/out */ line_record_t* lr,
> +                   FILE *stream, char delimiter, bool skip_comments)
> +{
> +    return _line_record_fread(lr, stream, delimiter, skip_comments, false);
> +}
> +bool
> +line_record_fread_vnlog_prologue (struct /* in/out */ line_record_t* lr,
> +                                  FILE *stream, char delimiter)
> +{
> +    return _line_record_fread(lr, stream, delimiter, false, true);
> +}
> +
>  void
>  line_record_free (struct line_record_t* lr)
>  {
> diff --git a/src/text-lines.h b/src/text-lines.h
> index e318293..9d98c10 100644
> --- a/src/text-lines.h
> +++ b/src/text-lines.h
> @@ -83,6 +83,9 @@ line_record_init (struct line_record_t* lr);
>  bool
>  line_record_fread (struct /* in/out */ line_record_t* lr,
>                     FILE *stream, char delimiter, bool skip_comments);
> +bool
> +line_record_fread_vnlog_prologue (struct /* in/out */ line_record_t* lr,
> +                                  FILE *stream, char delimiter);
>  
>  void
>  line_record_free (struct line_record_t* lr);
> diff --git a/src/text-options.c b/src/text-options.c
> index 3e811b7..a775db1 100644
> --- a/src/text-options.c
> +++ b/src/text-options.c
> @@ -68,6 +68,8 @@ char* missing_field_filler = "N/A";
>     followed by '#' or ';'. See line_record_is_comment().  */
>  bool skip_comments = false;
>  
> +bool vnlog = false;
> +
>  #define UCHAR_LIM (UCHAR_MAX + 1)
>  bool blanks[UCHAR_LIM];
>  
> diff --git a/src/text-options.h b/src/text-options.h
> index dc275cd..6cde4aa 100644
> --- a/src/text-options.h
> +++ b/src/text-options.h
> @@ -66,6 +66,8 @@ extern char* missing_field_filler;
>     followed by '#' or ';'. See line_record_is_comment().  */
>  extern bool skip_comments;
>  
> +extern bool vnlog;
> +
>  #define UCHAR_LIM (UCHAR_MAX + 1)
>  extern bool blanks[UCHAR_LIM];
>  




reply via email to

[Prev in Thread] Current Thread [Next in Thread]