bug-recutils
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: Adding an aggregate for variance


From: John Darrington
Subject: Re: Adding an aggregate for variance
Date: Tue, 13 Oct 2020 20:38:31 +0200
User-agent: Mutt/1.10.1 (2018-07-13)

Much as I think this change is of a generally good quality, I wonder
if it really belongs in recutils.

Recutils is a database, and not a statistical analysis tool.  It could
be the start of a slippery slope ... what would come next  ... ?
Sample Variance? Sample Std. Dev?  Population Std. Dev?  Skewness?
Kurtosis?  Covariance?  Correlation Coefficients? Chi-square  ....

There are other GNU tools which do these calculations.  Rather than
pretending that recutils is a statistical analysis tool, I think it
would be a better idea to implement easier ways to interface recutils
to the tools which are specifically designed for this job.


Just my $0.02

J'



On Tue, Oct 13, 2020 at 10:01:54AM -0700, Frank Pursel wrote:
     Addressing some of the issues from the first try.  I hope this
     is better.
     
     Regards,
     Frank

     diff --git a/recutils-1.8/ChangeLog b/recutils-1.8/ChangeLog
     index d234bae..f3b8414 100644
     --- a/recutils-1.8/ChangeLog
     +++ b/recutils-1.8/ChangeLog
     @@ -1,3 +1,10 @@
     +2020-10-13  Frank Pursel <purself@yahoo.com>
     +
     +  * src/rec-aggregate.c: Added Var aggregate functions,
     +  rec_aggregate_std_var and rec_aggregate_std_var_record.
     +  * torture/utils/recsel.sh: Added test cases for the Var aggregate.
     +  * doc/recutils.texi: Basic documentation for same.
     +  
      2019-01-03  Jose E. Marchesi  <jose.marchesi@oracle.com>
      
        * configure.ac: Bump version to 1.8.
     diff --git a/recutils-1.8/doc/recutils.texi 
b/recutils-1.8/doc/recutils.texi
     index 38877a3..3762fb8 100644
     --- a/recutils-1.8/doc/recutils.texi
     +++ b/recutils-1.8/doc/recutils.texi
     @@ -3527,6 +3527,8 @@ The supported aggregate functions are the following:
      Counts the number of occurrences of a field.
      @item Avg(FIELD)
      Calculates the average (mean) of the numerical values of a field.
     +@item Var(FIELD)
     +Calculates the population variance of the numerical values of a field.
      @item Sum(FIELD)
      Calculates the sum of the numerical values of a field.
      @item Min(FIELD)
     diff --git a/recutils-1.8/src/rec-aggregate.c 
b/recutils-1.8/src/rec-aggregate.c
     index e28f9d8..5194cc3 100644
     --- a/recutils-1.8/src/rec-aggregate.c
     +++ b/recutils-1.8/src/rec-aggregate.c
     @@ -52,6 +52,12 @@ struct rec_aggregate_reg_s
        size_t num_functions;
      };
      
     +struct rec_aggregate_reg_var_s
     +{
     +  int n;
     +  double values[10000];
     +};
     +  
      /* Static functions defined in this file.  */
      
      static char *rec_aggregate_std_count (rec_rset_t rset,
     @@ -64,6 +70,12 @@ static char *rec_aggregate_std_avg (rec_rset_t rset,
      static double rec_aggregate_std_avg_record (rec_record_t record,
                                                  const char *field_name);
      
     +static char *rec_aggregate_std_var (rec_rset_t rset,
     +                                    rec_record_t record,
     +                                    const char *field_name);
     +static struct rec_aggregate_reg_var_s rec_aggregate_std_var_record 
(rec_record_t record,
     +                                                       const char 
*field_name);
     +
      static char *rec_aggregate_std_sum (rec_rset_t rset,
                                          rec_record_t record,
                                          const char *field_name);
     @@ -96,11 +108,12 @@ struct rec_aggregate_descriptor_s
        rec_aggregate_t func;
      };
      
     -#define NUM_STD_AGGREGATES 5
     +#define NUM_STD_AGGREGATES 6
      
      static struct rec_aggregate_descriptor_s std_aggregates[] =
        {{"count", &rec_aggregate_std_count},
         {"avg",   &rec_aggregate_std_avg},
     +   {"var",   &rec_aggregate_std_var},
         {"sum",   &rec_aggregate_std_sum},
         {"min",   &rec_aggregate_std_min},
         {"max",   &rec_aggregate_std_max}};
     @@ -305,6 +318,102 @@ rec_aggregate_std_avg_record (rec_record_t record,
        return avg;
      }
      
     +static char *
     +rec_aggregate_std_var (rec_rset_t rset,
     +                       rec_record_t record,
     +                       const char *field_name)
     +{
     +  char *result = NULL;
     +  double var = 0;
     +  struct rec_aggregate_reg_var_s vals;
     +  struct rec_aggregate_reg_var_s mval;
     +  mval.n = 0;
     +  
     +  if (record)
     +    {
     +      vals = rec_aggregate_std_var_record (record, field_name);
     +      if (vals.n < 2)
     +  {
     +    var = 0;
     +  }
     +      else
     +  {
     +    double avg = 0;
     +    for (int i=0; i < vals.n; i++) { avg += vals.values[i]; }
     +    avg = avg / vals.n;
     +    for (int i=0; i < vals.n; i++)
     +      {
     +        var += ((vals.values[i] - avg) * (vals.values[i] - avg));
     +      }
     +    var = var / vals.n;
     +  }
     +    }
     +  else if (rset)
     +    {
     +      int num_records = 0;
     +      rec_record_t rec = NULL;
     +      rec_mset_iterator_t iter = rec_mset_iterator (rec_rset_mset (rset));
     +
     +      while (rec_mset_iterator_next (&iter, MSET_RECORD, (void *) &rec, 
NULL))
     +        {
     +    
     +    vals = rec_aggregate_std_var_record (rec, field_name);
     +    for (int i=vals.n; i >= 0; i--)
     +      {
     +        mval.values[mval.n + i] = vals.values[i];
     +      }
     +    mval.n += vals.n;
     +
     +        }
     +      rec_mset_iterator_free (&iter);
     +
     +      if (mval.n > 1) {
     +  double avg = 0;
     +  for (int i=0; i<mval.n; i++) { avg += mval.values[i]; }
     +  avg = avg / mval.n;
     +  for (int i=0; i<mval.n; i++) {
     +    var += (mval.values[i] - avg) * (mval.values[i] - avg);
     +  }
     +  var = var / mval.n;
     +      }
     +      else
     +  var = 0;
     +    }
     +  /* Return the average as a string.  Note that if NULL is returned it
     +     will be returned by this function below to signal the
     +     end-of-memory condition.  */
     +  asprintf (&result, "%g", var);
     +
     +  return result;
     + 
     +}
     +
     +static struct rec_aggregate_reg_var_s
     +rec_aggregate_std_var_record (rec_record_t record,
     +                              const char *field_name)
     +{
     +  struct rec_aggregate_reg_var_s part_var;
     +  
     +  rec_field_t field;
     +  int num_fields = 0;
     +  rec_mset_iterator_t iter = rec_mset_iterator (rec_record_mset (record));
     +
     +  while (rec_mset_iterator_next (&iter, MSET_FIELD, (void *) &field, 
NULL))
     +    {
     +      double field_value_double = 0;
     +      const char *field_value = rec_field_value (field);
     +
     +      if (rec_field_name_equal_p (rec_field_name (field), field_name)
     +          && rec_atod (field_value, &field_value_double))
     +        {
     +    part_var.values[part_var.n++] = field_value_double;
     +        }
     +    }
     +  rec_mset_iterator_free (&iter);
     +
     +  return part_var;
     +}
     +
      #define REC_AGGREGATE_ACCUM_FUNC(NAME, OP, INIT_VAL)                    \
        static char *                                                         \
        rec_aggregate_std_##NAME (rec_rset_t rset,                            \
     diff --git a/recutils-1.8/torture/utils/recsel.sh 
b/recutils-1.8/torture/utils/recsel.sh
     index 2bedc18..2a9c720 100755
     --- a/recutils-1.8/torture/utils/recsel.sh
     +++ b/recutils-1.8/torture/utils/recsel.sh
     @@ -1579,6 +1579,13 @@ test_tool recsel-aggregate-avg-overall ok \
      '39
      '
      
     +test_tool recsel-aggregate-var-overall ok \
     +    recsel \
     +    '-P "Var(Cost)"' \
     +    sales \
     +'1133.6
     +'
     +
      test_tool recsel-aggregate-avg-grouped ok \
                recsel \
                '-p "Item,Avg(Cost)" -G Item' \
     @@ -1596,6 +1603,23 @@ Item: D
      Avg_Cost: 100
      '
      
     +test_tool recsel-aggregate-avg-grouped ok \
     +    recsel \
     +    '-p "Item,Var(Cost)" -G Item' \
     +    sales \
     +'Item: A
     +Var_Cost: 42.25
     +
     +Item: B
     +Var_Cost: 0
     +
     +Item: C
     +Var_Cost: 0
     +
     +Item: D
     +Var_Cost: 0
     +'
     +
      test_tool recsel-aggregate-sum-overall ok \
                recsel \
                '-P "Sum(Cost)"' \




reply via email to

[Prev in Thread] Current Thread [Next in Thread]