gnuastro-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[gnuastro-commits] master 7b9950bc: Statistics: new option to calculate


From: Mohammad Akhlaghi
Subject: [gnuastro-commits] master 7b9950bc: Statistics: new option to calculate concentration of distribution
Date: Sat, 27 Apr 2024 21:47:50 -0400 (EDT)

branch: master
commit 7b9950bcf8d952044fa87010c7576e053a5557eb
Author: Mohammad Akhlaghi <mohammad@akhlaghi.org>
Commit: Mohammad Akhlaghi <mohammad@akhlaghi.org>

    Statistics: new option to calculate concentration of distribution
    
    Until now, there was no objective (independent of distribution's values)
    way to measure the "concentration" of the given distribution around the
    median. This is necessary in many contexts in addition to '--quantofmean'
    (to make sure that the median and mean are the same and thus that the
    median can be interpreted as the "center").
    
    With this commit, a new function has been added for this purpose. The main
    workhorse is in a newly added function in the library, which the Statistics
    program uses.
---
 NEWS                        |   9 ++++
 bin/statistics/args.h       |  14 ++++++
 bin/statistics/statistics.c |  15 ++++--
 bin/statistics/ui.c         |  14 ++++++
 bin/statistics/ui.h         |   1 +
 doc/gnuastro.texi           |  70 ++++++++++++++++++++++++++++
 lib/gnuastro/statistics.h   |  10 ++++
 lib/statistics.c            | 111 +++++++++++++++++++++++++++++++++++++++++---
 8 files changed, 234 insertions(+), 10 deletions(-)

diff --git a/NEWS b/NEWS
index 868c20a3..132424b7 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,12 @@ See the end of the file for license conditions.
       operands. This is useful in combination with operators that produce
       more than one output operand.
 
+*** Statistics
+
+  --concentration: measure the "concentration" of values in a distribution
+    around the median. See the book for the full description of this
+    option.
+
 *** astscript-fits-view
   --globalhdu: use the same HDU in any number of input files (with the
     short format of '-g'); similar to the same option in Arithmetic or
@@ -62,6 +68,9 @@ See the end of the file for license conditions.
     with very different RAM and/or CPU threads. See the minimal working
     example in the book for more.
 
+*** Library
+- gal_statistics_concentration: measure the concentration of values around
+  the median; see the book for the details.
 ** Removed features
 ** Changed features
 *** All programs
diff --git a/bin/statistics/args.h b/bin/statistics/args.h
index d2468274..2a00059d 100644
--- a/bin/statistics/args.h
+++ b/bin/statistics/args.h
@@ -462,6 +462,20 @@ struct argp_option program_options[] =
       GAL_OPTIONS_NOT_SET,
       ui_add_to_single_value
     },
+    {
+      "concentration",
+      UI_KEY_CONCENTRATION,
+      "FLT[,...]",
+      0,
+      "Uniform dist: 1.0; higher: concentrated.",
+      UI_GROUP_SINGLE_VALUE,
+      &p->singlevalue,
+      GAL_TYPE_FLOAT32,
+      GAL_OPTIONS_RANGE_GT_0_LT_1,
+      GAL_OPTIONS_NOT_MANDATORY,
+      GAL_OPTIONS_NOT_SET,
+      ui_add_to_single_value
+    },
 
 
 
diff --git a/bin/statistics/statistics.c b/bin/statistics/statistics.c
index 5f0b512f..a0c6cc02 100644
--- a/bin/statistics/statistics.c
+++ b/bin/statistics/statistics.c
@@ -212,6 +212,7 @@ statistics_print_one_row(struct statisticsparams *p)
       /* These will be calculated as printed. */
       case UI_KEY_QUANTILE:
       case UI_KEY_QUANTFUNC:
+      case UI_KEY_CONCENTRATION:
         break;
 
       /* The option isn't recognized. */
@@ -294,7 +295,13 @@ statistics_print_one_row(struct statisticsparams *p)
                                  GAL_STATISTICS_CLIP_OUTCOL_NUMBER_USED);
           mustfree=1; break;
 
-        /* Not previously calculated. */
+        /* Not previously calculated; because they can have multiple input
+           arguments. */
+        case UI_KEY_CONCENTRATION:
+          arg=statistics_read_check_args(p);
+          out=gal_statistics_concentration(p->sorted, arg, 1);
+          break;
+
         case UI_KEY_QUANTILE:
           mustfree=1;
           arg = statistics_read_check_args(p);
@@ -321,9 +328,9 @@ statistics_print_one_row(struct statisticsparams *p)
 
         /* The option isn't recognized. */
         default:
-          error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s so we "
-                "can address the problem. Operation code %d not recognized",
-                __func__, PACKAGE_BUGREPORT, tmp->v);
+          error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at %s so "
+                "we can address the problem. Operation code %d not "
+                "recognized", __func__, PACKAGE_BUGREPORT, tmp->v);
         }
 
       /* Print the number. Note that we don't want any extra white space
diff --git a/bin/statistics/ui.c b/bin/statistics/ui.c
index ac07a1b0..ca63694e 100644
--- a/bin/statistics/ui.c
+++ b/bin/statistics/ui.c
@@ -266,6 +266,19 @@ ui_add_to_single_value(struct argp_option *option, char 
*arg,
       d=inputs->array;
       switch(option->key)
         {
+        case UI_KEY_CONCENTRATION:
+          for(i=0;i<inputs->size;++i)
+            {
+              if(d[i]>0.5f || d[i]<=0.0f)
+                error(EXIT_FAILURE, 0, "%g (value given to "
+                      "'--concentration': quantile range around median) "
+                      "should be larger than 0 and smaller than 0.5",
+                      d[i]);
+              gal_list_f64_add(&p->tp_args, d[i]);
+              gal_list_i32_add(&p->singlevalue, option->key);
+            }
+          break;
+
         case UI_KEY_QUANTILE:
         case UI_KEY_QUANTFUNC:
           /* For the quantile and the quantile function, its possible to
@@ -825,6 +838,7 @@ ui_make_sorted_if_necessary(struct statisticsparams *p)
       case UI_KEY_SIGCLIPNUMBER:
       case UI_KEY_MADCLIPMEDIAN:
       case UI_KEY_SIGCLIPMEDIAN:
+      case UI_KEY_CONCENTRATION:
         is_necessary=1;
         break;
       }
diff --git a/bin/statistics/ui.h b/bin/statistics/ui.h
index 06d7a7b9..2e40c041 100644
--- a/bin/statistics/ui.h
+++ b/bin/statistics/ui.h
@@ -128,6 +128,7 @@ enum option_keys_enum
   UI_KEY_FITESTIMATEHDU,
   UI_KEY_FITESTIMATECOL,
   UI_KEY_FITROBUST,
+  UI_KEY_CONCENTRATION,
 };
 
 
diff --git a/doc/gnuastro.texi b/doc/gnuastro.texi
index 2d5a71bd..c794e9a6 100644
--- a/doc/gnuastro.texi
+++ b/doc/gnuastro.texi
@@ -27336,6 +27336,69 @@ Calculate the desired statistic after applying median 
absolute deviation (MAD) c
 MAD-clipping configuration is done with the @option{--mclipparams} option.
 
 This option behaves similarly to @option{--sigclip-*} options, read their 
description for usage examples.
+
+@item --concentration=FLT[,FLT[,...]]
+Return the ``concentration'' around the median (see rest of this description 
for the definition); the input value(s) are the quantile width(s) where it is 
measured.
+
+For a uniform distribution, the output of this operation will be approximately 
@mymath{1.0}.
+With a higher density of values around the median, the value will be larger 
for a Gaussian distribution, and even larger for more concentrated 
distributions (than a Gaussian).
+
+This is the algorithm used to measure this value:
+@enumerate
+@item
+Sort the input dataset and remove all blank values.
+If there is one non-blank value or less, then return NaN.
+@item
+The minimum and maximum are respectively selected to be the second and 
second-to-last elements in the sorted array.
+The first and last elements are not selected as minimum and maximum because 
they are affected too strongly by scatter.
+@item
+Subtract each element from the minimum, and divide it by the difference 
between the minimum and maximum.
+After this operation, the input's values@footnote{Technically, the second 
sorted value will be 0 and the second-to-last value will be 1.} will be between 
0 and 1.
+
+This scaling does not change the order of the input elements; instead, each 
element's value now shows its relation to the range of the whole distribution's 
values (the minimum and maximum values above).
+@item
+Calculate the scaled values corresponding to quantiles that are defined by the 
width above.
+For example, if the given width (value to this option) is 0.2, the quantiles 
of @mymath{0.5-(0.2/2)=0.4} and @mymath{0.5+(0.2/2)=0.5} will be measured.
+@item
+The width is divided by the difference between the quantiles and returned as 
the concentration.
+@end enumerate
+
+In a uniform distribution, the scaling step will convert each input into its 
quantile: the spacing between scaled values will be uniform.
+As a result, the difference between the quantiles measured around the median 
will be equal to the input width and the result will be approximately 
@mymath{1.0}.
+However, if the distribution is concentrated around the median, the spacing 
between the scaled values will be much less around the median and the quantile 
difference will be less than the width.
+Therefore, when we divide the width by the quantile difference, the value will 
be larger than one.
+
+The example commands below create two randomly distributed ``noisy'' images, 
one with a Gaussian distribution and one with a uniform distribution.
+We will then run this option on both to see the different 
concentrations@footnote{The values you get will be slightly different because 
of the different random seeeds.
+To get a reproducible result, see @ref{Generating random numbers}.}.
+See @ref{Generating histograms and cumulative frequency plots} on how you can 
generate the histogram of these two images on the command-line to visualize the 
distribution.
+
+@example
+$ astarithmetic 1000 1000 2 makenew 10 mknoise-sigma \
+                --output=gaussian.fits
+
+$ astarithmetic 1000 1000 2 makenew 10 mknoise-uniform \
+                --output=uniform.fits
+
+$ aststatistics gaussian.fits --concentration=0.25
+3.71347573489440e+00
+
+$ aststatistics uniform.fits  --concentration=0.25
+9.99988794452348e-01
+@end example
+
+Note that this option is primarily designed for symmetric distributions, not 
skewed ones (where the mode and median will be distant).
+Here, we define the ``center'' in ``concentration'' as the median, not the 
mode.
+To check if the distribution is symmetric (that the mode and median are 
similar), you can use the @option{--quantofmean} option described above.
+Recall that you can call all the options in this section in one call to the 
Statistics program like below:
+
+@example
+$ aststatistics gaussian.fits \
+                --quantofmean --concentration=0.25
+5.00260500260500e-01    3.71347573489440e+00
+@end example
+
+From the quantile-of-mean value of approximately 0.5, we see that the 
distribution is symmetric and from the concentration, we see that it is not a 
uniform one.
 @end table
 
 @node Generating histograms and cumulative frequency plots, Fitting options, 
Single value measurements, Invoking aststatistics
@@ -43074,6 +43137,13 @@ If it is @code{NULL}, then the histogram will be 
calculated internally and freed
 When a histogram is given and it is normalized, the CFP will also be 
normalized (even if the normalized flag is not set here): note that a 
normalized CFP's maximum value is 1.
 @end deftypefun
 
+@deftypefun {gal_data_t *} gal_statistics_concentration (gal_data_t 
@code{*input}, double @code{width}, int @code{inplace})
+Return the concentration around the median for the input distribution.
+For more on the algorithm and @code{width}, see the description of 
@option{--concentration} in @ref{Single value measurements}.
+
+If @code{inplace!=0}, then this function will use the actual input data's 
allocated space and not internally allocate a new dataset (which can have 
memory and CPU benefits); but will alter (sort and remove blank elements from) 
your input dataset.
+@end deftypefun
+
 
 @deftypefun {gal_data_t *} gal_statistics_clip_sigma (gal_data_t 
@code{*input}, float @code{multip}, float @code{param}, float 
@code{extrastats}, int @code{inplace}, int @code{quiet})
 Apply @mymath{\sigma}-clipping on a given dataset and return a dataset that 
contains the results.
diff --git a/lib/gnuastro/statistics.h b/lib/gnuastro/statistics.h
index d65da442..6fd84267 100644
--- a/lib/gnuastro/statistics.h
+++ b/lib/gnuastro/statistics.h
@@ -199,6 +199,16 @@ gal_statistics_cfp(gal_data_t *data, gal_data_t *bins, int 
normalize);
 
 
 
+/****************************************************************
+ *****************     Distribution shape    ********************
+ ****************************************************************/
+gal_data_t *
+gal_statistics_concentration(gal_data_t *input, double width,
+                             int inplace);
+
+
+
+
 /****************************************************************
  *****************         Outliers          ********************
  ****************************************************************/
diff --git a/lib/statistics.c b/lib/statistics.c
index 73ac061d..489b4c2c 100644
--- a/lib/statistics.c
+++ b/lib/statistics.c
@@ -2325,10 +2325,10 @@ gal_statistics_cfp(gal_data_t *input, gal_data_t *bins, 
int normalize)
                       ( hist->type==GAL_TYPE_FLOAT32
                         ? "frac" : "count" ),
                       ( hist->type==GAL_TYPE_FLOAT32
-                        ? "Fraction of data elements from the start to this "
-                        "bin (inclusive)."
-                        : "Number of data elements from the start to this "
-                        "bin (inclusive).") );
+                        ? "Fraction of data elements from the start to "
+                        "this bin (inclusive)."
+                        : "Number of data elements from the start to "
+                        "this bin (inclusive).") );
 
 
   /* Fill in the cumulative frequency plot. */
@@ -2393,6 +2393,102 @@ gal_statistics_cfp(gal_data_t *input, gal_data_t *bins, 
int normalize)
 
 
 
+/****************************************************************
+ *****************     Distribution shape    ********************
+ ****************************************************************/
+#define STATISTICS_CONCENTRATION_OP(TYPE) {                           \
+    size_t i;                                                           \
+    TYPE *a=nbs->array;                  /* The raw min and max */      \
+    TYPE min=a[1], max=a[nbs->size-2];   /* have too much scatter. */   \
+                                                                        \
+    /* Put the values in a range of 0 to 1 and get the values on the */ \
+    /* desired quantiles. */                                            \
+    for(i=0;i<nbs->size;++i) a[i]=(a[i]-min)/(max-min);                 \
+    vhigh=a[ihigh];                                                     \
+    vlow=a[ilow];                                                       \
+                                                                        \
+    /* If this operation was done in-place, undo the scaling because */ \
+    /* the caller may need to do other operations on the dataset. */    \
+    if(nbs!=input)                                                      \
+      { for(i=0;i<nbs->size;++i) a[i]=(a[i]-min)/(max-min); }           \
+  }
+
+gal_data_t *
+gal_statistics_concentration(gal_data_t *input, double q_width,
+                             int inplace)
+{
+  double vlow, vhigh;
+  gal_data_t *out, *nbs;
+  size_t one=1, ilow, ihigh;
+
+  /* Allocate the output. */
+  out=gal_data_alloc(NULL, GAL_TYPE_FLOAT64, 1, &one, NULL, 0, -1, 1,
+                     NULL, NULL, NULL);
+
+  /* Remove the blanks and sort the input; then convert the data to the
+     desired floating point precision. In case there are no non-blank
+     elements, return with a NaN. */
+  nbs=gal_statistics_no_blank_sorted(input, inplace);
+  if(nbs==NULL || nbs->size<=1)
+    { ((double *)(out->array))[0]=NAN; return out; }
+
+  /* If the input is not floating point, we cannot do the operation
+     in-place because we will be changing all the values into a range of
+     0.0 to 1.0. Integers are converted to 32-bit floats because by
+     definition, we are dealing with large quantile differences so even if
+     32-bit floats cannot fully preserve the integer differences, it should
+     not make any statistical significance, but it makes a large difference
+     in RAM and CPU usage.*/
+  if(input->type!=GAL_TYPE_FLOAT32 || input->type!=GAL_TYPE_FLOAT64)
+    {
+      if(nbs==input) /* Was in-place. */
+        nbs=gal_data_copy_to_new_type(nbs, GAL_TYPE_FLOAT32);
+      else           /* Not in-place: free the old 'nbs'. */
+        nbs=gal_data_copy_to_new_type_free(nbs, GAL_TYPE_FLOAT32);
+    }
+
+  /* Get the index of the desired quantile indexs. */
+  ilow=gal_statistics_quantile_index(nbs->size, 0.5-(q_width/2));
+  ihigh=gal_statistics_quantile_index(nbs->size, 0.5+(q_width/2));
+
+  /* Find the values at the low and high quantiles and write the output
+     value. */
+  switch(nbs->type)
+    {
+    case GAL_TYPE_FLOAT32: STATISTICS_CONCENTRATION_OP(float)  break;
+    case GAL_TYPE_FLOAT64: STATISTICS_CONCENTRATION_OP(double) break;
+    default:
+      error(EXIT_FAILURE, 0, "%s: a bug! Please contact us at '%s' to "
+            "fix this problem. 'nbs->type' of '%s' is not expected "
+            "at this point of the function", __func__,
+            PACKAGE_BUGREPORT, gal_type_name(nbs->type, 1));
+    }
+  ((double *)(out->array))[0]=q_width/(vhigh-vlow);
+
+  /* Clean up and return the output. */
+  if(nbs!=input) gal_data_free(nbs);
+  return out;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 /****************************************************************
  *****************         Outliers          ********************
  ****************************************************************/
@@ -2628,9 +2724,12 @@ statistics_clip(gal_data_t *input, float multip, float 
param,
       while(num<maxnum && size)
         {
           /* 'start' and 'size' will be different in the next round
-             (updated within 'CLIPALL'). */
+             (updated within 'CLIPALL'). We are also setting 'dsize[0]'
+             because the 'nbs' dataset is one dimensional and for future
+             steps (like writing values in a table); dsize[0] is
+             important.*/
           nbs->array = start;
-          nbs->size = oldsize = size;
+          nbs->dsize[0] = nbs->size = oldsize = size;
 
           /* For a detailed check, just correct the type).
           if(!quiet)



reply via email to

[Prev in Thread] Current Thread [Next in Thread]