pspp-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [patch 05/19] sparse array of cases


From: John Darrington
Subject: Re: [patch 05/19] sparse array of cases
Date: Thu, 7 Jun 2007 08:15:34 +0800
User-agent: Mutt/1.5.13 (2006-08-11)

No problems here.

On Tue, Jun 05, 2007 at 11:27:32PM -0700, address@hidden wrote:
     We have a sparse_array data structure that can easily give us a sparse
     array of cases.  The sparse_cases data structure augments this with
     the ability to dump cases to disk if we get too many cases in memory.
     
     sparse_cases is used by the datasheet code.
     
     Index: merge/src/data/automake.mk
     ===================================================================
     --- merge.orig/src/data/automake.mk        2007-06-03 17:42:16.000000000 
-0700
     +++ merge/src/data/automake.mk     2007-06-03 17:42:17.000000000 -0700
     @@ -61,6 +61,8 @@
        src/data/scratch-writer.h \
        src/data/settings.c \
        src/data/settings.h \
     +  src/data/sparse-cases.c \
     +  src/data/sparse-cases.h \
        src/data/storage-stream.c \
        src/data/storage-stream.h \
        src/data/sys-file-private.c \
     Index: merge/src/data/sparse-cases.c
     ===================================================================
     --- /dev/null      1970-01-01 00:00:00.000000000 +0000
     +++ merge/src/data/sparse-cases.c  2007-06-03 18:14:31.000000000 -0700
     @@ -0,0 +1,334 @@
     +/* PSPP - computes sample statistics.
     +   Copyright (C) 2007 Free Software Foundation, Inc.
     +
     +   This program is free software; you can redistribute it and/or
     +   modify it under the terms of the GNU General Public License as
     +   published by the Free Software Foundation; either version 2 of the
     +   License, or (at your option) any later version.
     +
     +   This program is distributed in the hope that it will be useful, but
     +   WITHOUT ANY WARRANTY; without even the implied warranty of
     +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     +   General Public License for more details.
     +
     +   You should have received a copy of the GNU General Public License
     +   along with this program; if not, write to the Free Software
     +   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     +   02110-1301, USA. */
     +
     +#include <config.h>
     +
     +#include <data/sparse-cases.h>
     +
     +#include <stdlib.h>
     +#include <string.h>
     +
     +#include <data/settings.h>
     +#include <data/case-tmpfile.h>
     +#include <libpspp/assertion.h>
     +#include <libpspp/range-set.h>
     +#include <libpspp/sparse-array.h>
     +
     +#include "xalloc.h"
     +
     +/* A sparse array of cases. */
     +struct sparse_cases 
     +  {
     +    size_t column_cnt;                  /* Number of values per case. */
     +    union value *default_columns;       /* Defaults for unwritten cases. 
*/
     +    casenumber max_memory_cases;        /* Max cases before dumping to 
disk. */
     +    struct sparse_array *memory;        /* Backing, if stored in memory. 
*/
     +    struct case_tmpfile *disk;          /* Backing, if stored on disk. */
     +    struct range_set *disk_cases;       /* Allocated cases, if on disk. */
     +  };
     +
     +/* Creates and returns a new sparse array of cases with
     +   COLUMN_CNT values per case. */
     +struct sparse_cases *
     +sparse_cases_create (size_t column_cnt) 
     +{
     +  struct sparse_cases *sc = xmalloc (sizeof *sc);
     +  sc->column_cnt = column_cnt;
     +  sc->default_columns = NULL;
     +  sc->max_memory_cases = get_workspace_cases (column_cnt);
     +  sc->memory = sparse_array_create (sizeof (struct ccase));
     +  sc->disk = NULL;
     +  sc->disk_cases = NULL;
     +  return sc;
     +}
     +
     +/* Creates and returns a new sparse array of cases that contains
     +   the same data as OLD. */
     +struct sparse_cases *
     +sparse_cases_clone (const struct sparse_cases *old) 
     +{
     +  struct sparse_cases *new = xmalloc (sizeof *new);
     +
     +  new->column_cnt = old->column_cnt;
     +
     +  if (old->default_columns != NULL)
     +    new->default_columns
     +      = xmemdup (old->default_columns,
     +                 old->column_cnt * sizeof *old->default_columns);
     +  else
     +    new->default_columns = NULL;
     +
     +  new->max_memory_cases = old->max_memory_cases;
     +
     +  if (old->memory != NULL) 
     +    {
     +      unsigned long int idx;
     +      struct ccase *c;
     +
     +      new->memory = sparse_array_create (sizeof (struct ccase));
     +      for (c = sparse_array_scan (old->memory, NULL, &idx); c != NULL;
     +           c = sparse_array_scan (old->memory, &idx, &idx)) 
     +        case_clone (sparse_array_insert (new->memory, idx), c);
     +    }
     +  else
     +    new->memory = NULL;
     +
     +  if (old->disk != NULL) 
     +    {
     +      const struct range_set_node *node;
     +      
     +      new->disk = case_tmpfile_create (old->column_cnt);
     +      new->disk_cases = range_set_create ();
     +      for (node = range_set_first (old->disk_cases); node != NULL;
     +           node = range_set_next (old->disk_cases, node)) 
     +        {
     +          unsigned long int start = range_set_node_get_start (node);
     +          unsigned long int end = range_set_node_get_end (node);
     +          unsigned long int idx;
     +          struct ccase c;
     +
     +          for (idx = start; idx < end; idx++) 
     +            if (!case_tmpfile_get_case (old->disk, idx, &c)
     +                || !case_tmpfile_put_case (new->disk, idx, &c))
     +              {
     +                sparse_cases_destroy (new);
     +                return NULL;
     +              }
     +        }
     +    }
     +  else 
     +    {
     +      new->disk = NULL;
     +      new->disk_cases = NULL;
     +    }
     +  
     +  return new;
     +}
     +
     +/* Destroys sparse array of cases SC. */
     +void
     +sparse_cases_destroy (struct sparse_cases *sc) 
     +{
     +  if (sc != NULL) 
     +    {
     +      if (sc->memory != NULL) 
     +        {
     +          unsigned long int idx;
     +          struct ccase *c;
     +          for (c = sparse_array_scan (sc->memory, NULL, &idx); c != NULL;
     +               c = sparse_array_scan (sc->memory, &idx, &idx)) 
     +            case_destroy (c);
     +          sparse_array_destroy (sc->memory);
     +        }
     +      free (sc->default_columns);
     +      case_tmpfile_destroy (sc->disk);
     +      range_set_destroy (sc->disk_cases);
     +      free (sc);
     +    }
     +}
     +
     +/* Returns the number of `union value's in each case in SC. */
     +size_t
     +sparse_cases_get_value_cnt (const struct sparse_cases *sc) 
     +{
     +  return sc->column_cnt;
     +}
     +
     +/* Dumps the cases in SC, which must currently be stored in
     +   memory, to disk.  Returns true if successful, false on I/O
     +   error. */ 
     +static bool
     +dump_sparse_cases_to_disk (struct sparse_cases *sc) 
     +{
     +  unsigned long int idx;
     +  struct ccase *c;
     +
     +  assert (sc->memory != NULL);
     +  assert (sc->disk == NULL);
     +
     +  sc->disk = case_tmpfile_create (sc->column_cnt);
     +  sc->disk_cases = range_set_create ();
     +
     +  for (c = sparse_array_scan (sc->memory, NULL, &idx); c != NULL;
     +       c = sparse_array_scan (sc->memory, &idx, &idx)) 
     +    {
     +      if (!case_tmpfile_put_case (sc->disk, idx, c))
     +        { 
     +          case_tmpfile_destroy (sc->disk);
     +          sc->disk = NULL;
     +          range_set_destroy (sc->disk_cases);
     +          sc->disk_cases = NULL;
     +          return false;
     +        }
     +      range_set_insert (sc->disk_cases, idx, 1); 
     +    }
     +  sparse_array_destroy (sc->memory);
     +  sc->memory = NULL;
     +  return true;
     +}
     +
     +/* Returns true if any data has ever been written to ROW in SC,
     +   false otherwise. */
     +bool
     +sparse_cases_contains_row (const struct sparse_cases *sc, casenumber row) 
     +{
     +  return (sc->memory != NULL
     +          ? sparse_array_get (sc->memory, row) != NULL
     +          : range_set_contains (sc->disk_cases, row));
     +}
     +
     +/* Reads columns COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in
     +   the given ROW in SC, into the VALUE_CNT values in VALUES.
     +   Returns true if successful, false on I/O error. */
     +bool
     +sparse_cases_read (struct sparse_cases *sc, casenumber row, size_t column,
     +                   union value values[], size_t value_cnt) 
     +{
     +  assert (value_cnt <= sc->column_cnt);
     +  assert (column + value_cnt <= sc->column_cnt);
     +
     +  if (sparse_cases_contains_row (sc, row)) 
     +    {
     +      struct ccase c;
     +      if (sc->memory != NULL) 
     +        case_clone (&c, sparse_array_get (sc->memory, row));
     +      else if (!case_tmpfile_get_case (sc->disk, row, &c))
     +        return false;
     +      case_copy_out (&c, column, values, value_cnt);
     +      case_destroy (&c);
     +    }
     +  else 
     +    {
     +      assert (sc->default_columns != NULL);
     +      memcpy (values, sc->default_columns + column,
     +              sizeof *values * value_cnt);
     +    }
     +
     +  return true;
     +}
     +
     +/* Implements sparse_cases_write for an on-disk sparse_cases. */
     +static bool
     +write_disk_case (struct sparse_cases *sc, casenumber row, size_t column,
     +                 const union value values[], size_t value_cnt)
     +{
     +  struct ccase c;
     +  bool ok;
     +
     +  /* Get current case data. */
     +  if (column == 0 && value_cnt == sc->column_cnt) 
     +    case_create (&c, sc->column_cnt); 
     +  else if (!case_tmpfile_get_case (sc->disk, row, &c))
     +    return false; 
     +
     +  /* Copy in new data. */
     +  case_copy_in (&c, column, values, value_cnt);
     +
     +  /* Write new case. */
     +  ok = case_tmpfile_put_case (sc->disk, row, &c); 
     +  if (ok)
     +    range_set_insert (sc->disk_cases, row, 1);
     +  
     +  return ok;
     +}
     +
     +/* Writes the VALUE_CNT values in VALUES into columns
     +   COLUMNS...(COLUMNS + VALUE_CNT), exclusive, in the given ROW
     +   in SC.
     +   Returns true if successful, false on I/O error. */
     +bool
     +sparse_cases_write (struct sparse_cases *sc, casenumber row, size_t 
column,
     +                    const union value values[], size_t value_cnt) 
     +{
     +  if (sc->memory != NULL)
     +    {
     +      struct ccase *c = sparse_array_get (sc->memory, row);
     +      if (c == NULL) 
     +        {
     +          if (sparse_array_count (sc->memory) >= sc->max_memory_cases) 
     +            {
     +              if (!dump_sparse_cases_to_disk (sc))
     +                return false;
     +              return write_disk_case (sc, row, column, values, value_cnt);
     +            }
     +          
     +          c = sparse_array_insert (sc->memory, row);
     +          case_create (c, sc->column_cnt);
     +          if (sc->default_columns != NULL
     +              && (column != 0 || value_cnt != sc->column_cnt))
     +            case_copy_in (c, 0, sc->default_columns, sc->column_cnt);
     +        }
     +      case_copy_in (c, column, values, value_cnt);
     +      return true;
     +    }
     +  else
     +    return write_disk_case (sc, row, column, values, value_cnt);
     +}
     +
     +/* Writes the VALUE_CNT values in VALUES to columns
     +   START_COLUMN...(START_COLUMN + VALUE_CNT), exclusive, in every
     +   row in SC, even those rows that have not yet been written.  
     +   Returns true if successful, false on I/O error.
     +
     +   The runtime of this function is linear in the number of rows
     +   in SC that have already been written. */
     +bool
     +sparse_cases_write_columns (struct sparse_cases *sc, size_t start_column,
     +                            const union value values[], size_t value_cnt)
     +{
     +  assert (value_cnt <= sc->column_cnt);
     +  assert (start_column + value_cnt <= sc->column_cnt);
     +
     +  /* Set defaults. */
     +  if (sc->default_columns == NULL)
     +    sc->default_columns = xnmalloc (sc->column_cnt,
     +                                    sizeof *sc->default_columns);
     +  memcpy (sc->default_columns + start_column, values,
     +          value_cnt * sizeof *sc->default_columns);
     +
     +  /* Set individual rows. */
     +  if (sc->memory != NULL) 
     +    {
     +      struct ccase *c;
     +      unsigned long int idx;
     +      
     +      for (c = sparse_array_scan (sc->memory, NULL, &idx); c != NULL;
     +           c = sparse_array_scan (sc->memory, &idx, &idx)) 
     +        case_copy_in (c, start_column, values, value_cnt);
     +    }
     +  else 
     +    {
     +      const struct range_set_node *node;
     +
     +      for (node = range_set_first (sc->disk_cases); node != NULL;
     +           node = range_set_next (sc->disk_cases, node)) 
     +        {
     +          unsigned long int start = range_set_node_get_start (node);
     +          unsigned long int end = range_set_node_get_end (node);
     +          unsigned long int row;
     +
     +          for (row = start; row < end; row++) 
     +            case_tmpfile_put_values (sc->disk, row,
     +                                     start_column, values, value_cnt);
     +        }
     +              
     +      if (case_tmpfile_error (sc->disk))
     +        return false;
     +    }
     +  return true;
     +}
     Index: merge/src/data/sparse-cases.h
     ===================================================================
     --- /dev/null      1970-01-01 00:00:00.000000000 +0000
     +++ merge/src/data/sparse-cases.h  2007-06-03 18:14:32.000000000 -0700
     @@ -0,0 +1,68 @@
     +/* PSPP - computes sample statistics.
     +   Copyright (C) 2007 Free Software Foundation, Inc.
     +
     +   This program is free software; you can redistribute it and/or
     +   modify it under the terms of the GNU General Public License as
     +   published by the Free Software Foundation; either version 2 of the
     +   License, or (at your option) any later version.
     +
     +   This program is distributed in the hope that it will be useful, but
     +   WITHOUT ANY WARRANTY; without even the implied warranty of
     +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     +   General Public License for more details.
     +
     +   You should have received a copy of the GNU General Public License
     +   along with this program; if not, write to the Free Software
     +   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     +   02110-1301, USA. */
     +
     +/* Sparse array of cases.
     +
     +   Implements a 2-d sparse array in which each row represents a
     +   case, each column represents a variable, and each intersection
     +   contains a `union value'.  Data in the array may be accessed
     +   randomly by column and row.  When the number of cases stored
     +   in the array is small, the data is stored in memory in memory;
     +   when it is large, the data is stored in a temporary file.
     +
     +   The sparse_cases_write_columns function provides a somewhat
     +   unusual ability: to write a given value to every row in a
     +   column or set of columns.  This overwrites any values
     +   previously written into those columns.  For rows that have
     +   never been written, this function sets "default" values that
     +   later writes can override.
     +
     +   The array keeps track of which row have been written.  If
     +   sparse_cases_write_columns has been used, reading from a row
     +   that has never been written yields the default values;
     +   otherwise, reading from such a row in an error.  It is
     +   permissible to write to only some columns in a row and leave
     +   the rest of the row's data undefined (or, if
     +   sparse_cases_write_columns has been used, at the default
     +   values).  The array does not keep track of which columns in a
     +   row have never been written, but reading values that have
     +   never been written or set as defaults yields undefined
     +   behavior. */
     +
     +#ifndef DATA_SPARSE_CASES_H
     +#define DATA_SPARSE_CASES_H 1
     +
     +#include <stddef.h>
     +#include <stdbool.h>
     +#include <data/case.h>
     +
     +struct sparse_cases *sparse_cases_create (size_t value_cnt);
     +struct sparse_cases *sparse_cases_clone (const struct sparse_cases *);
     +void sparse_cases_destroy (struct sparse_cases *);
     +
     +size_t sparse_cases_get_value_cnt (const struct sparse_cases *);
     +
     +bool sparse_cases_contains_row (const struct sparse_cases *, casenumber 
row);
     +bool sparse_cases_read (struct sparse_cases *, casenumber row, size_t 
column,
     +                        union value[], size_t value_cnt);
     +bool sparse_cases_write (struct sparse_cases *, casenumber row, size_t 
column,
     +                         const union value[], size_t value_cnt);
     +bool sparse_cases_write_columns (struct sparse_cases *, size_t 
start_column,
     +                                 const union value[], size_t value_cnt);
     +
     +#endif /* data/sparse-cases.h */
     
     --
     
     
     
     _______________________________________________
     pspp-dev mailing list
     address@hidden
     http://lists.gnu.org/mailman/listinfo/pspp-dev

-- 
PGP Public key ID: 1024D/2DE827B3 
fingerprint = 8797 A26D 0854 2EAB 0285  A290 8A67 719C 2DE8 27B3
See http://pgp.mit.edu or any PGP keyserver for public key.


Attachment: signature.asc
Description: Digital signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]