pspp-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[encodings 3/4] Add ENCODING subcommand to several commands.


From: Ben Pfaff
Subject: [encodings 3/4] Add ENCODING subcommand to several commands.
Date: Tue, 19 Jun 2012 23:11:36 -0700

I believe that this completes support for encodings for data input
commands such as DATA LIST and GET DATA/TYPE=TXT.  Support is still
incomplete for data output commands such as PRINT, which confuse
byte offsets with character positions.

Debian bug #676371.
Reported-by: Gunther Furtado <address@hidden>
---
 NEWS                                   |    3 +-
 doc/data-io.texi                       |   47 +++++++++---
 doc/files.texi                         |    5 +
 src/data/file-handle-def.c             |   30 ++++---
 src/data/file-handle-def.h             |    4 +-
 src/language/data-io/data-list.c       |   13 ++--
 src/language/data-io/data-parser.c     |    6 +-
 src/language/data-io/data-reader.c     |  129 +++++++++++++++++++++++---------
 src/language/data-io/data-reader.h     |    7 +-
 src/language/data-io/data-writer.c     |   50 ++++++++++---
 src/language/data-io/data-writer.h     |    7 +-
 src/language/data-io/file-handle.q     |   16 +++-
 src/language/data-io/get-data.c        |   20 ++++-
 src/language/data-io/inpt-pgm.c        |   35 ++++++---
 src/language/data-io/print-space.c     |   16 ++++-
 src/language/data-io/print.c           |   16 ++++-
 tests/language/data-io/get-data-txt.at |   17 ++++
 17 files changed, 311 insertions(+), 110 deletions(-)

diff --git a/NEWS b/NEWS
index 3974f54..71ba057 100644
--- a/NEWS
+++ b/NEWS
@@ -42,7 +42,8 @@ Changes from 0.6.2 to 0.7.9:
   
    - HOST has been updated to use more modern syntax.
 
-   - GET, INCLUDE, and INSERT have a new ENCODING subcommand.
+   - Most commands that work with data files now support a new
+     ENCODING subcommand.
 
    - MISSING VALUES can now assign missing values to long string
      variables.
diff --git a/doc/data-io.texi b/doc/data-io.texi
index 4862ccc..da1712e 100644
--- a/doc/data-io.texi
+++ b/doc/data-io.texi
@@ -277,8 +277,9 @@ external file.  It may be used to specify a file name as a 
string or a
 file handle (@pxref{File Handles}).  If the @subcmd{FILE} subcommand is not 
used,
 then input is assumed to be specified within the command file using
 @cmd{BEGIN address@hidden@cmd{END DATA} (@pxref{BEGIN DATA}).
-The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE} 
subcommand is also used.
-It specifies the character encoding of the file.
+The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE}
+subcommand is also used.  It specifies the character encoding of the
+file.  @xref{INSERT}, for information on supported encodings.
 
 The optional @subcmd{RECORDS} subcommand, which takes a single integer as an
 argument, is used to specify the number of lines per record.
@@ -503,7 +504,8 @@ of quoting is allowed.
 The @subcmd{NOTABLE} and @subcmd{TABLE} subcommands are as in @cmd{DATA LIST 
FIXED} above.
 @subcmd{NOTABLE} is the default.
 
-The @subcmd{FILE} and @subcmd{SKIP} subcommands are as in @cmd{DATA LIST 
FIXED} above.
+The @subcmd{FILE}, @subcmd{SKIP}, and @subcmd{ENCODING} subcommands
+are as in @cmd{DATA LIST FIXED} above.
 
 The variables to be parsed are given as a single list of variable names.
 This list must be introduced by a single slash (@samp{/}).  The set of
@@ -525,7 +527,7 @@ on field width apply, but they are honored on output.
 DATA LIST LIST
         [(@{TAB,'@var{c}'@}, @dots{})]
         address@hidden,address@hidden
-        [FILE='@var{file_name'} [ENCODING='@var{encoding}']]
+        [FILE='@var{file_name}' [ENCODING='@var{encoding}']]
         address@hidden
         /@address@hidden
 
@@ -572,18 +574,21 @@ For text files:
                 /NAME='@var{file_name}
                 [/MODE=CHARACTER]
                 /address@hidden
+                [ENCODING='@var{encoding}']
 
 For binary files in native encoding with fixed-length records:
         FILE HANDLE @var{handle_name}
                 /NAME='@var{file_name}'
                 /MODE=IMAGE
                 [/address@hidden
+                [ENCODING='@var{encoding}']
 
 For binary files in native encoding with variable-length records:
         FILE HANDLE @var{handle_name}
                 /NAME='@var{file_name}'
                 /MODE=BINARY
                 [/address@hidden
+                [ENCODING='@var{encoding}']
 
 For binary files encoded in EBCDIC:
         FILE HANDLE @var{handle_name}
@@ -591,6 +596,7 @@ For binary files encoded in EBCDIC:
                 /MODE=360
                 /address@hidden,VARIABLE,address@hidden
                 [/address@hidden
+                [ENCODING='@var{encoding}']
 @end display
 
 Use @cmd{FILE HANDLE} to associate a file handle name with a file and
@@ -726,6 +732,14 @@ The @subcmd{NAME} subcommand specifies the name of the 
file associated with the
 handle.  It is required in all modes but SCRATCH mode, in which its
 use is forbidden.
 
+The ENCODING subcommand specifies the encoding of text in the file.
+For reading text files in CHARACTER mode, all of the forms described
+for ENCODING on the INSERT command are supported (@pxref{INSERT}).
+For reading in other file-based modes, encoding autodetection is not
+supported; if the specified encoding requests autodetection then the
+default encoding will be used.  This is also true when a file handle
+is used for writing a file in any mode.
+
 @node INPUT PROGRAM
 @section INPUT PROGRAM
 @vindex INPUT PROGRAM
@@ -942,9 +956,10 @@ active dataset.
 
 @display
 PRINT 
-        OUTFILE='@var{file_name}'
-        address@hidden
-        @{NOTABLE,address@hidden
+        [OUTFILE='@var{file_name}']
+        address@hidden
+        address@hidden,address@hidden
+        [ENCODING='@var{encoding}']
         [/address@hidden @address@hidden
 
 @var{arg} takes one of the following forms:
@@ -969,6 +984,11 @@ Handles}).  If @subcmd{OUTFILE} is not present then output 
will be sent to
 inserted at beginning of each output line, even lines that otherwise
 would be blank.
 
+The @subcmd{ENCODING} subcommand may only be used if the
address@hidden subcommand is also used.  It specifies the character
+encoding of the file.  @xref{INSERT}, for information on supported
+encodings.
+
 The @subcmd{RECORDS} subcommand specifies the number of lines to be output.  
The
 number of lines may optionally be surrounded by parentheses.
 
@@ -983,7 +1003,6 @@ line number, the next line number will be specified.  
Multiple lines may
 be specified using multiple slashes with the intended output for a line
 following its respective slash.
 
-
 Literal strings may be printed.  Specify the string itself.  Optionally
 the string may be followed by a column number or range of column
 numbers, specifying the location on the line for the string to be
@@ -1043,7 +1062,7 @@ written with a space inserted in the first column, as 
with @subcmd{PRINT}.
 @vindex PRINT SPACE
 
 @display
-PRINT SPACE OUTFILE='file_name' n_lines.
+PRINT SPACE [OUTFILE='file_name'] [ENCODING='@var{encoding}'] [n_lines].
 @end display
 
 @cmd{PRINT SPACE} prints one or more blank lines to an output file.
@@ -1053,6 +1072,10 @@ a file specified by file name as a string or file handle 
(@pxref{File
 Handles}).  If OUTFILE is not specified then output will be directed to
 the listing file.
 
+The @subcmd{ENCODING} subcommand may only be used if @subcmd{OUTFILE}
+is also used.  It specifies the character encoding of the file.
address@hidden, for information on supported encodings.
+
 n_lines is also optional.  If present, it is an expression
 (@pxref{Expressions}) specifying the number of blank lines to be
 printed.  The expression must evaluate to a nonnegative value.
@@ -1062,7 +1085,7 @@ printed.  The expression must evaluate to a nonnegative 
value.
 @vindex REREAD
 
 @display
-REREAD FILE=handle COLUMN=column.
+REREAD [FILE=handle] [COLUMN=column] [ENCODING='@var{encoding}'].
 @end display
 
 The @cmd{REREAD} transformation allows the previous input line in a
@@ -1082,6 +1105,10 @@ re-reading.  Specify an expression (@pxref{Expressions}) 
evaluating to
 the first column that should be included in the re-read line.  Columns
 are numbered from 1 at the left margin.
 
+The @subcmd{ENCODING} subcommand may only be used if the @subcmd{FILE}
+subcommand is also used.  It specifies the character encoding of the
+file.   @xref{INSERT}, for information on supported encodings.
+
 Issuing @code{REREAD} multiple times will not back up in the data
 file.  Instead, it will re-read the same line multiple times.
 
diff --git a/doc/files.texi b/doc/files.texi
index 04bbff4..511cf5d 100644
--- a/doc/files.texi
+++ b/doc/files.texi
@@ -366,6 +366,7 @@ GET DATA /TYPE=PSQL
 @display
 GET DATA /TYPE=TXT
         /address@hidden'@var{file_name}',@address@hidden
+        [ENCODING='@var{encoding}']
         [/address@hidden,address@hidden
         [/address@hidden@address@hidden
         [/address@hidden,FIRST @var{max_cases},PERCENT @address@hidden
@@ -381,6 +382,10 @@ The @subcmd{FILE} subcommand is mandatory.  Specify the 
file to be read as
 a string file name or (for textual data only) a
 file handle (@pxref{File Handles}).
 
+The @subcmd{ENCODING} subcommand specifies the character encoding of
+the file to be read.  @xref{INSERT}, for information on supported
+encodings.
+
 The @subcmd{ARRANGEMENT} subcommand determines the file's basic format.
 DELIMITED, the default setting, specifies that fields in the input
 data are separated by spaces, tabs, or other user-specified
diff --git a/src/data/file-handle-def.c b/src/data/file-handle-def.c
index 6ca6977..9a46bfa 100644
--- a/src/data/file-handle-def.c
+++ b/src/data/file-handle-def.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011 Free Software 
Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2009, 2010, 2011, 2012 Free Software 
Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -26,12 +26,13 @@
 #include "data/dataset.h"
 #include "data/file-name.h"
 #include "data/variable.h"
+#include "libpspp/cast.h"
 #include "libpspp/compiler.h"
+#include "libpspp/hash-functions.h"
 #include "libpspp/hmap.h"
 #include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/str.h"
-#include "libpspp/hash-functions.h"
 
 #include "gl/xalloc.h"
 
@@ -50,11 +51,11 @@ struct file_handle
     /* FH_REF_FILE only. */
     char *file_name;           /* File name as provided by user. */
     enum fh_mode mode;         /* File mode. */
-    const char *encoding;       /* File encoding. */
 
     /* FH_REF_FILE and FH_REF_INLINE only. */
     size_t record_width;        /* Length of fixed-format records. */
     size_t tab_width;           /* Tab width, 0=do not expand tabs. */
+    char *encoding;             /* Charset for contents. */
 
     /* FH_REF_DATASET only. */
     struct dataset *ds;         /* Dataset. */
@@ -71,7 +72,8 @@ static struct file_handle *default_handle;
 static struct file_handle *inline_file;
 
 static struct file_handle *create_handle (const char *id,
-                                          char *name, enum fh_referent);
+                                          char *name, enum fh_referent,
+                                          const char *encoding);
 static void free_handle (struct file_handle *);
 static void unname_handle (struct file_handle *);
 
@@ -82,7 +84,8 @@ static struct hmap locks = HMAP_INITIALIZER (locks);
 void
 fh_init (void)
 {
-  inline_file = create_handle ("INLINE", xstrdup ("INLINE"), FH_REF_INLINE);
+  inline_file = create_handle ("INLINE", xstrdup ("INLINE"), FH_REF_INLINE,
+                               "Auto");
   inline_file->record_width = 80;
   inline_file->tab_width = 8;
 }
@@ -110,6 +113,7 @@ free_handle (struct file_handle *handle)
   free (handle->id);
   free (handle->name);
   free (handle->file_name);
+  free (handle->encoding);
   free (handle);
 }
 
@@ -189,7 +193,8 @@ fh_from_id (const char *id)
    The new handle is not fully initialized.  The caller is
    responsible for completing its initialization. */
 static struct file_handle *
-create_handle (const char *id, char *handle_name, enum fh_referent referent)
+create_handle (const char *id, char *handle_name, enum fh_referent referent,
+               const char *encoding)
 {
   struct file_handle *handle = xzalloc (sizeof *handle);
 
@@ -197,6 +202,7 @@ create_handle (const char *id, char *handle_name, enum 
fh_referent referent)
   handle->id = id != NULL ? xstrdup (id) : NULL;
   handle->name = handle_name;
   handle->referent = referent;
+  handle->encoding = xstrdup (encoding);
 
   if (id != NULL)
     {
@@ -231,12 +237,11 @@ fh_create_file (const char *id, const char *file_name,
   struct file_handle *handle;
 
   handle_name = id != NULL ? xstrdup (id) : xasprintf ("`%s'", file_name);
-  handle = create_handle (id, handle_name, FH_REF_FILE);
+  handle = create_handle (id, handle_name, FH_REF_FILE, properties->encoding);
   handle->file_name = xstrdup (file_name);
   handle->mode = properties->mode;
   handle->record_width = properties->record_width;
   handle->tab_width = properties->tab_width;
-  handle->encoding = properties->encoding;
   return handle;
 }
 
@@ -253,7 +258,7 @@ fh_create_dataset (struct dataset *ds)
   if (name[0] == '\0')
     name = _("active dataset");
 
-  handle = create_handle (NULL, xstrdup (name), FH_REF_DATASET);
+  handle = create_handle (NULL, xstrdup (name), FH_REF_DATASET, C_ENCODING);
   handle->ds = ds;
   return handle;
 }
@@ -263,7 +268,7 @@ const struct fh_properties *
 fh_default_properties (void)
 {
   static const struct fh_properties default_properties
-    = {FH_MODE_TEXT, 1024, 4, C_ENCODING};
+    = {FH_MODE_TEXT, 1024, 4, (char *) "Auto"};
   return &default_properties;
 }
 
@@ -333,10 +338,9 @@ fh_get_tab_width (const struct file_handle *handle)
 
 /* Returns the encoding of characters read from HANDLE. */
 const char *
-fh_get_legacy_encoding (const struct file_handle *handle)
+fh_get_encoding (const struct file_handle *handle)
 {
-  assert (handle->referent & (FH_REF_FILE | FH_REF_INLINE));
-  return (handle->referent == FH_REF_FILE ? handle->encoding : C_ENCODING);
+  return handle->encoding;
 }
 
 /* Returns the dataset handle associated with HANDLE.
diff --git a/src/data/file-handle-def.h b/src/data/file-handle-def.h
index 11898ef..9a60e72 100644
--- a/src/data/file-handle-def.h
+++ b/src/data/file-handle-def.h
@@ -55,7 +55,7 @@ struct fh_properties
     enum fh_mode mode;          /* File mode. */
     size_t record_width;        /* Length of fixed-format records. */
     size_t tab_width;           /* Tab width, 0=do not expand tabs. */
-    const char *encoding;       /* ASCII or EBCDIC? */
+    char *encoding;             /* Charset for contents. */
   };
 
 void fh_init (void);
@@ -82,6 +82,7 @@ struct file_handle *fh_inline_file (void);
 const char *fh_get_id (const struct file_handle *);
 const char *fh_get_name (const struct file_handle *);
 enum fh_referent fh_get_referent (const struct file_handle *);
+const char *fh_get_encoding (const struct file_handle *);
 
 /* Properties of FH_REF_FILE file handles. */
 const char *fh_get_file_name (const struct file_handle *);
@@ -90,7 +91,6 @@ enum fh_mode fh_get_mode (const struct file_handle *) ;
 /* Properties of FH_REF_FILE and FH_REF_INLINE file handles. */
 size_t fh_get_record_width (const struct file_handle *);
 size_t fh_get_tab_width (const struct file_handle *);
-const char *fh_get_legacy_encoding (const struct file_handle *);
 
 /* Properties of FH_REF_DATASET file handles. */
 struct dataset *fh_get_dataset (const struct file_handle *);
diff --git a/src/language/data-io/data-list.c b/src/language/data-io/data-list.c
index f16c606..17c6032 100644
--- a/src/language/data-io/data-list.c
+++ b/src/language/data-io/data-list.c
@@ -78,7 +78,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
   struct dfm_reader *reader;
   struct variable *end = NULL;
   struct file_handle *fh = NULL;
-  struct string encoding = DS_EMPTY_INITIALIZER;
+  char *encoding = NULL;
 
   int table;
   enum data_parser_type type;
@@ -111,7 +111,8 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
          if (!lex_force_string (lexer))
            goto error;
 
-         ds_init_substring (&encoding, lex_tokss (lexer));
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
 
          lex_get (lexer);
        }
@@ -241,7 +242,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
     }
   type = data_parser_get_type (parser);
 
-  if (! ds_is_empty (&encoding) && NULL == fh)
+  if (encoding && NULL == fh)
     msg (MW, _("Encoding should not be specified for inline data. It will be "
                "ignored."));
 
@@ -278,7 +279,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
   if (table)
     data_parser_output_description (parser, fh);
 
-  reader = dfm_open_reader (fh, lexer);
+  reader = dfm_open_reader (fh, lexer, encoding);
   if (reader == NULL)
     goto error;
 
@@ -294,7 +295,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
     data_parser_make_active_file (parser, ds, reader, dict);
 
   fh_unref (fh);
-  ds_destroy (&encoding);
+  free (encoding);
 
   return CMD_SUCCESS;
 
@@ -303,7 +304,7 @@ cmd_data_list (struct lexer *lexer, struct dataset *ds)
   if (!in_input_program ())
     dict_destroy (dict);
   fh_unref (fh);
-  ds_destroy (&encoding);
+  free (encoding);
   return CMD_CASCADING_FAILURE;
 }
 
diff --git a/src/language/data-io/data-parser.c 
b/src/language/data-io/data-parser.c
index aea3bbd..1dc7c93 100644
--- a/src/language/data-io/data-parser.c
+++ b/src/language/data-io/data-parser.c
@@ -527,7 +527,7 @@ static bool
 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
              struct ccase *c)
 {
-  const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+  const char *input_encoding = dfm_reader_get_encoding (reader);
   const char *output_encoding = dict_get_encoding (parser->dict);
   struct field *f;
   int row;
@@ -579,7 +579,7 @@ static bool
 parse_delimited_span (const struct data_parser *parser,
                       struct dfm_reader *reader, struct ccase *c)
 {
-  const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+  const char *input_encoding = dfm_reader_get_encoding (reader);
   const char *output_encoding = dict_get_encoding (parser->dict);
   struct string tmp = DS_EMPTY_INITIALIZER;
   struct field *f;
@@ -623,7 +623,7 @@ static bool
 parse_delimited_no_span (const struct data_parser *parser,
                          struct dfm_reader *reader, struct ccase *c)
 {
-  const char *input_encoding = dfm_reader_get_legacy_encoding (reader);
+  const char *input_encoding = dfm_reader_get_encoding (reader);
   const char *output_encoding = dict_get_encoding (parser->dict);
   struct string tmp = DS_EMPTY_INITIALIZER;
   struct substring s;
diff --git a/src/language/data-io/data-reader.c 
b/src/language/data-io/data-reader.c
index 0f96e58..ea95bc9 100644
--- a/src/language/data-io/data-reader.c
+++ b/src/language/data-io/data-reader.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-2004, 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, 
Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -34,7 +34,9 @@
 #include "language/lexer/lexer.h"
 #include "libpspp/assertion.h"
 #include "libpspp/cast.h"
+#include "libpspp/encoding-guesser.h"
 #include "libpspp/integer-format.h"
+#include "libpspp/line-reader.h"
 #include "libpspp/message.h"
 #include "libpspp/str.h"
 
@@ -69,6 +71,10 @@ struct dfm_reader
     size_t pos;                 /* Offset in line of current character. */
     unsigned eof_cnt;           /* # of attempts to advance past EOF. */
     struct lexer *lexer;        /* The lexer reading the file */
+    char *encoding;             /* Current encoding. */
+
+    /* For FH_MODE_TEXT only. */
+    struct line_reader *line_reader;
 
     /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
     size_t block_left;          /* Bytes left in current block. */
@@ -101,19 +107,28 @@ dfm_close_reader (struct dfm_reader *r)
         }
     }
 
+  line_reader_free (r->line_reader);
+  free (r->encoding);
   fh_unref (r->fh);
   ds_destroy (&r->line);
   ds_destroy (&r->scratch);
   free (r);
 }
 
-/* Opens the file designated by file handle FH for reading as a
-   data file.  Providing fh_inline_file() for FH designates the
-   "inline file", that is, data included inline in the command
-   file between BEGIN FILE and END FILE.  Returns a reader if
-   successful, or a null pointer otherwise. */
+/* Opens the file designated by file handle FH for reading as a data file.
+   Returns a reader if successful, or a null pointer otherwise.
+
+   If FH is fh_inline_file() then the new reader reads data included inline in
+   the command file between BEGIN FILE and END FILE, obtaining data from LEXER.
+   LEXER must remain valid as long as the new reader is in use.  ENCODING is
+   ignored.
+
+   If FH is not fh_inline_file(), then the encoding of the file read is by
+   default that of FH itself.  If ENCODING is nonnull, then it overrides the
+   default encoding.  LEXER is ignored. */
 struct dfm_reader *
-dfm_open_reader (struct file_handle *fh, struct lexer *lexer)
+dfm_open_reader (struct file_handle *fh, struct lexer *lexer,
+                 const char *encoding)
 {
   struct dfm_reader *r;
   struct fh_lock *lock;
@@ -147,10 +162,6 @@ dfm_open_reader (struct file_handle *fh, struct lexer 
*lexer)
         {
           msg (ME, _("Could not open `%s' for reading as a data file: %s."),
                fh_get_file_name (r->fh), strerror (errno));
-          fh_unlock (r->lock);
-          fh_unref (fh);
-          free (r);
-          return NULL;
         }
       r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1;
     }
@@ -158,14 +169,43 @@ dfm_open_reader (struct file_handle *fh, struct lexer 
*lexer)
     r->file_size = -1;
   fh_lock_set_aux (lock, r);
 
+  if (encoding == NULL)
+    encoding = fh_get_encoding (fh);
+  if (fh_get_referent (fh) == FH_REF_FILE && fh_get_mode (fh) == FH_MODE_TEXT)
+    {
+      r->line_reader = line_reader_for_fd (encoding, fileno (r->file));
+      if (r->line_reader == NULL)
+        {
+          msg (ME, _("Could not read `%s' as a text file with encoding `%s': "
+                     "%s."),
+               fh_get_file_name (r->fh), encoding, strerror (errno));
+          goto error;
+        }
+      r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
+    }
+  else
+    {
+      r->line_reader = NULL;
+      r->encoding = xstrdup (encoding_guess_parse_encoding (encoding));
+    }
+
   return r;
+
+error:
+  fh_unlock (r->lock);
+  fh_unref (fh);
+  free (r);
+  return NULL;
 }
 
 /* Returns true if an I/O error occurred on READER, false otherwise. */
 bool
 dfm_reader_error (const struct dfm_reader *r)
 {
-  return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file);
+  return (fh_get_referent (r->fh) == FH_REF_FILE
+          && (r->line_reader != NULL
+              ? line_reader_error (r->line_reader) != 0
+              : ferror (r->file)));
 }
 
 /* Reads a record from the inline file into R.
@@ -211,17 +251,12 @@ read_inline_record (struct dfm_reader *r)
   return true;
 }
 
-/* Report a read error or unexpected end-of-file condition on R. */
+/* Report a read error on R. */
 static void
 read_error (struct dfm_reader *r)
 {
-  if (ferror (r->file))
-    msg (ME, _("Error reading file %s: %s."),
-         fh_get_name (r->fh), strerror (errno));
-  else if (feof (r->file))
-    msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh));
-  else
-    NOT_REACHED ();
+  msg (ME, _("Error reading file %s: %s."),
+       fh_get_name (r->fh), strerror (errno));
 }
 
 /* Report a partial read at end of file reading R. */
@@ -333,6 +368,34 @@ read_size (struct dfm_reader *r, size_t *size_out)
   return 1;
 }
 
+static bool
+read_text_record (struct dfm_reader *r)
+{
+  bool is_auto;
+  bool ok;
+
+  /* Read a line.  If the line reader's encoding changes, update r->encoding to
+     match. */
+  is_auto = line_reader_is_auto (r->line_reader);
+  ok = line_reader_read (r->line_reader, &r->line, SIZE_MAX);
+  if (is_auto && !line_reader_is_auto (r->line_reader))
+    {
+      free (r->encoding);
+      r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
+    }
+
+  /* Detect and report read error. */
+  if (!ok)
+    {
+      int error = line_reader_error (r->line_reader);
+      if (error != 0)
+        msg (ME, _("Error reading file %s: %s."),
+             fh_get_name (r->fh), strerror (error));
+    }
+
+  return ok;
+}
+
 /* Reads a record from a disk file into R.
    Returns true if successful, false on error or at end of file. */
 static bool
@@ -344,17 +407,7 @@ read_file_record (struct dfm_reader *r)
   switch (fh_get_mode (r->fh))
     {
     case FH_MODE_TEXT:
-      if (ds_read_line (&r->line, r->file, SIZE_MAX))
-        {
-          ds_chomp_byte (&r->line, '\n');
-          return true;
-        }
-      else
-        {
-          if (ferror (r->file))
-            read_error (r);
-          return false;
-        }
+      return read_text_record (r);
 
     case FH_MODE_FIXED:
       if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
@@ -597,11 +650,11 @@ dfm_expand_tabs (struct dfm_reader *r)
   r->pos = new_pos;
 }
 
-/* Returns the legacy character encoding of data read from READER. */
+/* Returns the character encoding of data read from READER. */
 const char *
-dfm_reader_get_legacy_encoding (const struct dfm_reader *reader)
+dfm_reader_get_encoding (const struct dfm_reader *reader)
 {
-  return fh_get_legacy_encoding (reader->fh);
+  return reader->encoding;
 }
 
 /* Returns a number between 0 and 100 that approximates the
@@ -615,7 +668,11 @@ dfm_get_percent_read (const struct dfm_reader *reader)
 {
   if (reader->file_size >= 0)
     {
-      off_t position = ftello (reader->file);
+      off_t position;
+
+      position = (reader->line_reader != NULL
+                  ? line_reader_tell (reader->line_reader)
+                  : ftello (reader->file));
       if (position >= 0)
         {
           double p = 100.0 * position / reader->file_size;
@@ -710,7 +767,7 @@ cmd_begin_data (struct lexer *lexer, struct dataset *ds)
   lex_match (lexer, T_ENDCMD);
 
   /* Open inline file. */
-  r = dfm_open_reader (fh_inline_file (), lexer);
+  r = dfm_open_reader (fh_inline_file (), lexer, NULL);
   r->flags |= DFM_SAW_BEGIN_DATA;
   r->flags &= ~DFM_CONSUME;
 
diff --git a/src/language/data-io/data-reader.h 
b/src/language/data-io/data-reader.h
index affff78..a199f01 100644
--- a/src/language/data-io/data-reader.h
+++ b/src/language/data-io/data-reader.h
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2010, 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -31,13 +31,14 @@ struct string;
 struct lexer;
 
 /* Input. */
-struct dfm_reader *dfm_open_reader (struct file_handle *, struct lexer *);
+struct dfm_reader *dfm_open_reader (struct file_handle *, struct lexer *,
+                                    const char *encoding);
 void dfm_close_reader (struct dfm_reader *);
 bool dfm_reader_error (const struct dfm_reader *);
 unsigned dfm_eof (struct dfm_reader *);
 struct substring dfm_get_record (struct dfm_reader *);
 void dfm_expand_tabs (struct dfm_reader *);
-const char *dfm_reader_get_legacy_encoding (const struct dfm_reader *);
+const char *dfm_reader_get_encoding (const struct dfm_reader *);
 int dfm_get_percent_read (const struct dfm_reader *);
 
 /* Line control. */
diff --git a/src/language/data-io/data-writer.c 
b/src/language/data-io/data-writer.c
index 113be58..5270db0 100644
--- a/src/language/data-io/data-writer.c
+++ b/src/language/data-io/data-writer.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-2004, 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, 
Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -28,7 +28,9 @@
 #include "data/make-file.h"
 #include "language/data-io/file-handle.h"
 #include "libpspp/assertion.h"
+#include "libpspp/encoding-guesser.h"
 #include "libpspp/integer-format.h"
+#include "libpspp/i18n.h"
 #include "libpspp/message.h"
 #include "libpspp/str.h"
 
@@ -46,14 +48,31 @@ struct dfm_writer
     struct fh_lock *lock;       /* Exclusive access to file. */
     FILE *file;                 /* Associated file. */
     struct replace_file *rf;    /* Atomic file replacement support. */
+    char *encoding;             /* Encoding. */
+
+    int unit;                   /* Unit width, in bytes. */
+    char lf[MAX_UNIT];          /* \n in encoding, 'unit' bytes long. */
+    char spaces[32];            /* 32 bytes worth of ' ' in encoding. */
   };
 
-/* Opens a file handle for writing as a data file. */
+/* Opens a file handle for writing as a data file.
+
+   The encoding of the file written is by default that of FH itself.  If
+   ENCODING is nonnull, then it overrides the default encoding.
+
+   *However*: ENCODING directly affects only text strings written by the data
+   writer code itself, that is, new-lines in FH_MODE_TEXT and space padding in
+   FH_MODE_FIXED mode.  The client must do its own encoding translation for the
+   data that it writes.  (This is unavoidable because sometimes the data
+   written includes binary data that reencoding would mangle.)  The client can
+   obtain the encoding to re-encode into with dfm_writer_get_encoding(). */
 struct dfm_writer *
-dfm_open_writer (struct file_handle *fh)
+dfm_open_writer (struct file_handle *fh, const char *encoding)
 {
+  struct encoding_info ei;
   struct dfm_writer *w;
   struct fh_lock *lock;
+  int ofs;
 
   lock = fh_lock (fh, FH_REF_FILE, N_("data file"), FH_ACC_WRITE, false);
   if (lock == NULL)
@@ -63,11 +82,22 @@ dfm_open_writer (struct file_handle *fh)
   if (w != NULL)
     return w;
 
+  encoding = encoding_guess_parse_encoding (encoding != NULL
+                                            ? encoding
+                                            : fh_get_encoding (fh));
+  get_encoding_info (&ei, encoding);
+
   w = xmalloc (sizeof *w);
   w->fh = fh_ref (fh);
   w->lock = lock;
   w->rf = replace_file_start (fh_get_file_name (w->fh), "wb", 0666,
                               &w->file, NULL);
+  w->encoding = xstrdup (encoding);
+  w->unit = ei.unit;
+  memcpy (w->lf, ei.lf, sizeof w->lf);
+  for (ofs = 0; ofs + ei.unit <= sizeof w->spaces; ofs += ei.unit)
+    memcpy (&w->spaces[ofs], ei.space, ei.unit);
+
   if (w->rf == NULL)
     {
       msg (ME, _("An error occurred while opening `%s' for writing "
@@ -104,7 +134,7 @@ dfm_put_record (struct dfm_writer *w, const char *rec, 
size_t len)
     {
     case FH_MODE_TEXT:
       fwrite (rec, len, 1, w->file);
-      putc ('\n', w->file);
+      fwrite (w->lf, w->unit, 1, w->file);
       break;
 
     case FH_MODE_FIXED:
@@ -115,9 +145,8 @@ dfm_put_record (struct dfm_writer *w, const char *rec, 
size_t len)
         fwrite (rec, write_bytes, 1, w->file);
         while (pad_bytes > 0)
           {
-            static const char spaces[32] = "                                ";
-            size_t chunk = MIN (pad_bytes, sizeof spaces);
-            fwrite (spaces, chunk, 1, w->file);
+            size_t chunk = MIN (pad_bytes, sizeof w->spaces);
+            fwrite (w->spaces, chunk, 1, w->file);
             pad_bytes -= chunk;
           }
       }
@@ -193,14 +222,15 @@ dfm_close_writer (struct dfm_writer *w)
         ok = false;
     }
   fh_unref (w->fh);
+  free (w->encoding);
   free (w);
 
   return ok;
 }
 
-/* Returns the legacy character encoding of data written to WRITER. */
+/* Returns the encoding of data written to WRITER. */
 const char *
-dfm_writer_get_legacy_encoding (const struct dfm_writer *writer)
+dfm_writer_get_encoding (const struct dfm_writer *writer)
 {
-  return fh_get_legacy_encoding (writer->fh);
+  return writer->encoding;
 }
diff --git a/src/language/data-io/data-writer.h 
b/src/language/data-io/data-writer.h
index 045db31..10ad6cd 100644
--- a/src/language/data-io/data-writer.h
+++ b/src/language/data-io/data-writer.h
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -23,10 +23,11 @@
 #include <stddef.h>
 
 struct file_handle;
-struct dfm_writer *dfm_open_writer (struct file_handle *);
+struct dfm_writer *dfm_open_writer (struct file_handle *,
+                                    const char *encoding);
 bool dfm_close_writer (struct dfm_writer *);
 bool dfm_write_error (const struct dfm_writer *);
 bool dfm_put_record (struct dfm_writer *, const char *rec, size_t len);
-const char *dfm_writer_get_legacy_encoding (const struct dfm_writer *);
+const char *dfm_writer_get_encoding (const struct dfm_writer *);
 
 #endif /* data-writer.h */
diff --git a/src/language/data-io/file-handle.q 
b/src/language/data-io/file-handle.q
index 0519803..26dfc97 100644
--- a/src/language/data-io/file-handle.q
+++ b/src/language/data-io/file-handle.q
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2010, 2011, 2012 Free Software 
Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -16,20 +16,22 @@
 
 #include <config.h>
 
+#include "data/file-handle-def.h"
+
 #include <limits.h>
 #include <errno.h>
 #include <stdlib.h>
 
 #include "data/file-name.h"
 #include "data/session.h"
+#include "data/variable.h"
 #include "language/command.h"
 #include "language/data-io/file-handle.h"
 #include "language/lexer/lexer.h"
 #include "libpspp/assertion.h"
+#include "libpspp/cast.h"
 #include "libpspp/message.h"
 #include "libpspp/str.h"
-#include "data/variable.h"
-#include "data/file-handle-def.h"
 
 #include "gl/xalloc.h"
 
@@ -45,7 +47,8 @@
      lrecl=integer;
      tabwidth=integer;
      mode=mode:!character/binary/image/360;
-     recform=recform:fixed/f/variable/v/spanned/vs.
+     recform=recform:fixed/f/variable/v/spanned/vs;
+     encoding=string.
 */
 /* (declarations) */
 /* (functions) */
@@ -109,7 +112,7 @@ cmd_file_handle (struct lexer *lexer, struct dataset *ds)
       properties.mode = FH_MODE_VARIABLE;
       break;
     case FH_360:
-      properties.encoding = "EBCDIC-US";
+      properties.encoding = CONST_CAST (char *, "EBCDIC-US");
       if (cmd.recform == FH_FIXED || cmd.recform == FH_F)
         properties.mode = FH_MODE_FIXED;
       else if (cmd.recform == FH_VARIABLE || cmd.recform == FH_V)
@@ -146,6 +149,9 @@ cmd_file_handle (struct lexer *lexer, struct dataset *ds)
         properties.record_width = cmd.n_lrecl[0];
     }
 
+  if (cmd.s_encoding != NULL)
+    properties.encoding = cmd.s_encoding;
+
   fh_create_file (handle_name, cmd.s_name, &properties);
 
   result = CMD_SUCCESS;
diff --git a/src/language/data-io/get-data.c b/src/language/data-io/get-data.c
index 4274f95..10d59aa 100644
--- a/src/language/data-io/get-data.c
+++ b/src/language/data-io/get-data.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, 
Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -307,6 +307,7 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
   struct dictionary *dict = dict_create (get_default_encoding ());
   struct file_handle *fh = NULL;
   struct dfm_reader *reader = NULL;
+  char *encoding = NULL;
   char *name = NULL;
 
   int record;
@@ -334,7 +335,18 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
       if (!lex_force_match (lexer, T_SLASH))
         goto error;
 
-      if (lex_match_id (lexer, "ARRANGEMENT"))
+      if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
+       }
+      else if (lex_match_id (lexer, "ARRANGEMENT"))
         {
           bool ok;
 
@@ -606,12 +618,13 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
     }
   while (lex_token (lexer) != T_ENDCMD);
 
-  reader = dfm_open_reader (fh, lexer);
+  reader = dfm_open_reader (fh, lexer, encoding);
   if (reader == NULL)
     goto error;
 
   data_parser_make_active_file (parser, ds, reader, dict);
   fh_unref (fh);
+  free (encoding);
   return CMD_SUCCESS;
 
  error:
@@ -619,6 +632,7 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds)
   dict_destroy (dict);
   fh_unref (fh);
   free (name);
+  free (encoding);
   return CMD_CASCADING_FAILURE;
 }
 
diff --git a/src/language/data-io/inpt-pgm.c b/src/language/data-io/inpt-pgm.c
index 6f2a99e..36c58c8 100644
--- a/src/language/data-io/inpt-pgm.c
+++ b/src/language/data-io/inpt-pgm.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2009, 2010, 2011, 2012 Free Software 
Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -266,6 +266,7 @@ cmd_reread (struct lexer *lexer, struct dataset *ds)
   struct file_handle *fh;       /* File to be re-read. */
   struct expression *e;         /* Expression for column to set. */
   struct reread_trns *t;        /* Created transformation. */
+  char *encoding = NULL;
 
   fh = fh_get_default_handle ();
   e = NULL;
@@ -278,13 +279,12 @@ cmd_reread (struct lexer *lexer, struct dataset *ds)
          if (e)
            {
               lex_sbc_only_once ("COLUMN");
-             expr_free (e);
-             return CMD_CASCADING_FAILURE;
+              goto error;
            }
 
          e = expr_parse (lexer, ds, EXPR_NUMBER);
          if (!e)
-           return CMD_CASCADING_FAILURE;
+            goto error;
        }
       else if (lex_match_id (lexer, "FILE"))
        {
@@ -292,26 +292,39 @@ cmd_reread (struct lexer *lexer, struct dataset *ds)
           fh_unref (fh);
           fh = fh_parse (lexer, FH_REF_FILE | FH_REF_INLINE, NULL);
          if (fh == NULL)
-           {
-             expr_free (e);
-             return CMD_CASCADING_FAILURE;
-           }
+            goto error;
+       }
+      else if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
        }
       else
        {
          lex_error (lexer, NULL);
-         expr_free (e);
-          return CMD_CASCADING_FAILURE;
+          goto error;
        }
     }
 
   t = xmalloc (sizeof *t);
-  t->reader = dfm_open_reader (fh, lexer);
+  t->reader = dfm_open_reader (fh, lexer, encoding);
   t->column = e;
   add_transformation (ds, reread_trns_proc, reread_trns_free, t);
 
   fh_unref (fh);
+  free (encoding);
   return CMD_SUCCESS;
+
+error:
+  expr_free (e);
+  free (encoding);
+  return CMD_CASCADING_FAILURE;
 }
 
 /* Executes a REREAD transformation. */
diff --git a/src/language/data-io/print-space.c 
b/src/language/data-io/print-space.c
index edaf13e..adeb92b 100644
--- a/src/language/data-io/print-space.c
+++ b/src/language/data-io/print-space.c
@@ -51,6 +51,7 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds)
   struct file_handle *handle = NULL;
   struct expression *expr = NULL;
   struct dfm_writer *writer;
+  char *encoding = NULL;
 
   if (lex_match_id (lexer, "OUTFILE"))
     {
@@ -59,6 +60,17 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds)
       handle = fh_parse (lexer, FH_REF_FILE, NULL);
       if (handle == NULL)
        return CMD_FAILURE;
+
+      if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
+       }
     }
   else
     handle = NULL;
@@ -77,7 +89,7 @@ cmd_print_space (struct lexer *lexer, struct dataset *ds)
 
   if (handle != NULL)
     {
-      writer = dfm_open_writer (handle);
+      writer = dfm_open_writer (handle, encoding);
       if (writer == NULL)
         goto error;
     }
@@ -124,7 +136,7 @@ print_space_trns_proc (void *t_, struct ccase **c,
     if (trns->writer == NULL)
       text_item_submit (text_item_create (TEXT_ITEM_BLANK_LINE, ""));
     else
-      dfm_put_record (trns->writer, " ", 1);
+      dfm_put_record (trns->writer, " ", 1); /* XXX */
 
   if (trns->writer != NULL && dfm_write_error (trns->writer))
     return TRNS_ERROR;
diff --git a/src/language/data-io/print.c b/src/language/data-io/print.c
index 86952e0..cffa3bd 100644
--- a/src/language/data-io/print.c
+++ b/src/language/data-io/print.c
@@ -136,6 +136,7 @@ internal_cmd_print (struct lexer *lexer, struct dataset *ds,
   bool print_table = 0;
   struct print_trns *trns;
   struct file_handle *fh = NULL;
+  char *encoding = NULL;
   struct pool *tmp_pool;
 
   /* Fill in prt to facilitate error-handling. */
@@ -160,6 +161,17 @@ internal_cmd_print (struct lexer *lexer, struct dataset 
*ds,
          if (fh == NULL)
            goto error;
        }
+      else if (lex_match_id (lexer, "ENCODING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         if (!lex_force_string (lexer))
+           goto error;
+
+          free (encoding);
+          encoding = ss_xstrdup (lex_tokss (lexer));
+
+         lex_get (lexer);
+       }
       else if (lex_match_id (lexer, "RECORDS"))
        {
          lex_match (lexer, T_EQUALS);
@@ -194,10 +206,10 @@ internal_cmd_print (struct lexer *lexer, struct dataset 
*ds,
 
   if (fh != NULL)
     {
-      trns->writer = dfm_open_writer (fh);
+      trns->writer = dfm_open_writer (fh, encoding);
       if (trns->writer == NULL)
         goto error;
-      trns->encoding = dfm_writer_get_legacy_encoding (trns->writer);
+      trns->encoding = dfm_writer_get_encoding (trns->writer);
     }
   else
     trns->encoding = UTF8;
diff --git a/tests/language/data-io/get-data-txt.at 
b/tests/language/data-io/get-data-txt.at
index 4418974..3ba508c 100644
--- a/tests/language/data-io/get-data-txt.at
+++ b/tests/language/data-io/get-data-txt.at
@@ -568,3 +568,20 @@ x
 100
 ])
 AT_CLEANUP
+
+AT_SETUP([GET DATA /TYPE=TXT with ENCODING subcommand])
+AT_CHECK([i18n-test supports_encodings UTF-8 ISO-8859-1])
+AT_DATA([get-data.sps], [dnl
+set locale='utf-8'
+get data /type=txt /file='data.txt' /encoding='iso-8859-1'
+  /delimiters="," /variables=s a8.
+list.
+])
+printf '\351' > data.txt       # é in ISO-8859-1.
+AT_CHECK([pspp -o pspp.csv get-data.sps])
+AT_CHECK([cat pspp.csv], [0], [dnl
+Table: Data List
+s
+é      @&t@
+])
+AT_CLEANUP
-- 
1.7.2.5




reply via email to

[Prev in Thread] Current Thread [Next in Thread]