guile-commits
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Guile-commits] GNU Guile branch, elisp, updated. release_1-9-1-77-g5b1e


From: Daniel Kraft
Subject: [Guile-commits] GNU Guile branch, elisp, updated. release_1-9-1-77-g5b1ee3b
Date: Wed, 26 Aug 2009 17:48:12 +0000

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU Guile".

http://git.savannah.gnu.org/cgit/guile.git/commit/?id=5b1ee3bef15380745f6c2fc598768d9f3a46e85e

The branch, elisp has been updated
       via  5b1ee3bef15380745f6c2fc598768d9f3a46e85e (commit)
       via  25512a940b30a9c4abe24b20b9ea4be78ab8127e (commit)
       via  157ffbd797b466d80c2a1002a1d40d9b6a89d102 (commit)
       via  a876e7dcea78e770bedba40017fbb225cf88bff5 (commit)
       via  f7118e35525e1c137f2fb96619233610549fae12 (commit)
       via  4c402b889eecaa7ffc61da6656f415c8c983507a (commit)
       via  64bad3f5a8d7351a41a5b9ccb1df5c393a48b4a9 (commit)
       via  5adcdb65192ba6e654ab2d1dd8b0840a33136a8a (commit)
      from  1b1195f29bd73885bcd9fd77e9b2dae0dfa003c0 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 5b1ee3bef15380745f6c2fc598768d9f3a46e85e
Author: Daniel Kraft <address@hidden>
Date:   Wed Aug 26 19:48:06 2009 +0200

    Character and string literal support for the elisp lexer.
    
    * module/language/elisp/lexer.scm: Handle character and string literals.
    * test-suite/tests/elisp-reader.test: Test it.

commit 25512a940b30a9c4abe24b20b9ea4be78ab8127e
Author: Daniel Kraft <address@hidden>
Date:   Wed Aug 26 14:32:48 2009 +0200

    A first, rough lexer for elisp still missing some stuff.
    
    * module/language/elisp/lexer.scm: New lexer file.
    * test-suite/Makefile.am: Register elisp-reader.test as new test.
    * test-suite/tests/elisp-reader.test: New test-case.

commit 157ffbd797b466d80c2a1002a1d40d9b6a89d102
Merge: 1b1195f29bd73885bcd9fd77e9b2dae0dfa003c0 
a876e7dcea78e770bedba40017fbb225cf88bff5
Author: Daniel Kraft <address@hidden>
Date:   Tue Aug 4 09:37:50 2009 +0200

    Merge branch 'master' of git://git.savannah.gnu.org/guile into elisp

-----------------------------------------------------------------------

Summary of changes:
 NEWS                               |   15 ++
 libguile/chars.c                   |   90 +++++-----
 libguile/chars.h                   |   19 +--
 libguile/numbers.h                 |    3 -
 libguile/print.c                   |   20 +--
 libguile/vm-i-system.c             |    8 +-
 module/language/elisp/lexer.scm    |  336 ++++++++++++++++++++++++++++++++++++
 test-suite/Makefile.am             |    1 +
 test-suite/tests/elisp-reader.test |  116 +++++++++++++
 9 files changed, 531 insertions(+), 77 deletions(-)
 create mode 100644 module/language/elisp/lexer.scm
 create mode 100644 test-suite/tests/elisp-reader.test

diff --git a/NEWS b/NEWS
index 445bb1c..96c3a9b 100644
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,21 @@ Please send Guile bug reports to address@hidden
 (During the 1.9 series, we will keep an incremental NEWS for the latest
 prerelease, and a full NEWS corresponding to 1.8 -> 2.0.)
 
+Changes in 1.9.2 (since the 1.9.1 prerelease):
+
+** Global variables `scm_charnames' and `scm_charnums' are removed.
+
+These variables contained the names of control characters and were
+used when writing characters.  While these were global, they were
+never intended to be public API.  They have been replaced with private
+functions.
+
+** EBCDIC support is removed.
+
+There was an EBCDIC compile flag that altered some of the character
+processing.  It appeared that full EBCDIC support was never completed
+and was unmaintained.
+
 Changes in 1.9.1 (since the 1.9.0 prerelease):
 
 ** `scm_set_port_seek' and `scm_set_port_truncate' use the `scm_t_off' type
diff --git a/libguile/chars.c b/libguile/chars.c
index 5a53c45..2103c54 100644
--- a/libguile/chars.c
+++ b/libguile/chars.c
@@ -312,51 +312,45 @@ scm_c_downcase (scm_t_wchar c)
    extensions for control characters, and leftover Guile extensions.
    They are listed in order of precedence.  */
 
-const char *const scm_r5rs_charnames[] = 
-  {
-    "space", "newline"
-  };
+static const char *const scm_r5rs_charnames[] = {
+  "space", "newline"
+};
 
-const scm_t_uint32 const scm_r5rs_charnums[] = 
-  {
-    0x20, 0x0A
-  };
+static const scm_t_uint32 const scm_r5rs_charnums[] = {
+  0x20, 0x0A
+};
 
-const int scm_n_r5rs_charnames = sizeof (scm_r5rs_charnames) / sizeof (char *);
+#define SCM_N_R5RS_CHARNAMES (sizeof (scm_r5rs_charnames) / sizeof (char *))
 
 /* The abbreviated names for control characters.  */
-const char *const scm_C0_control_charnames[] = 
-  {
-    /* C0 controls */
-    "nul", "soh", "stx", "etx", "eot", "enq", "ack", "bel",
-    "bs",  "ht",  "lf",  "vt",  "ff",  "cr",  "so",  "si",
-    "dle", "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb", 
-    "can", "em",  "sub", "esc", "fs",  "gs",  "rs",  "us",
-    "sp", "del"
-  };
-
-const scm_t_uint32 const scm_C0_control_charnums[] = 
-  {
-    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
-    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
-    0x20, 0x7f
-  };
-
-int scm_n_C0_control_charnames = sizeof (scm_C0_control_charnames) / sizeof 
(char *);
-
-const char *const scm_alt_charnames[] = 
-  {
-    "null", "backspace", "tab", "nl", "newline", "np", "page", "return",
-  };
-  
-const scm_t_uint32 const scm_alt_charnums[] = 
-  {
-    0x00, 0x08, 0x09, 0x0a, 0x0a, 0x0c, 0x0c, 0x0d
-  };
-
-const int scm_n_alt_charnames = sizeof (scm_alt_charnames) / sizeof (char *);
+static const char *const scm_C0_control_charnames[] = {
+  /* C0 controls */
+  "nul", "soh", "stx", "etx", "eot", "enq", "ack", "bel",
+  "bs",  "ht",  "lf",  "vt",  "ff",  "cr",  "so",  "si",
+  "dle", "dc1", "dc2", "dc3", "dc4", "nak", "syn", "etb", 
+  "can", "em",  "sub", "esc", "fs",  "gs",  "rs",  "us",
+  "sp", "del"
+};
+
+static const scm_t_uint32 const scm_C0_control_charnums[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+  0x20, 0x7f
+};
+
+#define SCM_N_C0_CONTROL_CHARNAMES (sizeof (scm_C0_control_charnames) / sizeof 
(char *))
+
+static const char *const scm_alt_charnames[] = {
+  "null", "backspace", "tab", "nl", "newline", "np", "page", "return",
+};
+
+static const scm_t_uint32 const scm_alt_charnums[] = {
+  0x00, 0x08, 0x09, 0x0a, 0x0a, 0x0c, 0x0c, 0x0d
+};
+
+#define SCM_N_ALT_CHARNAMES (sizeof (scm_alt_charnames) / sizeof (char *))
 
 /* Returns the string charname for a character if it exists, or NULL
    otherwise.  */
@@ -366,15 +360,15 @@ scm_i_charname (SCM chr)
   int c;
   scm_t_uint32 i = SCM_CHAR (chr);
 
-  for (c = 0; c < scm_n_r5rs_charnames; c++)
+  for (c = 0; c < SCM_N_R5RS_CHARNAMES; c++)
     if (scm_r5rs_charnums[c] == i)
       return scm_r5rs_charnames[c];
 
-  for (c = 0; c < scm_n_C0_control_charnames; c++)
+  for (c = 0; c < SCM_N_C0_CONTROL_CHARNAMES; c++)
     if (scm_C0_control_charnums[c] == i)
       return scm_C0_control_charnames[c];
 
-  for (c = 0; c < scm_n_alt_charnames; c++)
+  for (c = 0; c < SCM_N_ALT_CHARNAMES; c++)
     if (scm_alt_charnums[c] == i)
       return scm_alt_charnames[i];
 
@@ -389,23 +383,23 @@ scm_i_charname_to_char (const char *charname, size_t 
charname_len)
 
   /* The R5RS charnames.  These are supposed to be case
      insensitive. */
-  for (c = 0; c < scm_n_r5rs_charnames; c++)
+  for (c = 0; c < SCM_N_R5RS_CHARNAMES; c++)
     if ((strlen (scm_r5rs_charnames[c]) == charname_len)
        && (!strncasecmp (scm_r5rs_charnames[c], charname, charname_len)))
       return SCM_MAKE_CHAR (scm_r5rs_charnums[c]);
 
   /* Then come the controls.  These are not case sensitive.  */
-  for (c = 0; c < scm_n_C0_control_charnames; c++)
+  for (c = 0; c < SCM_N_C0_CONTROL_CHARNAMES; c++)
     if ((strlen (scm_C0_control_charnames[c]) == charname_len)
        && (!strncasecmp (scm_C0_control_charnames[c], charname, charname_len)))
       return SCM_MAKE_CHAR (scm_C0_control_charnums[c]);
 
   /* Lastly are some old names carried over for compatibility.  */
-  for (c = 0; c < scm_n_alt_charnames; c++)
+  for (c = 0; c < SCM_N_ALT_CHARNAMES; c++)
     if ((strlen (scm_alt_charnames[c]) == charname_len)
        && (!strncasecmp (scm_alt_charnames[c], charname, charname_len)))
       return SCM_MAKE_CHAR (scm_alt_charnums[c]);
-  
+
   return SCM_BOOL_F;
 }
 
diff --git a/libguile/chars.h b/libguile/chars.h
index e68f06d..4d1be1d 100644
--- a/libguile/chars.h
+++ b/libguile/chars.h
@@ -24,28 +24,23 @@
 
 
 #include "libguile/__scm.h"
+#include "libguile/numbers.h"
 
 
 /* Immediate Characters
  */
-
-#ifndef SCM_WCHAR_DEFINED
-typedef scm_t_int32 scm_t_wchar;
-#define SCM_WCHAR_DEFINED
-#endif
-
 #define SCM_CHARP(x) (SCM_ITAG8(x) == scm_tc8_char)
 #define SCM_CHAR(x) ((scm_t_wchar)SCM_ITAG8_DATA(x))
 
-#define SCM_MAKE_CHAR(x) ({scm_t_int32 _x = (x);                        \
-      _x < 0                                                            \
-        ? SCM_MAKE_ITAG8((scm_t_bits)(unsigned char)_x, scm_tc8_char)   \
-        : SCM_MAKE_ITAG8((scm_t_bits)_x, scm_tc8_char);})
+#define SCM_MAKE_CHAR(x)                                              \
+  (x < 0                                                              \
+   ? SCM_MAKE_ITAG8 ((scm_t_bits) (unsigned char) x, scm_tc8_char)    \
+   : SCM_MAKE_ITAG8 ((scm_t_bits) x, scm_tc8_char))
 
 #define SCM_CODEPOINT_MAX (0x10ffff)
 #define SCM_IS_UNICODE_CHAR(c)                                          \
-  ((scm_t_wchar)(c)<=0xd7ff ||                                          \
-   ((scm_t_wchar)(c)>=0xe000 && (scm_t_wchar)(c)<=SCM_CODEPOINT_MAX))
+  ((scm_t_wchar) (c) <= 0xd7ff                                          \
+   || ((scm_t_wchar) (c) >= 0xe000 && (scm_t_wchar) (c) <= SCM_CODEPOINT_MAX))
 
 
 
diff --git a/libguile/numbers.h b/libguile/numbers.h
index f30f7d0..bb72d7a 100644
--- a/libguile/numbers.h
+++ b/libguile/numbers.h
@@ -174,10 +174,7 @@ typedef struct scm_t_complex
   double imag;
 } scm_t_complex;
 
-#ifndef SCM_WCHAR_DEFINED
 typedef scm_t_int32 scm_t_wchar;
-#define SCM_WCHAR_DEFINED
-#endif
 
 
 
diff --git a/libguile/print.c b/libguile/print.c
index 1a5aebe..f43856b 100644
--- a/libguile/print.c
+++ b/libguile/print.c
@@ -454,22 +454,16 @@ iprin1 (SCM exp, SCM port, scm_print_state *pstate)
                 /* Print the character if is graphic character.  */
                 {
                   if (i<256)
-                    {
-                      /* Character is graphic.  Print it.  */
-                      scm_putc (i, port);
-                    }
+                    /* Character is graphic.  Print it.  */
+                    scm_putc (i, port);
                   else
-                    {
-                      /* Character is graphic but unrepresentable in
-                         this port's encoding.  */
-                      scm_intprint (i, 8, port);
-                    }
+                    /* Character is graphic but unrepresentable in
+                       this port's encoding.  */
+                    scm_intprint (i, 8, port);
                 }
               else
-                {
-                  /* Character is a non-graphical character.  */
-                  scm_intprint (i, 8, port);
-                }
+                /* Character is a non-graphical character.  */
+                scm_intprint (i, 8, port);
            }
          else
            scm_putc (i, port);
diff --git a/libguile/vm-i-system.c b/libguile/vm-i-system.c
index 5a4809d..ad0ec54 100644
--- a/libguile/vm-i-system.c
+++ b/libguile/vm-i-system.c
@@ -177,7 +177,13 @@ VM_DEFINE_INSTRUCTION (15, make_uint64, "make-uint64", 8, 
0, 1)
 
 VM_DEFINE_INSTRUCTION (16, make_char8, "make-char8", 1, 0, 1)
 {
-  PUSH (SCM_MAKE_CHAR (FETCH ()));
+  scm_t_uint8 v = 0;
+  v = FETCH ();
+
+  PUSH (SCM_MAKE_CHAR (v));
+  /* Don't simplify this to PUSH (SCM_MAKE_CHAR (FETCH ())).  The
+     contents of SCM_MAKE_CHAR may be evaluated more than once,
+     resulting in a double fetch.  */
   NEXT;
 }
 
diff --git a/module/language/elisp/lexer.scm b/module/language/elisp/lexer.scm
new file mode 100644
index 0000000..3dbce86
--- /dev/null
+++ b/module/language/elisp/lexer.scm
@@ -0,0 +1,336 @@
+;;; Guile Emac Lisp
+
+;; Copyright (C) 2001 Free Software Foundation, Inc.
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;; 
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+;; 
+;; You should have received a copy of the GNU General Public License
+;; along with this program; see the file COPYING.  If not, write to
+;; the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+;; Boston, MA 02111-1307, USA.
+
+;;; Code:
+
+(define-module (language elisp lexer)
+  #:use-module (ice-9 regex)
+  #:export (get-lexer))
+
+; This is the lexical analyzer for the elisp reader.  It is hand-written
+; instead of using some generator because I think that's most viable in this
+; case and easy enough.
+
+; Characters are handled internally as integers representing their
+; code value.  This is necessary because elisp allows a lot of fancy modifiers
+; that set certain high-range bits and the resulting values would not fit
+; into a real Scheme character range.  Additionally, elisp wants characters
+; as integers, so we just do the right thing...
+
+; TODO: Circular syntax markers like #1= or #1#
+; TODO: address@hidden comments
+
+
+; Report an error from the lexer (that is, invalid input given).
+
+(define (lexer-error port msg . args)
+  (apply error msg args))
+
+
+; In a character, set a given bit.  This is just some bit-wise or'ing on the
+; characters integer code and converting back to character.
+
+(define (set-char-bit chr bit)
+  (logior chr (ash 1 bit)))
+
+
+; Check if a character equals some other.  This is just like char=? except that
+; the tested one could be EOF in which case it simply isn't equal.
+
+(define (is-char? tested should-be)
+  (and (not (eof-object? tested))
+       (char=? tested should-be)))
+
+
+; For a character (as integer code), find the real character it represents or
+; #\nul if out of range.  This is used to work with Scheme character functions
+; like char-numeric?.
+
+(define (real-character chr)
+  (if (< chr 256)
+    (integer->char chr)
+    #\nul))
+
+
+; Return the control modified version of a character.  This is not just setting
+; a modifier bit, because ASCII conrol characters must be handled as such, and
+; in elisp C-? is the delete character for historical reasons.
+; Otherwise, we set bit 26.
+
+(define (add-control chr)
+  (let ((real (real-character chr)))
+    (if (char-alphabetic? real)
+      (- (char->integer (char-upcase real)) (char->integer #\@))
+      (case real
+        ((#\?) 127)
+        ((#\@) 0)
+        (else (set-char-bit chr 26))))))
+
+
+; Parse a charcode given in some base, basically octal or hexadecimal are
+; needed.  A requested number of digits can be given (#f means it does
+; not matter and arbitrary many are allowed), and additionally early
+; return allowed (if fewer valid digits are found).
+; These options are all we need to handle the \u, \U, \x and \ddd (octal 
digits)
+; escape sequences.
+
+(define (charcode-escape port base digits early-return)
+  (let iterate ((result 0)
+                (procdigs 0))
+    (if (and digits (>= procdigs digits))
+      result
+      (let* ((cur (read-char port))
+             (value (cond
+                      ((char-numeric? cur)
+                       (- (char->integer cur) (char->integer #\0)))
+                      ((char-alphabetic? cur)
+                       (let ((code (- (char->integer (char-upcase cur))
+                                      (char->integer #\A))))
+                         (if (< code 0)
+                           #f
+                           (+ code 10))))
+                      (else #f)))
+             (valid (and value (< value base))))
+        (if (not valid)
+          (if (or (not digits) early-return)
+            (begin
+              (unread-char cur port)
+              result)
+            (lexer-error port "invalid digit in escape-code" base cur))
+          (iterate (+ (* result base) value) (1+ procdigs)))))))
+
+
+; Read a character and process escape-sequences when necessary.  The special
+; in-string argument defines if this character is part of a string literal or
+; a single character literal, the difference being that in strings the
+; meta modifier sets bit 7, while it is bit 27 for characters.
+
+(define basic-escape-codes
+  '((#\a . 7) (#\b . 8) (#\t . 9)
+    (#\n . 10) (#\v . 11) (#\f . 12) (#\r . 13)
+    (#\e . 27) (#\s . 32) (#\d . 127)))
+
+(define (get-character port in-string)
+  (let ((meta-bits `((#\A . 22) (#\s . 23) (#\H . 24)
+                     (#\S . 25) (#\M . ,(if in-string 7 27))))
+        (cur (read-char port)))
+    (if (char=? cur #\\)
+
+      ; Handle an escape-sequence.
+      (let* ((escaped (read-char port))
+             (esc-code (assq-ref basic-escape-codes escaped))
+             (meta (assq-ref meta-bits escaped)))
+        (cond
+
+          ; Meta-check must be before esc-code check because \s- must be
+          ; recognized as the super-meta modifier if a - follows.
+          ; If not, it will be caught as \s -> space escape code.
+          ((and meta (is-char? (peek-char port) #\-))
+           (if (not (char=? (read-char port) #\-))
+             (error "expected - after control sequence"))
+           (set-char-bit (get-character port in-string) meta))
+
+          ; One of the basic control character escape names?
+          (esc-code esc-code)
+
+          ; Handle \ddd octal code if it is one.
+          ((and (char>=? escaped #\0) (char<? escaped #\8))
+           (begin
+             (unread-char escaped port)
+             (charcode-escape port 8 3 #t)))
+
+          ; Check for some escape-codes directly or otherwise
+          ; use the escaped character literally.
+          (else
+            (case escaped
+              ((#\^) (add-control (get-character port in-string)))
+              ((#\C)
+               (if (is-char? (peek-char port) #\-)
+                 (begin
+                   (if (not (char=? (read-char port) #\-))
+                     (error "expected - after control sequence"))
+                   (add-control (get-character port in-string)))
+                 escaped))
+              ((#\x) (charcode-escape port 16 #f #t))
+              ((#\u) (charcode-escape port 16 4 #f))
+              ((#\U) (charcode-escape port 16 8 #f))
+              (else (char->integer escaped))))))
+
+      ; No escape-sequence, just the literal character.
+      ; But remember to get the code instead!
+      (char->integer cur))))
+
+
+; Read a symbol or number from a port until something follows that marks the
+; start of a new token (like whitespace or parentheses).  The data read is
+; returned as a string for further conversion to the correct type, but we also
+; return what this is (integer/float/symbol).
+; If any escaped character is found, it must be a symbol.  Otherwise we
+; at the end check the result-string against regular expressions to determine
+; if it is possibly an integer or a float.
+
+(define integer-regex (make-regexp "^[+-]?[0-9]+\\.?$"))
+(define float-regex
+  (make-regexp "^[+-]?([0-9]+\\.?[0-9]*|[0-9]*\\.?[0-9]+)(e[+-]?[0-9]+)?$"))
+
+; A dot is also allowed literally, only a single dort alone is parsed as the
+; 'dot' terminal for dotted lists.
+(define no-escape-punctuation (string->char-set "-+=*/address@hidden&:<>{}?."))
+
+(define (get-symbol-or-number port)
+  (let iterate ((result-chars '())
+                (had-escape #f))
+    (let* ((c (read-char port))
+           (finish (lambda ()
+                     (let ((result (list->string (reverse result-chars))))
+                       (values
+                         (cond
+                           ((and (not had-escape)
+                                 (regexp-exec integer-regex result))
+                            'integer)
+                           ((and (not had-escape)
+                                 (regexp-exec float-regex result))
+                            'float)
+                           (else 'symbol))
+                         result))))
+           (need-no-escape? (lambda (c)
+                              (or (char-numeric? c)
+                                  (char-alphabetic? c)
+                                  (char-set-contains? no-escape-punctuation
+                                                      c)))))
+      (cond
+        ((eof-object? c) (finish))
+        ((need-no-escape? c) (iterate (cons c result-chars) had-escape))
+        ((char=? c #\\) (iterate (cons (read-char port) result-chars) #t))
+        (else
+          (unread-char c port)
+          (finish))))))
+  
+
+; Main lexer routine, which is given a port and does look for the next token.
+
+(define (lex port)
+  (let ((return (let ((file (if (file-port? port) (port-filename port) #f))
+                      (line (1+ (port-line port)))
+                      (column (1+ (port-column port))))
+                  (lambda (token value)
+                    (let ((obj (cons token value)))
+                      (set-source-property! obj 'filename file)
+                      (set-source-property! obj 'line line)
+                      (set-source-property! obj 'column column)
+                      obj))))
+        ; Read afterwards so the source-properties are correct above
+        ; and actually point to the very character to be read.
+        (c (read-char port)))
+    (cond
+
+      ; End of input must be specially marked to the parser.
+      ((eof-object? c) '*eoi*)
+
+      ; Whitespace, just skip it.
+      ((char-whitespace? c) (lex port))
+
+      ; The dot is only the one for dotted lists if followed by
+      ; whitespace.  Otherwise it is considered part of a number of symbol.
+      ((and (char=? c #\.)
+            (char-whitespace? (peek-char port)))
+       (return 'dot #f))
+
+      ; Continue checking for literal character values.
+      (else
+        (case c
+
+          ; A line comment, skip until end-of-line is found.
+          ((#\;)
+           (let iterate ()
+             (let ((cur (read-char port)))
+               (if (or (eof-object? cur) (char=? cur #\newline))
+                 (lex port)
+                 (iterate)))))
+
+          ; A character literal.
+          ((#\?)
+           (return 'character (get-character port #f)))
+
+          ; A literal string.  This is mainly a sequence of characters just
+          ; as in the character literals, the only difference is that escaped
+          ; newline and space are to be completely ignored and that 
meta-escapes
+          ; set bit 7 rather than bit 27.
+          ((#\")
+           (let iterate ((result-chars '()))
+             (let ((cur (read-char port)))
+               (case cur
+                 ((#\")
+                  (return 'string (list->string (reverse result-chars))))
+                 ((#\\)
+                  (let ((escaped (read-char port)))
+                    (case escaped
+                      ((#\newline #\space)
+                       (iterate result-chars))
+                      (else
+                        (unread-char escaped port)
+                        (unread-char cur port)
+                        (iterate (cons (integer->char (get-character port #t))
+                                       result-chars))))))
+                 (else (iterate (cons cur result-chars)))))))
+
+          ; Parentheses and other special-meaning single characters.
+          ((#\() (return 'paren-open #f))
+          ((#\)) (return 'paren-close #f))
+          ((#\[) (return 'square-open #f))
+          ((#\]) (return 'square-close #f))
+          ((#\') (return 'quote #f))
+          ((#\`) (return 'backquote #f))
+          ((#\,) (return 'unquote #f))
+
+          ; Remaining are numbers and symbols.  Process input until next
+          ; whitespace is found, and see if it looks like a number
+          ; (float/integer) or symbol and return accordingly.
+          (else
+            (unread-char c port)
+            (call-with-values
+              (lambda ()
+                (get-symbol-or-number port))
+              (lambda (type str)
+                (case type
+                  ((symbol) (return 'symbol (string->symbol str)))
+                  ((integer)
+                   ; In elisp, something like "1." is an integer, while
+                   ; string->number returns an inexact real.  Thus we
+                   ; need a conversion here, but it should always result in
+                   ; an integer!
+                   (return 'integer
+                           (let ((num (inexact->exact (string->number str))))
+                             (if (not (integer? num))
+                               (error "expected integer" str num))
+                             num)))
+                  ((float)
+                   (return 'float (let ((num (string->number str)))
+                                    (if (exact? num)
+                                      (error "expected inexact float" str num))
+                                    num)))
+                  (else (error "wrong number/symbol type" type)))))))))))
+
+
+; Build a lexer thunk for a port.  This is the exported routine which can be
+; used to create a lexer for the parser to use.
+
+(define (get-lexer port)
+  (lambda ()
+    (lex port)))
diff --git a/test-suite/Makefile.am b/test-suite/Makefile.am
index 17e5f1b..cf575a2 100644
--- a/test-suite/Makefile.am
+++ b/test-suite/Makefile.am
@@ -33,6 +33,7 @@ SCM_TESTS = tests/alist.test                  \
            tests/continuations.test            \
            tests/elisp.test                    \
            tests/elisp-compiler.text           \
+           tests/elisp-reader.text             \
            tests/environments.test             \
            tests/eval.test                     \
            tests/exceptions.test               \
diff --git a/test-suite/tests/elisp-reader.test 
b/test-suite/tests/elisp-reader.test
new file mode 100644
index 0000000..15d5344
--- /dev/null
+++ b/test-suite/tests/elisp-reader.test
@@ -0,0 +1,116 @@
+;;;; elisp-reader.test --- Test the reader used by the Elisp compiler.
+;;;;
+;;;; Copyright (C) 2009 Free Software Foundation, Inc.
+;;;; Daniel Kraft
+;;;;
+;;;; This library is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU Lesser General Public
+;;;; License as published by the Free Software Foundation; either
+;;;; version 3 of the License, or (at your option) any later version.
+;;;; 
+;;;; This library is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;;; Lesser General Public License for more details.
+;;;; 
+;;;; You should have received a copy of the GNU Lesser General Public
+;;;; License along with this library; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
USA
+
+(define-module (test-elisp-reader)
+  :use-module (test-suite lib)
+  :use-module (language elisp lexer))
+
+
+; 
==============================================================================
+; Test the lexer.
+
+; This is of course somewhat redundant with the full parser checks, but 
probably
+; can't hurt and is useful in developing the lexer itself.
+
+(define (get-string-lexer str)
+  (call-with-input-string str get-lexer))
+
+(define (lex-string str)
+  (let ((lexer (get-string-lexer str)))
+    (let iterate ((result '()))
+      (let ((token (lexer)))
+        (if (eq? token '*eoi*)
+          (reverse result)
+          (iterate (cons token result)))))))
+
+(with-test-prefix "Lexer"
+
+  (let ((lexer (get-string-lexer "")))
+    (pass-if "end-of-input"
+      (and (eq? (lexer) '*eoi*)
+           (eq? (lexer) '*eoi*)
+           (eq? (lexer) '*eoi*))))
+
+  (pass-if "single character tokens"
+    (equal? (lex-string "()[]'`, . ")
+      '((paren-open . #f) (paren-close . #f)
+        (square-open . #f) (square-close . #f)
+        (quote . #f) (backquote . #f) (unquote . #f) (dot . #f))))
+
+  (pass-if "whitespace and comments"
+    (equal? (lex-string "   (\n\t) ; this is a comment\n.   ; until eof")
+      '((paren-open . #f) (paren-close . #f) (dot . #f))))
+
+  (pass-if "source properties"
+    (let ((x (car (lex-string "\n\n  \n  .  \n"))))
+      (and (= (source-property x 'line) 4)
+           (= (source-property x 'column) 3))))
+
+  (pass-if "symbols"
+    (equal? (lex-string "foo FOO char-to-string 1+ \\+1
+                         \\(*\\ 1\\ 2\\)
+                         +-*/address@hidden&=:<>{}
+                         abc(def)ghi .e5")
+            `((symbol . foo) (symbol . FOO) (symbol . char-to-string)
+              (symbol . 1+) (symbol . ,(string->symbol "+1"))
+              (symbol . ,(string->symbol "(* 1 2)"))
+              (symbol . +-*/address@hidden&=:<>{})
+              (symbol . abc) (paren-open . #f) (symbol . def)
+              (paren-close . #f) (symbol . ghi) (symbol . .e5))))
+
+  ; Here we make use of the property that exact/inexact numbers are not equal?
+  ; even when they have the same numeric value!
+  (pass-if "integers"
+    (equal? (lex-string "-1 1 1. +1 01234")
+            '((integer . -1) (integer . 1) (integer . 1) (integer . 1)
+              (integer . 1234))))
+  (pass-if "floats"
+    (equal? (lex-string "1500.0 15e2 15.e2 1.5e3 .15e4 -.345e-2")
+            '((float . 1500.0) (float . 1500.0) (float . 1500.0)
+              (float . 1500.0) (float . 1500.0)
+              (float . -0.00345))))
+
+  ; Check string lexing, this also checks basic character escape sequences
+  ; that are then (hopefully) also correct for character literals.
+  (pass-if "strings"
+    (equal? (lex-string "\"foo\\nbar
+test\\
+\\\"ab\\\"\\\\ ab\\ cd
+\\418\\0415\\u0041\\U0000000A\\Xab\\x0000000000000004fG.\"  ")
+            '((string . "foo\nbar
+test\"ab\"\\ abcd
+!8!5A\nXabOG."))))
+  (pass-if "ASCII control characters and meta in strings"
+    (equal? (lex-string "\"address@hidden"")
+            '((string . "\x7F\x01\x01\x1A\xC2\x80\x81"))))
+
+  ; Character literals, taking into account that some escape sequences were
+  ; already checked in the strings.
+  (pass-if "characters"
+    (equal? (lex-string "?A?\\z ? ?\\x21 ?\\^j ?\\\\?\\n?\\\n")
+            `((character . 65) (character . ,(char->integer #\z))
+              (character . 32) (character . ,(char->integer #\!))
+              (character . 10) (character . ,(char->integer #\\))
+              (character . 10) (character . 10))))
+  (pass-if "meta characters"
+    (equal? (map cdr (lex-string "?\\C-[?\\M-\\S-Z?\\^X?\\A-\\s-\\H-\\s"))
+            `(,(+ (expt 2 26) (char->integer #\[))
+              ,(+ (expt 2 27) (expt 2 25) (char->integer #\Z))
+              ,(- (char->integer #\X) (char->integer #\@))
+              ,(+ (expt 2 22) (expt 2 23) (expt 2 24) 32)))))


hooks/post-receive
-- 
GNU Guile




reply via email to

[Prev in Thread] Current Thread [Next in Thread]