qemacs-commit
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Qemacs-commit] qemacs buffer.c charset.c dired.c extras.c orgm...


From: Charlie Gordon
Subject: [Qemacs-commit] qemacs buffer.c charset.c dired.c extras.c orgm...
Date: Wed, 05 Feb 2014 00:56:50 +0000

CVSROOT:        /sources/qemacs
Module name:    qemacs
Changes by:     Charlie Gordon <chqrlie>        14/02/05 00:56:50

Modified files:
        .              : buffer.c charset.c dired.c extras.c orgmode.c 
                         qe.c qe.h shell.c unihex.c 
        libqhtml       : xmlparse.c 

Log message:
        add support for end of line types
        
        * support 3 types of end of line: Unix, Dos and old style Mac
        * detect eol type automatically in detect_charset
        * set buffer eol_type together with charset
        * transparently convert eol sequence to \n upon reading buffer
          characters in eb_nextc().
        * transparently convert \n to eol sequence in eb_encode_uchar()
        * handle eol types in convert-file-buffer-coding-system: either preserve
          current value or force it via charset suffix -unix, -dos and -mac
        * change read_charset() to accept eol_type suffix
        * add eol_type and eol_char to CharsetDecodeState
        * change charset methods to take CharsetDecodeState instead of charset
          pointers
        * display eol_type in mode line if not unix

CVSWeb URLs:
http://cvs.savannah.gnu.org/viewcvs/qemacs/buffer.c?cvsroot=qemacs&r1=1.70&r2=1.71
http://cvs.savannah.gnu.org/viewcvs/qemacs/charset.c?cvsroot=qemacs&r1=1.29&r2=1.30
http://cvs.savannah.gnu.org/viewcvs/qemacs/dired.c?cvsroot=qemacs&r1=1.35&r2=1.36
http://cvs.savannah.gnu.org/viewcvs/qemacs/extras.c?cvsroot=qemacs&r1=1.18&r2=1.19
http://cvs.savannah.gnu.org/viewcvs/qemacs/orgmode.c?cvsroot=qemacs&r1=1.11&r2=1.12
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.c?cvsroot=qemacs&r1=1.143&r2=1.144
http://cvs.savannah.gnu.org/viewcvs/qemacs/qe.h?cvsroot=qemacs&r1=1.135&r2=1.136
http://cvs.savannah.gnu.org/viewcvs/qemacs/shell.c?cvsroot=qemacs&r1=1.83&r2=1.84
http://cvs.savannah.gnu.org/viewcvs/qemacs/unihex.c?cvsroot=qemacs&r1=1.20&r2=1.21
http://cvs.savannah.gnu.org/viewcvs/qemacs/libqhtml/xmlparse.c?cvsroot=qemacs&r1=1.25&r2=1.26

Patches:
Index: buffer.c
===================================================================
RCS file: /sources/qemacs/qemacs/buffer.c,v
retrieving revision 1.70
retrieving revision 1.71
diff -u -b -r1.70 -r1.71
--- buffer.c    4 Feb 2014 22:47:31 -0000       1.70
+++ buffer.c    5 Feb 2014 00:56:48 -0000       1.71
@@ -485,6 +485,7 @@
     /* initialize default mode stuff */
     b->tab_width = qs->default_tab_width;
     b->fill_column = qs->default_fill_column;
+    b->eol_type = qs->default_eol_type;
 
     /* add buffer in global buffer list (at end for system buffers) */
     pb = &qs->first_buffer;
@@ -496,10 +497,10 @@
     *pb = b;
 
     if (flags & BF_UTF8) {
-        eb_set_charset(b, &charset_utf8);
+        eb_set_charset(b, &charset_utf8, b->eol_type);
     } else {
         /* CG: default charset should be selectable */
-        eb_set_charset(b, &charset_8859_1);
+        eb_set_charset(b, &charset_8859_1, b->eol_type);
     }
 
     /* add mark move callback */
@@ -1104,20 +1105,21 @@
 /************************************************************/
 /* line related functions */
 
-void eb_set_charset(EditBuffer *b, QECharset *charset)
+void eb_set_charset(EditBuffer *b, QECharset *charset, EOLType eol_type)
 {
     int n;
 
     if (b->charset) {
         charset_decode_close(&b->charset_state);
     }
+    b->eol_type = eol_type;
     b->charset = charset;
     b->flags &= ~BF_UTF8;
     if (charset == &charset_utf8)
         b->flags |= BF_UTF8;
 
     if (charset)
-        charset_decode_init(&b->charset_state, charset);
+        charset_decode_init(&b->charset_state, charset, eol_type);
 
     b->char_bytes = 1;
     b->char_shift = 0;
@@ -1168,11 +1170,21 @@
         /* we use the charset conversion table directly to go faster */
         ch = b->charset_state.table[buf[0]];
         offset++;
-        if (ch == ESCAPE_CHAR) {
+        if (ch == ESCAPE_CHAR || ch == '\r') {
             eb_read(b, offset, buf + 1, MAX_CHAR_BYTES - 1);
             b->charset_state.p = buf;
             ch = b->charset_state.decode_func(&b->charset_state);
             offset += (b->charset_state.p - buf) - 1;
+            if (ch == '\r') {
+                if (b->eol_type == EOL_DOS
+                &&  b->charset_state.decode_func(&b->charset_state) == '\n') {
+                    ch = '\n';
+                    offset += b->charset_state.char_size;
+                } else
+                if (b->eol_type == EOL_MAC) {
+                    ch = '\n';
+                }
+            }                    
         }
     }
     *next_ptr = offset;
@@ -1259,6 +1271,12 @@
             b->charset_state.p = q;
             ch = b->charset_state.decode_func(&b->charset_state);
         }
+        if (ch == '\n' && b->eol_type == EOL_DOS && offset >= char_size) {
+            eb_read(b, offset - char_size, buf, char_size);
+            b->charset_state.p = buf;
+            if (b->charset_state.decode_func(&b->charset_state) == '\r')
+                offset -= char_size;
+        }
     }
  the_end:
     *prev_ptr = offset;
@@ -1290,7 +1308,7 @@
             /* compute offset */
             if (line < line1) {
                 /* seek to the correct line */
-                offset += b->charset->goto_line_func(b->charset,
+                offset += b->charset->goto_line_func(&b->charset_state,
                     p->data, p->size, line1 - line);
                 line = line1;
                 col = 0;
@@ -1359,7 +1377,7 @@
     int offset;
     Page *p, *p_end;
 
-    if (!b->charset->variable_size) {
+    if (!b->charset->variable_size && b->eol_type != EOL_DOS) {
         offset = min(pos * b->charset->char_size, b->total_size);
     } else {
         offset = 0;
@@ -1368,10 +1386,10 @@
         while (p < p_end) {
             if (!(p->flags & PG_VALID_CHAR)) {
                 p->flags |= PG_VALID_CHAR;
-                p->nb_chars = b->charset->get_chars_func(b->charset, p->data, 
p->size);
+                p->nb_chars = b->charset->get_chars_func(&b->charset_state, 
p->data, p->size);
             }
             if (pos < p->nb_chars) {
-                offset += b->charset->goto_char_func(b->charset, p->data, 
p->size, pos);
+                offset += b->charset->goto_char_func(&b->charset_state, 
p->data, p->size, pos);
                 break;
             } else {
                 pos -= p->nb_chars;
@@ -1392,10 +1410,12 @@
     if (offset < 0)
         offset = 0;
 
-    if (!b->charset->variable_size) {
+    if (!b->charset->variable_size && b->eol_type != EOL_DOS) {
         /* offset is round down to character boundary */
         pos = min(offset, b->total_size) / b->charset->char_size;
     } else {
+        /* XXX: should handle rounding if EOL_DOS */
+        /* XXX: should fix buffer offset via charset specific method */
         if (b->charset == &charset_utf8) {
             /* Round offset down to character boundary */
             u8 buf[1];
@@ -1413,10 +1433,10 @@
         while (p < p_end) {
             if (!(p->flags & PG_VALID_CHAR)) {
                 p->flags |= PG_VALID_CHAR;
-                p->nb_chars = b->charset->get_chars_func(b->charset, p->data, 
p->size);
+                p->nb_chars = b->charset->get_chars_func(&b->charset_state, 
p->data, p->size);
             }
             if (offset < p->size) {
-                pos += b->charset->get_chars_func(b->charset, p->data, offset);
+                pos += b->charset->get_chars_func(&b->charset_state, p->data, 
offset);
                 break;
             } else {
                 pos += p->nb_chars;
@@ -1716,7 +1736,7 @@
     eb_set_buffer_name(b, get_basename(filename));
 }
 
-/* Encode unicode character according to buffer charset */
+/* Encode unicode character according to buffer charset and eol_type */
 /* Return number of bytes of conversion */
 /* the function uses '?' to indicate that no match could be found in
    buffer charset */
@@ -1725,6 +1745,14 @@
     QECharset *charset = b->charset;
     u8 *q = (u8 *)buf;
 
+    if (c == '\n') {
+        if (b->eol_type == EOL_MAC)
+            c = '\r';
+        else
+        if (b->eol_type == EOL_DOS) {
+            q = charset->encode_func(charset, q, '\r');
+        }
+    }
     q = charset->encode_func(charset, q, c);
     if (!q) {
         q = (u8 *)buf;
@@ -1749,7 +1777,7 @@
 /* Return number of bytes inserted */
 int eb_insert_utf8_buf(EditBuffer *b, int offset, const char *buf, int len)
 {
-    if (b->charset == &charset_utf8) {
+    if (b->charset == &charset_utf8 && b->eol_type == EOL_UNIX) {
         return eb_insert(b, offset, buf, len);
     } else {
         char buf1[1024];
@@ -1835,7 +1863,7 @@
         vsnprintf(buf, size, fmt, ap);
         va_end(ap);
     }
-    /* CG: insert buffer translating according b->charset.
+    /* CG: insert buf encoding according to b->charset and b->eol_type.
      * buf may contain \0 characters via the %c modifer.
      * XXX: %c does not encode non ASCII characters as utf8.
      */
@@ -1867,11 +1895,12 @@
 }
 #endif
 
-/* Read the comtents of a buffer encoded in a utf8 string */
+/* Read the contents of a buffer encoded in a utf8 string */
 int eb_get_contents(EditBuffer *b, char *buf, int buf_size)
 {
     /* do not use eb_read if overflow to avoid partial characters */
-    if (b->charset == &charset_utf8 && b->total_size < buf_size) {
+    if (b->charset == &charset_utf8 && b->eol_type == EOL_UNIX
+    &&  b->total_size < buf_size) {
         int len = b->total_size;
         eb_read(b, 0, buf, len);
         buf[len] = '\0';
@@ -1902,7 +1931,9 @@
 {
     int styles_flags = min((dest->flags & BF_STYLES), (src->flags & 
BF_STYLES));
 
-    if (dest->charset == src->charset && !styles_flags) {
+    if (dest->charset == src->charset
+    &&  dest->eol_type == src->eol_type
+    &&  !styles_flags) {
         return eb_insert_buffer(dest, dest_offset, src, src_offset, size);
     } else {
         EditBuffer *b;
@@ -1912,7 +1943,7 @@
         if (!styles_flags
         &&  ((b->flags & BF_SAVELOG) || dest_offset != b->total_size)) {
             b = eb_new("*tmp*", BF_SYSTEM);
-            eb_set_charset(b, dest->charset);
+            eb_set_charset(b, dest->charset, dest->eol_type);
             offset1 = 0;
         }
 

Index: charset.c
===================================================================
RCS file: /sources/qemacs/qemacs/charset.c,v
retrieving revision 1.29
retrieving revision 1.30
diff -u -b -r1.29 -r1.30
--- charset.c   4 Feb 2014 22:47:31 -0000       1.29
+++ charset.c   5 Feb 2014 00:56:49 -0000       1.30
@@ -30,6 +30,9 @@
  * spacing and enclosing combining characters and control chars.
  */
 
+/* XXX: This table is incomplete, should compute from UnicodeData.txt
+ * via a specialized utility
+ */
 static unsigned int const unicode_glyph_ranges[] = {
     0x10FF, 1, 0x115f, 2,     /*  0: Hangul Jamo */
     0x2328, 1, 0x232a, 2,     /*  2: wide Angle brackets */
@@ -309,7 +312,7 @@
     line = 0;
     lp = p = buf;
     p1 = p + size;
-    nl = s->charset->eol_char;
+    nl = s->eol_char;
 
     for (;;) {
         p = memchr(p, nl, p1 - p);
@@ -330,16 +333,29 @@
     *col_ptr = col;
 }
 
-static int charset_get_chars_utf8(QECharset *charset, const u8 *buf, int size)
+static int charset_get_chars_utf8(CharsetDecodeState *s,
+                                  const u8 *buf, int size)
 {
     int nb_chars, c;
     const u8 *buf_end, *buf_ptr;
 
     nb_chars = 0;
     buf_ptr = buf;
-    buf_end = buf + size;
+    buf_end = buf_ptr + size;
     while (buf_ptr < buf_end) {
         c = *buf_ptr++;
+        if (c == '\n' && s->eol_type == EOL_DOS) {
+            /* ignore \n in EOL_DOS scan, but count \r.
+             * XXX: potentially incorrect if buffer contains
+             * \n not preceded by \r and requires special state
+             * data to handle \r\n sequence at page boundary.
+             */
+            continue;
+        }
+        /* ignoring trailing bytes: will produce incorrect
+         * count on isolated and trailing bytes and overlong
+         * sequences.
+         */
         if (c < 0x80 || c >= 0xc0)
             nb_chars++;
     }
@@ -347,21 +363,30 @@
      * utf-8 sequence at start of buffer is ignored in count while
      * incomplete utf-8 sequence at end of buffer is counted.  This may
      * cause problems when counting characters with eb_get_pos with an
-     * offset falling indside an utf-8 sequence.
+     * offset falling inside a utf-8 sequence, and will produce
+     * incorrect counts on broken utf-8 sequences spanning page
+     * boundaries.
      */
     return nb_chars;
 }
 
-static int charset_goto_char_utf8(QECharset *charset, const u8 *buf, int size, 
int pos)
+static int charset_goto_char_utf8(CharsetDecodeState *s,
+                                  const u8 *buf, int size, int pos)
 {
     int nb_chars, c;
     const u8 *buf_ptr, *buf_end;
 
     nb_chars = 0;
     buf_ptr = buf;
-    buf_end = buf + size;
+    buf_end = buf_ptr + size;
     while (buf_ptr < buf_end) {
         c = *buf_ptr;
+        if (c == '\n' && s->eol_type == EOL_DOS) {
+            /* ignore \n in EOL_DOS scan, but count \r.
+             * see comment above. 
+             */
+            continue;
+        }
         if (c < 0x80 || c >= 0xc0) {
             /* Test done here to skip initial trailing bytes if any */
             if (nb_chars >= pos)
@@ -396,6 +421,7 @@
 
 static int decode_ucs2le(CharsetDecodeState *s)
 {
+    /* XXX: should handle surrogates */
     const u8 *p;
 
     p = s->p;
@@ -405,6 +431,7 @@
 
 static u8 *encode_ucs2le(__unused__ QECharset *charset, u8 *p, int c)
 {
+    /* XXX: should handle surrogates */
     p[0] = c;
     p[1] = c >> 8;
     return p + 2;
@@ -423,9 +450,10 @@
     lp = p = (const uint16_t *)buf;
     p1 = p + (size >> 1);
     u.n = 0;
-    u.c[s->charset == &charset_ucs2be] = s->charset->eol_char;
+    u.c[s->charset == &charset_ucs2be] = s->eol_char;
     nl = u.n;
 
+    /* XXX: should handle surrogates */
     while (p < p1) {
         if (*p++ == nl) {
             lp = p;
@@ -437,8 +465,8 @@
     *col_ptr = col;
 }
 
-static int charset_goto_line_ucs2(QECharset *charset, const u8 *buf, int size,
-                                  int nlines)
+static int charset_goto_line_ucs2(CharsetDecodeState *s,
+                                  const u8 *buf, int size, int nlines)
 {
     const uint16_t *p, *p1, *lp;
     uint16_t nl;
@@ -447,7 +475,7 @@
     lp = p = (const uint16_t *)buf;
     p1 = p + (size >> 1);
     u.n = 0;
-    u.c[charset == &charset_ucs2be] = charset->eol_char;
+    u.c[s->charset == &charset_ucs2be] = s->eol_char;
     nl = u.n;
 
     while (nlines > 0 && p < p1) {
@@ -464,6 +492,7 @@
 
 static int decode_ucs2be(CharsetDecodeState *s)
 {
+    /* XXX: should handle surrogates */
     const u8 *p;
 
     p = s->p;
@@ -473,19 +502,70 @@
 
 static u8 *encode_ucs2be(__unused__ QECharset *charset, u8 *p, int c)
 {
+    /* XXX: should handle surrogates */
     p[0] = c >> 8;
     p[1] = c;
     return p + 2;
 }
 
-static int charset_get_chars_ucs2(__unused__ QECharset *charset, const u8 
*buf, int size)
+static int charset_get_chars_ucs2(CharsetDecodeState *s,
+                                  const u8 *buf, int size)
 {
+    /* XXX: should handle surrogates */
+    int nb_skip;
+    const uint16_t *buf_end, *buf_ptr;
+    uint16_t nl;
+    union { uint16_t n; char c[2]; } u;
+
+    if (s->eol_type != EOL_DOS)
     return size >> 1;
+
+    nb_skip = 0;
+    buf_ptr = (const uint16_t *)buf;
+    buf_end = buf_ptr + (size >> 1);
+    u.n = 0;
+    u.c[s->charset == &charset_ucs2be] = '\n';
+    nl = u.n;
+
+    while (buf_ptr < buf_end) {
+        if (*buf_ptr++ == nl) {
+            /* ignore \n in EOL_DOS scan, but count \r. (see above) */
+            nb_skip++;
+        }
+    }
+    return (size >> 1) - nb_skip;
 }
 
-static int charset_goto_char_ucs2(__unused__ QECharset *charset, const u8 
*buf, int size, int pos)
+static int charset_goto_char_ucs2(CharsetDecodeState *s,
+                                  const u8 *buf, int size, int pos)
 {
+    /* XXX: should handle surrogates */
+    int nb_chars;
+    const uint16_t *buf_ptr, *buf_end;
+    uint16_t nl;
+    union { uint16_t n; char c[2]; } u;
+
+    if (s->eol_type != EOL_DOS)
     return min(pos << 1, size);
+
+    nb_chars = 0;
+    buf_ptr = (const uint16_t *)buf;
+    buf_end = buf_ptr + (size >> 1);
+    u.n = 0;
+    u.c[s->charset == &charset_ucs2be] = '\n';
+    nl = u.n;
+
+    while (buf_ptr < buf_end) {
+        if (*buf_ptr == nl) {
+            /* ignore \n in EOL_DOS scan, but count \r. (see above) */
+            continue;
+        }
+        if (nb_chars >= pos)
+            break;
+        nb_chars++;
+        buf_ptr++;
+    }
+    return (const u8*)buf_ptr - buf;
 }
 
 QECharset charset_ucs2le = {
@@ -545,7 +625,7 @@
     lp = p = (const uint32_t *)buf;
     p1 = p + (size >> 2);
     u.n = 0;
-    u.c[(s->charset == &charset_ucs4be) * 3] = s->charset->eol_char;
+    u.c[(s->charset == &charset_ucs4be) * 3] = s->eol_char;
     nl = u.n;
 
     while (p < p1) {
@@ -559,8 +639,8 @@
     *col_ptr = col;
 }
 
-static int charset_goto_line_ucs4(QECharset *charset, const u8 *buf, int size,
-                                  int nlines)
+static int charset_goto_line_ucs4(CharsetDecodeState *s,
+                                  const u8 *buf, int size, int nlines)
 {
     const uint32_t *p, *p1, *lp;
     uint32_t nl;
@@ -569,7 +649,7 @@
     lp = p = (const uint32_t *)buf;
     p1 = p + (size >> 2);
     u.n = 0;
-    u.c[(charset == &charset_ucs4be) * 3] = charset->eol_char;
+    u.c[(s->charset == &charset_ucs4be) * 3] = s->eol_char;
     nl = u.n;
 
     while (nlines > 0 && p < p1) {
@@ -602,14 +682,62 @@
     return p + 4;
 }
 
-static int charset_get_chars_ucs4(__unused__ QECharset *charset, const u8 
*buf, int size)
+static int charset_get_chars_ucs4(CharsetDecodeState *s,
+                                  const u8 *buf, int size)
 {
+    int nb_skip;
+    const uint32_t *buf_end, *buf_ptr;
+    uint32_t nl;
+    union { uint32_t n; char c[4]; } u;
+
+    if (s->eol_type != EOL_DOS)
     return size >> 2;
+
+    nb_skip = 0;
+    buf_ptr = (const uint32_t *)buf;
+    buf_end = buf_ptr + (size >> 2);
+    u.n = 0;
+    u.c[(s->charset == &charset_ucs4be) * 3] = '\n';
+    nl = u.n;
+
+    while (buf_ptr < buf_end) {
+        if (*buf_ptr++ == nl) {
+            /* ignore \n in EOL_DOS scan, but count \r. (see above) */
+            nb_skip++;
+        }
+    }
+    return (size >> 2) - nb_skip;
 }
 
-static int charset_goto_char_ucs4(__unused__ QECharset *charset, const u8 
*buf, int size, int pos)
+static int charset_goto_char_ucs4(CharsetDecodeState *s,
+                                  const u8 *buf, int size, int pos)
 {
+    int nb_chars;
+    const uint32_t *buf_ptr, *buf_end;
+    uint32_t nl;
+    union { uint32_t n; char c[4]; } u;
+
+    if (s->eol_type != EOL_DOS)
     return min(pos << 2, size);
+
+    nb_chars = 0;
+    buf_ptr = (const uint32_t *)buf;
+    buf_end = buf_ptr + (size >> 2);
+    u.n = 0;
+    u.c[(s->charset == &charset_ucs4be) * 3] = '\n';
+    nl = u.n;
+
+    while (buf_ptr < buf_end) {
+        if (*buf_ptr == nl) {
+            /* ignore \n in EOL_DOS scan, but count \r. (see above) */
+            continue;
+        }
+        if (nb_chars >= pos)
+            break;
+        nb_chars++;
+        buf_ptr++;
+    }
+    return (const u8*)buf_ptr - buf;
 }
 
 QECharset charset_ucs4le = {
@@ -699,7 +827,8 @@
     return NULL;
 }
 
-void charset_decode_init(CharsetDecodeState *s, QECharset *charset)
+void charset_decode_init(CharsetDecodeState *s, QECharset *charset,
+                         EOLType eol_type)
 {
     s->table = NULL; /* fail safe */
     if (charset->table_alloc) {
@@ -710,6 +839,10 @@
     }
     s->charset = charset;
     s->char_size = charset->char_size;
+    s->eol_type = eol_type;
+    s->eol_char = charset->eol_char;
+    if (s->eol_char == '\n' && (s->eol_type == EOL_MAC || s->eol_type == 
EOL_DOS))
+        s->eol_char = '\r';
     s->decode_func = charset->decode_func;
     s->get_pos_func = charset->get_pos_func;
     if (charset->decode_init)
@@ -725,10 +858,48 @@
 }
 
 /* detect the charset. Actually only UTF8 is detected */
-QECharset *detect_charset(const u8 *buf, int size)
+QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
 {
     int i, l, c, has_utf8;
 
+    if (eol_typep) {
+        /* XXX: delay test after charset match */
+        /* XXX: only works for 8 bit charsets */
+        int eol_bits = 0;
+        for (i = 0; i < size - 1; i++) {
+            c = buf[i++];
+            if (c == '\r') {
+                if (buf[i] == '\n') {
+                    eol_bits |= 1 << EOL_DOS;
+                    i++;
+                } else {
+                    eol_bits |= 1 << EOL_MAC;
+                }
+            } else
+            if (buf[i] == '\n') {
+                eol_bits |= 1 << EOL_UNIX;
+            }
+        }
+        switch (eol_bits) {
+        case 0:
+            /* no change, keep default value */
+            break;
+        case 1 << EOL_UNIX:
+            *eol_typep = EOL_UNIX;
+            break;
+        case 1 << EOL_DOS:
+            *eol_typep = EOL_DOS;
+            break;
+        case 1 << EOL_MAC:
+            *eol_typep = EOL_MAC;
+            break;
+        default:
+            /* A mixture of different styles, binary / unix */
+            *eol_typep = EOL_UNIX;
+            break;
+        }
+    }
+
     has_utf8 = 0;
     for (i = 0; i < size;) {
         c = buf[i++];
@@ -855,7 +1026,7 @@
     line = 0;
     lp = p = buf;
     p1 = p + size;
-    nl = s->charset->eol_char;
+    nl = s->eol_char;
 
     for (;;) {
         p = memchr(p, nl, p1 - p);
@@ -870,14 +1041,15 @@
     *col_ptr = col;
 }
 
-int charset_goto_line_8bit(QECharset *charset, const u8 *buf, int size, int 
nlines)
+int charset_goto_line_8bit(CharsetDecodeState *s,
+                           const u8 *buf, int size, int nlines)
 {
     const u8 *p, *p1, *lp;
     int nl;
 
     lp = p = buf;
     p1 = p + size;
-    nl = charset->eol_char;
+    nl = s->eol_char;
 
     while (nlines > 0) {
         p = memchr(p, nl, p1 - p);
@@ -890,14 +1062,50 @@
     return lp - buf;
 }
 
-int charset_get_chars_8bit(QECharset *charset, const u8 *buf, int size)
+int charset_get_chars_8bit(CharsetDecodeState *s,
+                           const u8 *buf, int size)
 {
+    int nb_skip;
+    const u8 *buf_end, *buf_ptr;
+
+    if (s->eol_type != EOL_DOS)
     return size;
+
+    nb_skip = 0;
+    buf_ptr = buf;
+    buf_end = buf_ptr + size;
+    while (buf_ptr < buf_end) {
+        if (*buf_ptr++ == '\n') {
+            /* ignore \n in EOL_DOS scan, but count \r. (see above) */
+            nb_skip++;
+        }
+    }
+    return size - nb_skip;
 }
 
-int charset_goto_char_8bit(QECharset *charset, const u8 *buf, int size, int 
pos)
+int charset_goto_char_8bit(CharsetDecodeState *s,
+                           const u8 *buf, int size, int pos)
 {
+    int nb_chars;
+    const u8 *buf_ptr, *buf_end;
+
+    if (s->eol_type != EOL_DOS)
     return min(pos, size);
+
+    nb_chars = 0;
+    buf_ptr = buf;
+    buf_end = buf_ptr + size;
+    while (buf_ptr < buf_end) {
+        if (*buf_ptr == '\n') {
+            /* ignore \n in EOL_DOS scan, but count \r. */
+            continue;
+        }
+        if (nb_chars >= pos)
+            break;
+        nb_chars++;
+        buf_ptr++;
+    }
+    return buf_ptr - buf;
 }
 
 /********************************************************/

Index: dired.c
===================================================================
RCS file: /sources/qemacs/qemacs/dired.c,v
retrieving revision 1.35
retrieving revision 1.36
diff -u -b -r1.35 -r1.36
--- dired.c     23 Jan 2014 12:56:22 -0000      1.35
+++ dired.c     5 Feb 2014 00:56:49 -0000       1.36
@@ -493,7 +493,7 @@
     list_mode.mode_init(s, saved_data);
 
     /* XXX: File system charset should be detected automatically */
-    eb_set_charset(s->b, &charset_utf8);
+    eb_set_charset(s->b, &charset_utf8, s->b->eol_type);
 
     hs = s->mode_data;
     hs->sort_mode = DIRED_SORT_GROUP | DIRED_SORT_NAME;

Index: extras.c
===================================================================
RCS file: /sources/qemacs/qemacs/extras.c,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -b -r1.18 -r1.19
--- extras.c    31 Jan 2014 14:50:13 -0000      1.18
+++ extras.c    5 Feb 2014 00:56:49 -0000       1.19
@@ -405,7 +405,7 @@
     } else {
         EditBuffer *b1 = eb_new("*tmp*", BF_SYSTEM | (b->flags & BF_STYLES));
 
-        eb_set_charset(b1, b->charset);
+        eb_set_charset(b1, b->charset, b->eol_type);
         /* Use eb_insert_buffer_convert to copy styles.
          * This conversion should not change sizes */
         eb_insert_buffer_convert(b1, 0, b, offset2, size2);
@@ -712,6 +712,11 @@
     s->b->flags &= ~BF_STYLES;
 }
 
+static void do_set_eol_type(EditState *s, int eol_type)
+{
+    eb_set_charset(s->b, s->b->charset, eol_type);
+}
+
 static CmdDef extra_commands[] = {
     CMD2( KEY_META('='), KEY_NONE,
           "compare-windows", do_compare_windows, ESi, "ui" )
@@ -770,6 +775,10 @@
     CMD0( KEY_NONE, KEY_NONE,
           "drop-styles", do_drop_styles)
 
+    CMD2( KEY_NONE, KEY_NONE,
+          "set-eol-type", do_set_eol_type, ESi,
+         "ui{EOL Type [0=Unix, 1=Dos, 2=Mac]: }")
+
     CMD_DEF_END,
 };
 

Index: orgmode.c
===================================================================
RCS file: /sources/qemacs/qemacs/orgmode.c,v
retrieving revision 1.11
retrieving revision 1.12
diff -u -b -r1.11 -r1.12
--- orgmode.c   29 Jan 2014 23:24:00 -0000      1.11
+++ orgmode.c   5 Feb 2014 00:56:49 -0000       1.12
@@ -617,7 +617,7 @@
         offset2 = org_next_heading(s, offset1, level, &level2);
     }
     b1 = eb_new("*tmp*", BF_SYSTEM | (s->b->flags & BF_STYLES));
-    eb_set_charset(b1, s->b->charset);
+    eb_set_charset(b1, s->b->charset, s->b->eol_type);
     eb_insert_buffer_convert(b1, 0, s->b, offset, size);
     eb_delete(s->b, offset, size);
     if (offset2 > offset)

Index: qe.c
===================================================================
RCS file: /sources/qemacs/qemacs/qe.c,v
retrieving revision 1.143
retrieving revision 1.144
diff -u -b -r1.143 -r1.144
--- qe.c        4 Feb 2014 22:47:31 -0000       1.143
+++ qe.c        5 Feb 2014 00:56:49 -0000       1.144
@@ -1485,7 +1485,7 @@
     }
     snprintf(bufname, sizeof(bufname), "*kill-%d*", qs->yank_current + 1);
     b = eb_new(bufname, base->flags & BF_STYLES);
-    eb_set_charset(b, base->charset);
+    eb_set_charset(b, base->charset, base->eol_type);
     qs->yank_buffers[qs->yank_current] = b;
     return b;
 }
@@ -1823,26 +1823,49 @@
 }
 #endif
 
-QECharset *read_charset(EditState *s, const char *charset_str)
+QECharset *read_charset(EditState *s, const char *charset_str,
+                        EOLType *eol_typep)
 {
+    char buf[64];
+    const char *p;
     QECharset *charset;
+    EOLType eol_type = *eol_typep;
+
+    p = NULL;
+
+    if (strend(charset_str, "-mac", &p))
+        eol_type = EOL_MAC;
+    else
+    if (strend(charset_str, "-dos", &p))
+        eol_type = EOL_DOS;
+    else
+    if (strend(charset_str, "-unix", &p))
+        eol_type = EOL_UNIX;
+
+    if (p) {
+        pstrncpy(buf, sizeof(buf), charset_str, p - charset_str);
+        charset_str = buf;
+    }
 
     charset = find_charset(charset_str);
     if (!charset) {
         put_status(s, "Unknown charset '%s'", charset_str);
         return NULL;
     }
+    *eol_typep = eol_type;
     return charset;
 }
 
 void do_set_buffer_file_coding_system(EditState *s, const char *charset_str)
 {
     QECharset *charset;
+    EOLType eol_type;
 
-    charset = read_charset(s, charset_str);
+    eol_type = s->b->eol_type;
+    charset = read_charset(s, charset_str, &eol_type);
     if (!charset)
         return;
-    eb_set_charset(s->b, charset);
+    eb_set_charset(s->b, charset, eol_type);
     put_status(s, "Charset is now %s for this buffer", s->b->charset->name);
 }
 
@@ -1851,20 +1874,22 @@
                                           const char *charset_str)
 {
     QECharset *charset;
+    EOLType eol_type;
     EditBuffer *b1, *b;
     int offset, c, len, i;
     EditBufferCallbackList *cb;
     int pos[32];
     char buf[MAX_CHAR_BYTES];
 
-    charset = read_charset(s, charset_str);
+    eol_type = s->b->eol_type;
+    charset = read_charset(s, charset_str, &eol_type);
     if (!charset)
         return;
 
     b = s->b;
 
     b1 = eb_new("*tmp*", b->flags & BF_STYLES);
-    eb_set_charset(b1, charset);
+    eb_set_charset(b1, charset, eol_type);
 
     /* preserve positions */
     cb = b->first_callback;
@@ -1887,7 +1912,7 @@
     /* quick hack to transfer styles from tmp buffer to b */
     eb_free(&b->b_styles);
     eb_delete(b, 0, b->total_size);
-    eb_set_charset(b, charset);
+    eb_set_charset(b, charset, eol_type);
     eb_insert_buffer(b, 0, b1, 0, b1->total_size);
     b->b_styles = b1->b_styles;
     b1->b_styles = NULL;
@@ -2145,6 +2170,10 @@
     eb_get_pos(s->b, &line_num, &col_num, s->offset);
     buf_printf(out, "L%d--C%d--%s",
                line_num + 1, col_num, s->b->charset->name);
+    if (s->b->eol_type == EOL_DOS)
+        buf_printf(out, "-dos");
+    if (s->b->eol_type == EOL_MAC)
+        buf_printf(out, "-mac");
     if (s->bidir)
         buf_printf(out, "--%s", s->cur_rtl ? "RTL" : "LTR");
 
@@ -5646,7 +5675,7 @@
     /* First we try to read the first block to determine the data type */
     if (stat(filename, &st) < 0) {
         /* XXX: default charset should be selectable.  Use utf8 for now */
-        eb_set_charset(b, &charset_utf8);
+        eb_set_charset(b, &charset_utf8, b->eol_type);
         /* CG: should check for wildcards and do dired */
         //if (strchr(filename, '*') || strchr(filename, '?'))
         //    goto dired;
@@ -5686,8 +5715,13 @@
     bdt = selected_mode->data_type;
 
     /* autodetect buffer charset (could move it to raw buffer loader) */
-    if (bdt == &raw_data_type)
-        eb_set_charset(b, detect_charset(buf, buf_size));
+    if (bdt == &raw_data_type) {
+        QECharset *charset;
+        EOLType eol_type;
+
+        charset = detect_charset(buf, buf_size, &eol_type);
+        eb_set_charset(b, charset, eol_type);
+    }
 
     /* now we can set the mode */
     edit_set_mode_full(s, selected_mode, NULL, f);

Index: qe.h
===================================================================
RCS file: /sources/qemacs/qemacs/qe.h,v
retrieving revision 1.135
retrieving revision 1.136
diff -u -b -r1.135 -r1.136
--- qe.h        4 Feb 2014 22:47:31 -0000       1.135
+++ qe.h        5 Feb 2014 00:56:49 -0000       1.136
@@ -491,9 +491,9 @@
     u8 *(*encode_func)(QECharset *charset, u8 *buf, int size);
     void (*get_pos_func)(CharsetDecodeState *s, const u8 *buf, int size,
                          int *line_ptr, int *col_ptr);
-    int (*get_chars_func)(QECharset *charset, const u8 *buf, int size);
-    int (*goto_char_func)(QECharset *charset, const u8 *buf, int size, int 
pos);
-    int (*goto_line_func)(QECharset *charset, const u8 *buf, int size, int 
lines);
+    int (*get_chars_func)(CharsetDecodeState *s, const u8 *buf, int size);
+    int (*goto_char_func)(CharsetDecodeState *s, const u8 *buf, int size, int 
pos);
+    int (*goto_line_func)(CharsetDecodeState *s, const u8 *buf, int size, int 
lines);
     unsigned int char_size : 3;
     unsigned int variable_size : 1;
     unsigned int table_alloc : 1; /* true if CharsetDecodeState.table must be 
malloced */
@@ -510,10 +510,18 @@
 extern QECharset charset_ucs2le, charset_ucs2be;
 extern QECharset charset_ucs4le, charset_ucs4be;
 
+typedef enum EOLType {
+    EOL_UNIX = 0,
+    EOL_DOS,
+    EOL_MAC,
+} EOLType;
+
 struct CharsetDecodeState {
     /* 256 ushort table for hyper fast decoding */
     unsigned short *table;
     int char_size;
+    EOLType eol_type;
+    int eol_char;
     const u8 *p;
     /* slower decode function for complicated cases */
     int (*decode_func)(CharsetDecodeState *s);
@@ -538,15 +546,16 @@
 
 void charset_completion(CompleteState *cp);
 QECharset *find_charset(const char *str);
-void charset_decode_init(CharsetDecodeState *s, QECharset *charset);
+void charset_decode_init(CharsetDecodeState *s, QECharset *charset,
+                         EOLType eol_type);
 void charset_decode_close(CharsetDecodeState *s);
 void charset_get_pos_8bit(CharsetDecodeState *s, const u8 *buf, int size,
                           int *line_ptr, int *col_ptr);
-int charset_get_chars_8bit(QECharset *charset, const u8 *buf, int size);
-int charset_goto_char_8bit(QECharset *charset, const u8 *buf, int size, int pos);
-int charset_goto_line_8bit(QECharset *charset, const u8 *buf, int size, int nlines);
+int charset_get_chars_8bit(CharsetDecodeState *s, const u8 *buf, int size);
+int charset_goto_char_8bit(CharsetDecodeState *s, const u8 *buf, int size, int pos);
+int charset_goto_line_8bit(CharsetDecodeState *s, const u8 *buf, int size, int nlines);
 
-QECharset *detect_charset(const u8 *buf, int size);
+QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep);
 
 void decode_8bit_init(CharsetDecodeState *s);
 int decode_8bit(CharsetDecodeState *s);
@@ -815,6 +824,7 @@
 
     int tab_width;
     int fill_column;
+    EOLType eol_type;
 
     EditBuffer *next; /* next editbuffer in qe_state buffer list */
 
@@ -867,7 +877,7 @@
 EditBuffer *eb_find_file(const char *filename);
 EditState *eb_find_window(EditBuffer *b, EditState *e);
 
-void eb_set_charset(EditBuffer *b, QECharset *charset);
+void eb_set_charset(EditBuffer *b, QECharset *charset, EOLType eol_type);
 __attr_nonnull((3))
 int eb_nextc(EditBuffer *b, int offset, int *next_ptr);
 __attr_nonnull((3))
@@ -1314,6 +1324,7 @@
     int max_load_size;  /* maximum file size for loading in memory */
     int default_tab_width;      /* 8 */
     int default_fill_column;    /* 70 */
+    EOLType default_eol_type;  /* EOL_UNIX */
 };
 
 extern QEmacsState qe_state;
@@ -1744,7 +1755,8 @@
 void do_yank(EditState *s);
 void do_yank_pop(EditState *s);
 void do_exchange_point_and_mark(EditState *s);
-QECharset *read_charset(EditState *s, const char *charset_str);
+QECharset *read_charset(EditState *s, const char *charset_str,
+                        EOLType *eol_typep);
 void do_set_buffer_file_coding_system(EditState *s, const char *charset_str);
 void do_convert_buffer_file_coding_system(EditState *s,
     const char *charset_str);

Index: shell.c
===================================================================
RCS file: /sources/qemacs/qemacs/shell.c,v
retrieving revision 1.83
retrieving revision 1.84
diff -u -b -r1.83 -r1.84
--- shell.c     4 Feb 2014 22:47:31 -0000       1.83
+++ shell.c     5 Feb 2014 00:56:49 -0000       1.84
@@ -1362,9 +1362,9 @@
     /* Select shell output buffer encoding from LANG setting */
     if (((lang = getenv("LANG")) != NULL && strstr(lang, "UTF-8")) ||
           qs->screen->charset == &charset_utf8) {
-        eb_set_charset(b, &charset_utf8);
+        eb_set_charset(b, &charset_utf8, b->eol_type);
     } else {
-        eb_set_charset(b, &charset_vt100);
+        eb_set_charset(b, &charset_vt100, b->eol_type);
     }
 
     s = qe_mallocz(ShellState);

Index: unihex.c
===================================================================
RCS file: /sources/qemacs/qemacs/unihex.c,v
retrieving revision 1.20
retrieving revision 1.21
diff -u -b -r1.20 -r1.21
--- unihex.c    16 Jan 2014 14:00:28 -0000      1.20
+++ unihex.c    5 Feb 2014 00:56:50 -0000       1.21
@@ -27,6 +27,9 @@
 
     text_mode_init(s, saved_data);
 
+    /* unihex mode is incompatible with EOL_DOS eol type */
+    eb_set_charset(s->b, s->b->charset, EOL_UNIX);
+
     /* Compute max width of character in hex dump (limit to first 64K) */
     maxc = 0xFF;
     max_offset = min(65536, s->b->total_size);

Index: libqhtml/xmlparse.c
===================================================================
RCS file: /sources/qemacs/qemacs/libqhtml/xmlparse.c,v
retrieving revision 1.25
retrieving revision 1.26
diff -u -b -r1.25 -r1.26
--- libqhtml/xmlparse.c 23 Jan 2014 12:56:24 -0000      1.25
+++ libqhtml/xmlparse.c 5 Feb 2014 00:56:50 -0000       1.26
@@ -301,7 +301,7 @@
     pstrcpy(s->filename, sizeof(s->filename), filename);
     s->charset = charset;
     if (charset) {
-        charset_decode_init(&s->charset_state, charset);
+        charset_decode_init(&s->charset_state, charset, EOL_UNIX);
     }
     return s;
 }



reply via email to

[Prev in Thread] Current Thread [Next in Thread]