emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/src/coding.c,v


From: Kenichi Handa
Subject: [Emacs-diffs] Changes to emacs/src/coding.c,v
Date: Thu, 03 Apr 2008 12:30:06 +0000

CVSROOT:        /cvsroot/emacs
Module name:    emacs
Changes by:     Kenichi Handa <handa>   08/04/03 12:30:03

Index: coding.c
===================================================================
RCS file: /cvsroot/emacs/emacs/src/coding.c,v
retrieving revision 1.377
retrieving revision 1.378
diff -u -b -r1.377 -r1.378
--- coding.c    27 Mar 2008 20:26:59 -0000      1.377
+++ coding.c    3 Apr 2008 12:30:02 -0000       1.378
@@ -625,6 +625,7 @@
    | CATEGORY_MASK_ISO_7_ELSE          \
    | CATEGORY_MASK_ISO_8_ELSE          \
    | CATEGORY_MASK_UTF_8               \
+   | CATEGORY_MASK_UTF_16_AUTO         \
    | CATEGORY_MASK_UTF_16_BE           \
    | CATEGORY_MASK_UTF_16_LE           \
    | CATEGORY_MASK_UTF_16_BE_NOSIG     \
@@ -657,7 +658,8 @@
      | CATEGORY_MASK_ISO_ELSE)
 
 #define CATEGORY_MASK_UTF_16           \
-  (CATEGORY_MASK_UTF_16_BE             \
+  (CATEGORY_MASK_UTF_16_AUTO           \
+   | CATEGORY_MASK_UTF_16_BE           \
    | CATEGORY_MASK_UTF_16_LE           \
    | CATEGORY_MASK_UTF_16_BE_NOSIG     \
    | CATEGORY_MASK_UTF_16_LE_NOSIG)
@@ -1513,11 +1515,44 @@
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
     }
-  else if (c1 >= 0 && c2 >= 0)
+  else
     {
+      /* We check the dispersion of bytes at even (E) and odd (O) offsets.
+        If both are high, we assume binary data.  */
+      unsigned char e[256], o[256];
+      unsigned e_num = 1, o_num = 1;
+
+      memset (e, 0, 256);
+      memset (o, 0, 256);
+      e[c1] = 1;
+      o[c2] = 1;
+
       detect_info->rejected
        |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
+
+      while (1)
+       {
+         ONE_MORE_BYTE (c1);
+         ONE_MORE_BYTE (c2);
+         if (! e[c1])
+           {
+             e[c1] = 1;
+             e_num++;
+             if (e_num >= 128)
+               break;
+           }
+         if (! o[c2])
+           {
+             o[c1] = 1;
+             o_num++;
+             if (o_num >= 128)
+               break;
     }
+       }
+      detect_info->rejected |= CATEGORY_MASK_UTF_16;
+      return 0;
+    }
+
  no_more_source:
   return 1;
 }
@@ -5677,32 +5712,53 @@
     {
       int c, i;
       struct coding_detection_info detect_info;
+      int null_byte_found = 0, eight_bit_found = 0;
 
       detect_info.checked = detect_info.found = detect_info.rejected = 0;
-      for (i = 0, src = coding->source; src < src_end; i++, src++)
+      coding->head_ascii = -1;
+      for (src = coding->source; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
+           {
+             eight_bit_found = 1;
+             if (coding->head_ascii < 0)
+               coding->head_ascii = src - coding->source;
+             if (null_byte_found)
            break;
-         if (c < 0x20
-             && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+           }
+         else if (c < 0x20)
+           {
+             if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
              && ! inhibit_iso_escape_detection
              && ! detect_info.checked)
            {
-             coding->head_ascii = src - (coding->source + coding->consumed);
+                 if (coding->head_ascii < 0)
+                   coding->head_ascii = src - coding->source;
              if (detect_coding_iso_2022 (coding, &detect_info))
                {
                  /* We have scanned the whole data.  */
                  if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
-                   /* We didn't find an 8-bit code.  */
+                       /* We didn't find an 8-bit code.  We may have
+                          found a null-byte, but it's very rare that
+                          a binary file conforms to ISO-2022.  */
                    src = src_end;
                  break;
                }
            }
+             else if (! c)
+               {
+                 null_byte_found = 1;
+                 if (eight_bit_found)
+                   break;
+               }
        }
-      coding->head_ascii = src - (coding->source + coding->consumed);
+       }
+      if (coding->head_ascii < 0)
+       coding->head_ascii = src - coding->source;
 
-      if (coding->head_ascii < coding->src_bytes
+      if (null_byte_found || eight_bit_found
+         || coding->head_ascii < coding->src_bytes
          || detect_info.found)
        {
          enum coding_category category;
@@ -5718,6 +5774,12 @@
                  break;
              }
          else
+           {
+             if (null_byte_found)
+               {
+                 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
+                 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
+               }
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
@@ -5750,7 +5812,10 @@
          
          if (i < coding_category_raw_text)
            setup_coding_system (CODING_ID_NAME (this->id), coding);
-         else if (detect_info.rejected == CATEGORY_MASK_ANY)
+             else if (null_byte_found)
+               setup_coding_system (Qno_conversion, coding);
+             else if ((detect_info.rejected & CATEGORY_MASK_ANY)
+                      == CATEGORY_MASK_ANY)
            setup_coding_system (Qraw_text, coding);
          else if (detect_info.rejected)
            for (i = 0; i < coding_category_raw_text; i++)
@@ -5762,6 +5827,7 @@
                }
        }
     }
+    }
   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
           == coding_category_utf_16_auto)
     {
@@ -7472,6 +7538,7 @@
   int id;
   struct coding_detection_info detect_info;
   enum coding_category base_category;
+  int null_byte_found = 0, eight_bit_found = 0;
 
   if (NILP (coding_system))
     coding_system = Qundecided;
@@ -7497,33 +7564,54 @@
       struct coding_system *this;
       int c, i;
 
+      coding.head_ascii = -1;
       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
-      for (i = 0; src < src_end; i++, src++)
+      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
+           {
+             eight_bit_found = 1;
+             if (coding.head_ascii < 0)
+               coding.head_ascii = src - coding.source;
+             if (null_byte_found)
            break;
-         if (c < 0x20
-             && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
-             && ! inhibit_iso_escape_detection)
+           }
+         if (c < 0x20)
+           {
+             if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
+                 && ! inhibit_iso_escape_detection
+                 && ! detect_info.checked)
            {
+                 if (coding.head_ascii < 0)
              coding.head_ascii = src - coding.source;
              if (detect_coding_iso_2022 (&coding, &detect_info))
                {
                  /* We have scanned the whole data.  */
                  if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
-                   /* We didn't find an 8-bit code.  */
+                       /* We didn't find an 8-bit code.  We may have
+                          found a null-byte, but it's very rare that
+                          a binary file conforms to ISO-2022.  */
                    src = src_end;
                  break;
                }
            }
+             else if (! c)
+               {
+                 null_byte_found = 1;
+                 if (eight_bit_found)
+                   break;
+               }
+           }
        }
+      if (coding.head_ascii < 0)
       coding.head_ascii = src - coding.source;
 
-      if (src < src_end
+      if (null_byte_found || eight_bit_found
+         || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
-         if (src == src_end)
+         if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
@@ -7533,6 +7621,12 @@
                  break;
              }
          else
+           {
+             if (null_byte_found)
+               {
+                 detect_info.checked |= ~CATEGORY_MASK_UTF_16;
+                 detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
+               }
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
@@ -7551,9 +7645,7 @@
                        && (detect_info.found & (1 << category)))
                      break;
                  }
-               else
-                 {
-                   if ((*(this->detector)) (&coding, &detect_info)
+                 else if ((*(this->detector)) (&coding, &detect_info)
                        && highest
                        && (detect_info.found & (1 << category)))
                      {
@@ -7570,7 +7662,7 @@
              }
        }
 
-      if (detect_info.rejected == CATEGORY_MASK_ANY)
+      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY)
        {
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = coding_categories[coding_category_raw_text].id;
@@ -7659,8 +7751,13 @@
     if (VECTORP (eol_type))
       {
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
+         {
+           if (null_byte_found)
+             normal_eol = EOL_SEEN_LF;
+           else
          normal_eol = detect_eol (coding.source, src_bytes,
                                   coding_category_raw_text);
+         }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,




reply via email to

[Prev in Thread] Current Thread [Next in Thread]