emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/src/coding.c,v


From: Kenichi Handa
Subject: [Emacs-diffs] Changes to emacs/src/coding.c,v
Date: Wed, 09 Jan 2008 06:05:24 +0000

CVSROOT:        /cvsroot/emacs
Module name:    emacs
Changes by:     Kenichi Handa <handa>   08/01/09 06:05:24

Index: coding.c
===================================================================
RCS file: /cvsroot/emacs/emacs/src/coding.c,v
retrieving revision 1.360
retrieving revision 1.361
diff -u -b -r1.360 -r1.361
--- coding.c    8 Jan 2008 20:44:33 -0000       1.360
+++ coding.c    9 Jan 2008 06:05:23 -0000       1.361
@@ -1406,12 +1406,17 @@
        CODING_CATEGORY_MASK_ISO_7_ELSE
        CODING_CATEGORY_MASK_ISO_8_ELSE
    are set.  If a code which should never appear in ISO2022 is found,
-   returns 0.  */
+   returns 0.
+
+   If *latin_extra_code_state is zero and Latin extra codes are found,
+   set *latin_extra_code_state to 1 and return 0.  If it is nonzero,
+   accept Latin extra codes.  */
 
 static int
-detect_coding_iso2022 (src, src_end, multibytep)
+detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state)
      unsigned char *src, *src_end;
      int multibytep;
+     int *latin_extra_code_state;
 {
   int mask = CODING_CATEGORY_MASK_ISO;
   int mask_found = 0;
@@ -1574,6 +1579,11 @@
            if (VECTORP (Vlatin_extra_code_table)
                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
              {
+               if (! *latin_extra_code_state)
+                 {
+                   *latin_extra_code_state = 1;
+                   return 0;
+                 }
                if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
                    & CODING_FLAG_ISO_LATIN_EXTRA)
                  newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -1600,6 +1610,11 @@
                {
                  int newmask = 0;
 
+                 if (! *latin_extra_code_state)
+                   {
+                     *latin_extra_code_state = 1;
+                     return 0;
+                   }
                  if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
                      & CODING_FLAG_ISO_LATIN_EXTRA)
                    newmask |= CODING_CATEGORY_MASK_ISO_8_1;
@@ -4127,6 +4142,8 @@
   unsigned char *src = source, *src_end = source + src_bytes;
   unsigned int mask, utf16_examined_p, iso2022_examined_p;
   int i;
+  int null_byte_found;
+  int latin_extra_code_state = 1;
 
   /* At first, skip all ASCII characters and control characters except
      for three ISO2022 specific control characters.  */
@@ -4135,21 +4152,32 @@
   ascii_skip_code[ISO_CODE_ESC] = 0;
 
  label_loop_detect_coding:
-  while (src < src_end && ascii_skip_code[*src]) src++;
+  null_byte_found = 0;
+  while (src < src_end && ascii_skip_code[*src])
+    null_byte_found |= (! *src++);
+  if (! null_byte_found)
+    {
+      unsigned char *p = src + 1;
+      while (p < src_end)
+       null_byte_found |= (! *p++);
+    }
   *skip = src - source;
 
   if (src >= src_end)
-    /* We found nothing other than ASCII.  There's nothing to do.  */
+    /* We found nothing other than ASCII (and NUL bytes).  There's
+       nothing to do.  */
     return 0;
 
   c = *src;
   /* The text seems to be encoded in some multilingual coding system.
      Now, try to find in which coding system the text is encoded.  */
-  if (c < 0x80)
+  if (! null_byte_found && c < 0x80)
     {
       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
       /* C is an ISO2022 specific control code of C0.  */
-      mask = detect_coding_iso2022 (src, src_end, multibytep);
+      latin_extra_code_state = 1;
+      mask = detect_coding_iso2022 (src, src_end, multibytep,
+                                   &latin_extra_code_state);
       if (mask == 0)
        {
          /* No valid ISO2022 code follows C.  Try again.  */
@@ -4177,7 +4205,12 @@
       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
        c = src[1] - 0x20;
 
-      if (c < 0xA0)
+      if (null_byte_found)
+       {
+         try = (CODING_CATEGORY_MASK_UTF_16_BE
+                | CODING_CATEGORY_MASK_UTF_16_LE);
+       }
+      else if (c < 0xA0)
        {
          /* C is the first byte of SJIS character code,
             or a leading-code of Emacs' internal format (emacs-mule),
@@ -4191,7 +4224,8 @@
             or is an ISO2022 specific control code of C1 (SS2 or SS3),
             or is an ISO2022 control-sequence-introducer (CSI),
             we should also consider the possibility of ISO2022 codings.  */
-         if ((VECTORP (Vlatin_extra_code_table)
+         if ((latin_extra_code_state
+              && VECTORP (Vlatin_extra_code_table)
               && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
              || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
              || (c == ISO_CODE_CSI
@@ -4217,21 +4251,28 @@
                | CODING_CATEGORY_MASK_UTF_16_LE);
 
       /* Or, we may have to consider the possibility of CCL.  */
-      if (coding_system_table[CODING_CATEGORY_IDX_CCL]
+      if (! null_byte_found
+         && coding_system_table[CODING_CATEGORY_IDX_CCL]
          && (coding_system_table[CODING_CATEGORY_IDX_CCL]
              ->spec.ccl.valid_codes)[c])
        try |= CODING_CATEGORY_MASK_CCL;
 
       mask = 0;
-      utf16_examined_p = iso2022_examined_p = 0;
       if (priorities)
        {
+         /* First, try detection with Latin extra codes disallowed.
+            If no proper coding system is found because of Latin extra
+            codes, try detection with Latin extra codes allowed.  */
+         latin_extra_code_state = 0;
+       label_retry:
+         utf16_examined_p = iso2022_examined_p = 0;
          for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
            {
              if (!iso2022_examined_p
                  && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
                {
-                 mask |= detect_coding_iso2022 (src, src_end, multibytep);
+                 mask |= detect_coding_iso2022 (src, src_end, multibytep,
+                                                &latin_extra_code_state);
                  iso2022_examined_p = 1;
                }
              else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
@@ -4252,16 +4293,40 @@
              else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
                mask |= detect_coding_ccl (src, src_end, multibytep);
              else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
+               {
+                 if (latin_extra_code_state == 1)
+                   {
+                     /* Detection of ISO-2022 based coding system
+                        failed because of Latin extra codes.  Before
+                        falling back to raw-text, try again with
+                        Latin extra codes allowed.  */
+                     latin_extra_code_state = 2;
+                     try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+                            | CODING_CATEGORY_MASK_ISO_8BIT);
+                     goto label_retry;
+                   }
                mask |= CODING_CATEGORY_MASK_RAW_TEXT;
+               }
              else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
+               {
+                 if (latin_extra_code_state == 1)
+                   {
+                     /* See the above comment.  */
+                     latin_extra_code_state = 2;
+                     try = (mask | CODING_CATEGORY_MASK_ISO_8_ELSE
+                            | CODING_CATEGORY_MASK_ISO_8BIT);
+                     goto label_retry;
+                   }
                mask |= CODING_CATEGORY_MASK_BINARY;
+               }
              if (mask & priorities[i])
                return priorities[i];
            }
          return CODING_CATEGORY_MASK_RAW_TEXT;
        }
       if (try & CODING_CATEGORY_MASK_ISO)
-       mask |= detect_coding_iso2022 (src, src_end, multibytep);
+       mask |= detect_coding_iso2022 (src, src_end, multibytep,
+                                      &latin_extra_code_state);
       if (try & CODING_CATEGORY_MASK_SJIS)
        mask |= detect_coding_sjis (src, src_end, multibytep);
       if (try & CODING_CATEGORY_MASK_BIG5)




reply via email to

[Prev in Thread] Current Thread [Next in Thread]