emacs-diffs
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Emacs-diffs] Changes to emacs/src/search.c


From: Kenichi Handa
Subject: [Emacs-diffs] Changes to emacs/src/search.c
Date: Thu, 31 Mar 2005 20:05:46 -0500

Index: emacs/src/search.c
diff -c emacs/src/search.c:1.190 emacs/src/search.c:1.191
*** emacs/src/search.c:1.190    Sat Nov 27 01:08:45 2004
--- emacs/src/search.c  Fri Apr  1 01:05:46 2005
***************
*** 293,299 ****
    CHECK_STRING (string);
    bufp = compile_pattern (string, &search_regs,
                          (!NILP (current_buffer->case_fold_search)
!                          ? DOWNCASE_TABLE : Qnil),
                          posix,
                          !NILP (current_buffer->enable_multibyte_characters));
  
--- 293,299 ----
    CHECK_STRING (string);
    bufp = compile_pattern (string, &search_regs,
                          (!NILP (current_buffer->case_fold_search)
!                          ? current_buffer->case_canon_table : Qnil),
                          posix,
                          !NILP (current_buffer->enable_multibyte_characters));
  
***************
*** 399,405 ****
  
    bufp = compile_pattern (regexp, &search_regs,
                          (!NILP (current_buffer->case_fold_search)
!                          ? DOWNCASE_TABLE : Qnil),
                          posix,
                          STRING_MULTIBYTE (string));
    immediate_quit = 1;
--- 399,405 ----
  
    bufp = compile_pattern (regexp, &search_regs,
                          (!NILP (current_buffer->case_fold_search)
!                          ? current_buffer->case_canon_table : Qnil),
                          posix,
                          STRING_MULTIBYTE (string));
    immediate_quit = 1;
***************
*** 499,505 ****
    regexp = string_make_unibyte (regexp);
    re_match_object = Qt;
    bufp = compile_pattern (regexp, 0,
!                         Vascii_downcase_table, 0,
                          0);
    immediate_quit = 1;
    val = re_search (bufp, string, len, 0, len, 0);
--- 499,505 ----
    regexp = string_make_unibyte (regexp);
    re_match_object = Qt;
    bufp = compile_pattern (regexp, 0,
!                         Vascii_canon_table, 0,
                          0);
    immediate_quit = 1;
    val = re_search (bufp, string, len, 0, len, 0);
***************
*** 516,522 ****
    int val;
    struct re_pattern_buffer *bufp;
  
!   bufp = compile_pattern (regexp, 0, Vascii_downcase_table,
                          0, STRING_MULTIBYTE (string));
    immediate_quit = 1;
    re_match_object = string;
--- 516,522 ----
    int val;
    struct re_pattern_buffer *bufp;
  
!   bufp = compile_pattern (regexp, 0, Vascii_canon_table,
                          0, STRING_MULTIBYTE (string));
    immediate_quit = 1;
    re_match_object = string;
***************
*** 1175,1181 ****
        unsigned char *patbuf;
        int multibyte = !NILP (current_buffer->enable_multibyte_characters);
        unsigned char *base_pat = SDATA (string);
!       int charset_base = -1;
        int boyer_moore_ok = 1;
  
        /* MULTIBYTE says whether the text to be searched is multibyte.
--- 1175,1183 ----
        unsigned char *patbuf;
        int multibyte = !NILP (current_buffer->enable_multibyte_characters);
        unsigned char *base_pat = SDATA (string);
!       /* Set to nonzero if we find a non-ASCII char that needs
!        translation.  */
!       int charset_base = 0;
        int boyer_moore_ok = 1;
  
        /* MULTIBYTE says whether the text to be searched is multibyte.
***************
*** 1221,1229 ****
        base_pat = raw_pattern;
        if (multibyte)
        {
          while (--len >= 0)
            {
!             unsigned char str[MAX_MULTIBYTE_LENGTH];
              int c, translated, inverse;
              int in_charlen, charlen;
  
--- 1223,1239 ----
        base_pat = raw_pattern;
        if (multibyte)
        {
+         /* Fill patbuf with translated characters in STRING while
+            checking if we can use boyer-moore search.  If TRT is
+            non-nil, we can use boyer-moore search only if TRT can be
+            represented by the byte array of 256 elements.  For that,
+            all non-ASCII case-equivalents of all case-sensitive
+            characters in STRING must belong to the same charset and
+            row.  */
+ 
          while (--len >= 0)
            {
!             unsigned char str_base[MAX_MULTIBYTE_LENGTH], *str;
              int c, translated, inverse;
              int in_charlen, charlen;
  
***************
*** 1233,1282 ****
              if (RE && *base_pat == '\\')
                {
                  len--;
                  len_byte--;
                  base_pat++;
                }
  
              c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
  
!             /* Translate the character, if requested.  */
!             TRANSLATE (translated, trt, c);
!             /* If translation changed the byte-length, go back
!                to the original character.  */
!             charlen = CHAR_STRING (translated, str);
!             if (in_charlen != charlen)
!               {
!                 translated = c;
!                 charlen = CHAR_STRING (c, str);
!               }
! 
!             /* If we are searching for something strange,
!                an invalid multibyte code, don't use boyer-moore.  */
!             if (! ASCII_BYTE_P (translated)
!                 && (charlen == 1 /* 8bit code */
!                     || charlen != in_charlen /* invalid multibyte code */
!                     ))
!               boyer_moore_ok = 0;
! 
!             TRANSLATE (inverse, inverse_trt, c);
! 
!             /* Did this char actually get translated?
!                Would any other char get translated into it?  */
!             if (translated != c || inverse != c)
!               {
!                 /* Keep track of which character set row
!                    contains the characters that need translation.  */
!                 int charset_base_code = c & ~CHAR_FIELD3_MASK;
!                 int inverse_charset_base = inverse & ~CHAR_FIELD3_MASK;
! 
!                 if (charset_base_code != inverse_charset_base)
!                   boyer_moore_ok = 0;
!                 else if (charset_base == -1)
!                   charset_base = charset_base_code;
!                 else if (charset_base != charset_base_code)
!                   /* If two different rows appear, needing translation,
!                      then we cannot use boyer_moore search.  */
!                   boyer_moore_ok = 0;
                }
  
              /* Store this character into the translated pattern.  */
--- 1243,1304 ----
              if (RE && *base_pat == '\\')
                {
                  len--;
+                 raw_pattern_size--;
                  len_byte--;
                  base_pat++;
                }
  
              c = STRING_CHAR_AND_LENGTH (base_pat, len_byte, in_charlen);
  
!             if (NILP (trt))
!               {
!                 str = base_pat;
!                 charlen = in_charlen;
!               }
!             else
!               {
!                 /* Translate the character.  */
!                 TRANSLATE (translated, trt, c);
!                 charlen = CHAR_STRING (translated, str_base);
!                 str = str_base;
! 
!                 /* Check if C has any other case-equivalents.  */
!                 TRANSLATE (inverse, inverse_trt, c);
!                 /* If so, check if we can use boyer-moore.  */
!                 if (c != inverse && boyer_moore_ok)
!                   {
!                     /* Check if all equivalents belong to the same
!                        charset & row.  Note that the check of C
!                        itself is done by the last iteration.  Note
!                        also that we don't have to check ASCII
!                        characters because boyer-moore search can
!                        always handle their translation.  */
!                     while (1)
!                       {
!                         if (! ASCII_BYTE_P (inverse))
!                           {
!                             if (SINGLE_BYTE_CHAR_P (inverse))
!                               {
!                                 /* Boyer-moore search can't handle a
!                                    translation of an eight-bit
!                                    character.  */
!                                 boyer_moore_ok = 0;
!                                 break;
!                               }
!                             else if (charset_base == 0)
!                               charset_base = inverse & ~CHAR_FIELD3_MASK;
!                             else if ((inverse & ~CHAR_FIELD3_MASK)
!                                      != charset_base)
!                               {
!                                 boyer_moore_ok = 0;
!                                 break;
!                               }
!                           }
!                         if (c == inverse)
!                           break;
!                         TRANSLATE (inverse, inverse_trt, inverse);
!                       }
!                   }
                }
  
              /* Store this character into the translated pattern.  */
***************
*** 1300,1305 ****
--- 1322,1328 ----
              if (RE && *base_pat == '\\')
                {
                  len--;
+                 raw_pattern_size--;
                  base_pat++;
                }
              c = *base_pat++;
***************
*** 1533,1548 ****
      return n;
  }
  
! /* Do Boyer-Moore search N times for the string PAT,
     whose length is LEN/LEN_BYTE,
     from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
     DIRECTION says which direction we search in.
     TRT and INVERSE_TRT are translation tables.
  
!    This kind of search works if all the characters in PAT that have
!    nontrivial translation are the same aside from the last byte.  This
!    makes it possible to translate just the last byte of a character,
!    and do so after just a simple test of the context.
  
     If that criterion is not satisfied, do not call this function.  */
  
--- 1556,1573 ----
      return n;
  }
  
! /* Do Boyer-Moore search N times for the string BASE_PAT,
     whose length is LEN/LEN_BYTE,
     from buffer position POS/POS_BYTE until LIM/LIM_BYTE.
     DIRECTION says which direction we search in.
     TRT and INVERSE_TRT are translation tables.
+    Characters in BASE_PAT are already translated by TRT.
  
!    This kind of search works if all the characters in BASE_PAT that
!    have nontrivial translation are the same aside from the last byte.
!    This makes it possible to translate just the last byte of a
!    character, and do so after just a simple test of the context.
!    CHARSET_BASE is nonzero iff there is such a non-ASCII character.
  
     If that criterion is not satisfied, do not call this function.  */
  
***************
*** 1569,1576 ****
    int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
  
    unsigned char simple_translate[0400];
!   int translate_prev_byte = 0;
!   int translate_anteprev_byte = 0;
  
  #ifdef C_ALLOCA
    int BM_tab_space[0400];
--- 1594,1606 ----
    int multibyte = ! NILP (current_buffer->enable_multibyte_characters);
  
    unsigned char simple_translate[0400];
!   /* These are set to the preceding bytes of a byte to be translated
!      if charset_base is nonzero.  As the maximum byte length of a
!      multibyte character is 4, we have to check at most three previous
!      bytes.  */
!   int translate_prev_byte1 = 0;
!   int translate_prev_byte2 = 0;
!   int translate_prev_byte3 = 0;
  
  #ifdef C_ALLOCA
    int BM_tab_space[0400];
***************
*** 1636,1641 ****
--- 1666,1688 ----
    for (i = 0; i < 0400; i++)
      simple_translate[i] = i;
  
+   if (charset_base)
+     {
+       /* Set up translate_prev_byte1/2/3 from CHARSET_BASE.  Only a
+        byte following them is the target of translation.  */
+       int sample_char = charset_base | 0x20;
+       unsigned char str[MAX_MULTIBYTE_LENGTH];
+       int len = CHAR_STRING (sample_char, str);
+ 
+       translate_prev_byte1 = str[len - 2];
+       if (len > 2)
+       {
+         translate_prev_byte2 = str[len - 3];
+         if (len > 3)
+           translate_prev_byte3 = str[len - 4];
+       }
+     }
+ 
    i = 0;
    while (i != infinity)
      {
***************
*** 1645,1701 ****
        i = infinity;
        if (! NILP (trt))
        {
!         int ch;
!         int untranslated;
!         int this_translated = 1;
! 
!         if (multibyte
!             /* Is *PTR the last byte of a character?  */
!             && (pat_end - ptr == 1 || CHAR_HEAD_P (ptr[1])))
!           {
!             unsigned char *charstart = ptr;
!             while (! CHAR_HEAD_P (*charstart))
!               charstart--;
!             untranslated = STRING_CHAR (charstart, ptr - charstart + 1);
!             if (charset_base == (untranslated & ~CHAR_FIELD3_MASK))
!               {
!                 TRANSLATE (ch, trt, untranslated);
!                 if (! CHAR_HEAD_P (*ptr))
!                   {
!                     translate_prev_byte = ptr[-1];
!                     if (! CHAR_HEAD_P (translate_prev_byte))
!                       translate_anteprev_byte = ptr[-2];
!                   }
!               }
!             else
!               {
!                 this_translated = 0;
!                 ch = *ptr;
!               }
!           }
!         else if (!multibyte)
!           TRANSLATE (ch, trt, *ptr);
!         else
!           {
!             ch = *ptr;
!             this_translated = 0;
            }
  
!         if (ch > 0400)
!           j = ((unsigned char) ch) | 0200;
!         else
!           j = (unsigned char) ch;
! 
          if (i == infinity)
            stride_for_teases = BM_tab[j];
  
          BM_tab[j] = dirlen - i;
          /* A translation table is accompanied by its inverse -- see */
          /* comment following downcase_table for details */
!         if (this_translated)
            {
              int starting_ch = ch;
!             int starting_j = j;
              while (1)
                {
                  TRANSLATE (ch, inverse_trt, ch);
--- 1692,1728 ----
        i = infinity;
        if (! NILP (trt))
        {
!         /* If the byte we are currently looking at is the head of a
!            character whose case-equivalents must be checked, set CH
!            to that character.  An ASCII character and a non-ASCII
!            character matching CHARSET_BASE are to be checked.  */
!         int ch = -1;
! 
!         if (ASCII_BYTE_P (*ptr) || ! multibyte)
!           ch = *ptr;
!         else if (charset_base && CHAR_HEAD_P (*ptr))
!           {
!             ch = STRING_CHAR (ptr, pat_end - ptr);
!             if (charset_base != (ch & ~CHAR_FIELD3_MASK))
!               ch = -1;
            }
  
!         j = *ptr;
          if (i == infinity)
            stride_for_teases = BM_tab[j];
  
          BM_tab[j] = dirlen - i;
          /* A translation table is accompanied by its inverse -- see */
          /* comment following downcase_table for details */
!         if (ch >= 0)
            {
              int starting_ch = ch;
!             int starting_j;
! 
!             if (ch > 0400)
!               starting_j = ((unsigned char) ch) | 0200;
!             else
!               starting_j = (unsigned char) ch;
              while (1)
                {
                  TRANSLATE (ch, inverse_trt, ch);
***************
*** 1821,1829 ****
                          || ((cursor == tail_end_ptr
                               || CHAR_HEAD_P (cursor[1]))
                              && (CHAR_HEAD_P (cursor[0])
!                                 || (translate_prev_byte == cursor[-1]
!                                     && (CHAR_HEAD_P (translate_prev_byte)
!                                         || translate_anteprev_byte == cursor[-2])))))
                        ch = simple_translate[*cursor];
                      else
                        ch = *cursor;
--- 1848,1860 ----
                          || ((cursor == tail_end_ptr
                               || CHAR_HEAD_P (cursor[1]))
                              && (CHAR_HEAD_P (cursor[0])
!                                 /* Check if this is the last byte of
!                                    a translatable character.  */
!                                 || (translate_prev_byte1 == cursor[-1]
!                                     && (CHAR_HEAD_P (translate_prev_byte1)
!                                         || (translate_prev_byte2 == cursor[-2]
!                                             && (CHAR_HEAD_P (translate_prev_byte2)
!                                                 || (translate_prev_byte3 == cursor[-3]))))))))
                        ch = simple_translate[*cursor];
                      else
                        ch = *cursor;
***************
*** 1901,1909 ****
                      || ((ptr == tail_end_ptr
                           || CHAR_HEAD_P (ptr[1]))
                          && (CHAR_HEAD_P (ptr[0])
!                             || (translate_prev_byte == ptr[-1]
!                                 && (CHAR_HEAD_P (translate_prev_byte)
!                                     || translate_anteprev_byte == ptr[-2])))))
                    ch = simple_translate[*ptr];
                  else
                    ch = *ptr;
--- 1932,1944 ----
                      || ((ptr == tail_end_ptr
                           || CHAR_HEAD_P (ptr[1]))
                          && (CHAR_HEAD_P (ptr[0])
!                             /* Check if this is the last byte of a
!                                translatable character.  */
!                             || (translate_prev_byte1 == ptr[-1]
!                                 && (CHAR_HEAD_P (translate_prev_byte1)
!                                     || (translate_prev_byte2 == ptr[-2]
!                                         && (CHAR_HEAD_P (translate_prev_byte2)
!                                             || translate_prev_byte3 == ptr[-3])))))))
                    ch = simple_translate[*ptr];
                  else
                    ch = *ptr;




reply via email to

[Prev in Thread] Current Thread [Next in Thread]