[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH] New function fstrcmp_if_higher.
From: |
Bruno Haible |
Subject: |
Re: [PATCH] New function fstrcmp_if_higher. |
Date: |
Sun, 14 Sep 2008 21:16:32 +0200 |
User-agent: |
KMail/1.5.4 |
Hi Ralf,
An excellent patch! I measure a speedup of "msgmerge af.po coreutils.pot"
from 376 sec. to 152 sec.
I changed your patch a bit
1. to avoid new function calls - more inlining.
2. to include a formal proof of the upper bound.
3. To leave the modified fstrcmp function the freedom what to return
when the result is known to be < bound - either 0 or the original
result. This saves a few instructions at the end.
I also changed the test code to be easier to read. Applied this:
2008-09-14 Ralf Wildenhues <address@hidden>
* lib/fstrcmp.h (fstrcmp_bounded): New declaration.
(fstrcmp): Define in terms of fstrcmp_bounded.
* lib/fstrcmp.c (fstrcmp_bounded): Renamed from fstrcmp. Add
lower_bound argument.
Return quickly if the result is certainly < lower_bound.
* tests/test-fstrcmp.c (check_fstrcmp): Test also fstrcmp_bounded.
*** lib/fstrcmp.h.orig 2008-09-14 21:04:36.000000000 +0200
--- lib/fstrcmp.h 2008-09-14 19:03:36.000000000 +0200
***************
*** 1,5 ****
/* Fuzzy string comparison.
! Copyright (C) 1995, 2000, 2002-2003, 2006 Free Software Foundation, Inc.
This file was written by Peter Miller <address@hidden>
--- 1,6 ----
/* Fuzzy string comparison.
! Copyright (C) 1995, 2000, 2002-2003, 2006, 2008 Free Software
! Foundation, Inc.
This file was written by Peter Miller <address@hidden>
***************
*** 24,32 ****
#endif
/* Fuzzy compare of S1 and S2. Return a measure for the similarity of S1
! and S1. The higher the result, the more similar the strings are. */
extern double fstrcmp (const char *s1, const char *s2);
#ifdef __cplusplus
}
#endif
--- 25,43 ----
#endif
/* Fuzzy compare of S1 and S2. Return a measure for the similarity of S1
! and S1. The higher the result, the more similar the strings are.
! The result is bounded between 0 (meaning very dissimilar strings) and
! 1 (meaning identical strings). */
extern double fstrcmp (const char *s1, const char *s2);
+ /* Like fstrcmp (S1, S2), except that if the result is < LOWER_BOUND, an
+ arbitrary other value < LOWER_BOUND can be returned. */
+ extern double fstrcmp_bounded (const char *s1, const char *s2,
+ double lower_bound);
+
+ /* A shortcut for fstrcmp. Avoids a function call. */
+ #define fstrcmp(s1,s2) fstrcmp_bounded (s1, s2, 0.0)
+
#ifdef __cplusplus
}
#endif
*** lib/fstrcmp.c.orig 2008-09-14 21:04:36.000000000 +0200
--- lib/fstrcmp.c 2008-09-14 20:59:40.000000000 +0200
***************
*** 97,142 ****
gl_once_define(static, keys_init_once)
- /* NAME
- fstrcmp - fuzzy string compare
-
- SYNOPSIS
- double fstrcmp(const char *, const char *);
-
- DESCRIPTION
- The fstrcmp function may be used to compare two string for
- similarity. It is very useful in reducing "cascade" or
- "secondary" errors in compilers or other situations where
- symbol tables occur.
-
- RETURNS
- double; 0 if the strings are entirly dissimilar, 1 if the
- strings are identical, and a number in between if they are
- similar. */
-
double
! fstrcmp (const char *string1, const char *string2)
{
struct context ctxt;
! int xvec_length;
! int yvec_length;
int i;
size_t fdiag_len;
int *buffer;
size_t bufmax;
/* set the info for each string. */
ctxt.xvec = string1;
- xvec_length = strlen (string1);
ctxt.yvec = string2;
- yvec_length = strlen (string2);
-
- /* short-circuit obvious comparisons */
- if (xvec_length == 0 && yvec_length == 0)
- return 1.0;
- if (xvec_length == 0 || yvec_length == 0)
- return 0.0;
/* Set TOO_EXPENSIVE to be approximate square root of input size,
bounded below by 256. */
--- 97,148 ----
gl_once_define(static, keys_init_once)
double
! fstrcmp_bounded (const char *string1, const char *string2, double lower_bound)
{
struct context ctxt;
! int xvec_length = strlen (string1);
! int yvec_length = strlen (string2);
int i;
size_t fdiag_len;
int *buffer;
size_t bufmax;
+ /* short-circuit obvious comparisons */
+ if (xvec_length == 0 || yvec_length == 0)
+ return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0);
+
+ if (lower_bound > 0)
+ {
+ /* Compute a quick upper bound.
+ Each edit is an insertion or deletion of an element, hence modifies
+ the length of the sequence by at most 1.
+ Therefore, when starting from a sequence X and ending at a sequence Y,
+ with N edits, | yvec_length - xvec_length | <= N. (Proof by
+ induction over N.)
+ So, at the end, we will have
+ xvec_edit_count + yvec_edit_count >= | xvec_length - yvec_length |.
+ and hence
+ result
+ = (xvec_length + yvec_length - (xvec_edit_count + yvec_edit_count))
+ / (xvec_length + yvec_length)
+ <= (xvec_length + yvec_length - | yvec_length - xvec_length |)
+ / (xvec_length + yvec_length)
+ = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length).
+ */
+ volatile double upper_bound =
+ (double) (2 * MIN (xvec_length, yvec_length))
+ / (xvec_length + yvec_length);
+
+ if (upper_bound < lower_bound)
+ /* Return an arbitrary value < LOWER_BOUND. */
+ return 0.0;
+ }
+
/* set the info for each string. */
ctxt.xvec = string1;
ctxt.yvec = string2;
/* Set TOO_EXPENSIVE to be approximate square root of input size,
bounded below by 256. */
*** tests/test-fstrcmp.c.orig 2008-09-14 21:04:36.000000000 +0200
--- tests/test-fstrcmp.c 2008-09-14 19:00:42.000000000 +0200
***************
*** 45,52 ****
compliant by default, to avoid that msgmerge results become platform and
compiler option dependent. 'volatile' is a portable alternative to gcc's
-ffloat-store option. */
! volatile double result = fstrcmp (string1, string2);
! return (result == expected);
}
int
--- 45,77 ----
compliant by default, to avoid that msgmerge results become platform and
compiler option dependent. 'volatile' is a portable alternative to gcc's
-ffloat-store option. */
! {
! volatile double result = fstrcmp (string1, string2);
! if (!(result == expected))
! return false;
! }
! {
! volatile double result = fstrcmp_bounded (string1, string2, expected);
! if (!(result == expected))
! return false;
! }
! {
! double bound = expected * 0.5; /* implies bound <= expected */
! volatile double result = fstrcmp_bounded (string1, string2, bound);
! if (!(result == expected))
! return false;
! }
! {
! double bound = (1 + expected) * 0.5;
! if (expected < bound)
! {
! volatile double result = fstrcmp_bounded (string1, string2, bound);
! if (!(result < bound))
! return false;
! }
! }
!
! return true;
}
int
- msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/14
- [PATCH] New function fstrcmp_if_higher., Ralf Wildenhues, 2008/09/14
- Re: [PATCH] New function fstrcmp_if_higher.,
Bruno Haible <=
- [PATCH] Implement premature termination of compareseq., Ralf Wildenhues, 2008/09/14
- Message not available
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/15
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Bruno Haible, 2008/09/15
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/19
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Bruno Haible, 2008/09/20
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Ralf Wildenhues, 2008/09/20
- Re: msgmerge speedup: fstrcmp and diffseq improvements, Paolo Bonzini, 2008/09/21
- Re: profile-directed optimization, Bruno Haible, 2008/09/21