/* charclass -- Tools to create and manipulate sets of C "char"s This module provides tools to create, modify, store and retrieve character classes, and provides tools tuned to the needs of RE lexical analysers. The class itself is an opaque type, referenced by a pointer while under construction, and later by an unique index when finalised. The module tries aggressively to reuse existing finalised classes, rather than create duplicates. Functions are provided to map between indexes and pointers. Because of the deduplication effort, the index reported for a class upon finalisation may map to a different pointer than the one supplied by new (). Classes may be shared between different lexer instances, although, at the time of writing (10 April 2014) it is not thread-safe. In many cases, there might only be one class under construction at any time, with the effort either finalised or abandoned quickly. However, this module recognises that sometimes multiple classes might be worked on in parallel, and so explicitly marks each allocated class area as one of "unused", "work" or "finalised". This marking is done by an array of state bytes dynamically allocated when the pool is created. Copyright (C) 1988, 1998, 2000, 2002, 2004-2005, 2007-2014 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */ /* Written June, 1988 by Mike Haertel Modified July, 1988 by Arthur David Olson to assist BMG speedups */ /* 2014: Repackaged by "untangle" script, written by behoffski. */ /* Always import environment-specific configuration items first. */ #include #include #include "charclass.h" #include #include #include #include #include /* for EOF assert test. */ #include #include /* for WEOF assert test. */ #include "xalloc.h" /* Lower bound for size of first pool in the list. */ /* ?? Set to 2 for pool debug; Use 10 in production? */ #define POOL_MINIMUM_INITIAL_SIZE 10 #ifndef MAX # define MAX(a,b) ((a) > (b) ? (a) : (b)) #endif #ifndef MIN # define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif /* We maintain a list-of-pools here, choosing to malloc a new slab of memory each time we run out, instead of a realloc strategy. This is so that we can provide a guarantee to the user that any class pointer issued remains valid for the lifetime of the module. */ typedef ptrdiff_t pool_list_index_t; /* Designator for each charclass in each pool. Note that enums are ints by default, but we use a single unsigned char per class in our explicit memory allocation. */ typedef enum { STATE_UNUSED = 0, STATE_WORKING = 1, STATE_FINALISED = 2 } charclass_state_t; typedef struct pool_info_struct { charclass_index_t first_index; size_t alloc; /* ?? Use pool_list_index_t type for these? */ size_t used; charclass_t *classes; /* Charclass designator byte array, one per item, allocated dynamically. */ unsigned char *class_state; } pool_t; static pool_list_index_t pool_list_used = 0; static pool_list_index_t pool_list_alloc = 0; static pool_t *pool_list = NULL; /* While the header only guarantees a 3-bit gutter at each end of each class, we use an entire integer (typically 32 bits) for the gutter, with 1 integer placed at the start of each pool, 1 integer as a shared gutter between each class, and 1 integer after the last class. This is why there is "(*int) + 1" code after class memory alloation calls. */ /* HPUX defines these as macros in sys/param.h. */ #ifdef setbit # undef setbit #endif #ifdef clrbit # undef clrbit #endif /* Number of bits in an unsigned char. */ #ifndef CHARBITS # define CHARBITS 8 #endif /* INTBITS need not be exact, just a lower bound. */ #ifndef INTBITS # define INTBITS (CHARBITS * sizeof (int)) #endif /* First integer value that is greater than any character code. */ #define NOTCHAR (1 << CHARBITS) /* Number of ints required to hold a bit for every character. */ #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS) /* Flesh out opaque charclass type given in the header */ /* The gutter integer following the class member storage also serves as the gutter integer before the next class in the list. Note that since the "gutter" notion explicitly includes negative values, members need to be signed ints, not unsigned ints, so that arithmetic shift right can be used (e.g. -8 >> 8 == -1, not -8 / 256 == 0). */ struct charclass_struct { int members[CHARCLASS_INTS]; int gutter_following; }; /* Define class bit operations: test, set and clear a bit. Grrrr. I wanted to exploit arithmetic right shift to come up with a really cheap and neat way of reducing small negative bit values, especially if b == EOF ==-1, to an index of -1 that falls neatly into the gutter, but strict C conformance does not guarantee this. The code below handles the two most likely scenarios, but, as with anything that is undefined, this is playing with fire. */ #if INT_MAX == 32767 # define INT_BITS_LOG2 4 /* log2(sizeof(int)) + log2(CHARBITS) */ #elif INT_MAX == 2147483647 # define INT_BITS_LOG2 5 /* log2(sizeof(int)) + log2(CHARBITS) */ #else # error "Not implemented: Architectures with ints other than 16 or 32 bits" #endif #if ((~0 >> 1) < 0) /* Arithmetic shift right: Both signed and unsigned cases are ok. */ # define ARITH_SHIFT_R_INT(b) ((b) >> INT_BITS_LOG2) #else /* Avoid using right shift if b is negative. The macro may evaluate b twice in some circumstances. */ # define ARITH_SHIFT_R_INT(b) \ (((b) < 0) ? -1 : ((b) >> INT_BITS_LOG2)) #endif bool _GL_ATTRIBUTE_PURE charclass_tstbit (int b, charclass_t const *ccl) { return ccl->members[ARITH_SHIFT_R_INT(b)] >> b % INTBITS & 1; } void charclass_setbit (int b, charclass_t *ccl) { ccl->members[ARITH_SHIFT_R_INT(b)] |= 1U << b % INTBITS; } void charclass_clrbit (int b, charclass_t *ccl) { ccl->members[ARITH_SHIFT_R_INT(b)] &= ~(1U << b % INTBITS); } void charclass_setbit_range (int start, int end, charclass_t *ccl) { int bit; /* Do nothing if the range doesn't make sense. */ if (end < start) return; if (start >= NOTCHAR) return; /* Clip the range to be in the interval [-1..NOTCHAR - 1] */ start = MAX(start, -1); end = MAX(end, -1); /* We know start is < NOTCHAR from the test above. */ end = MIN(end, NOTCHAR - 1); /* ?? Could check that ccl is a valid class, but not at present. */ /* Okay, loop through the range, bit-by-bit, setting members. */ for (bit = start; bit <= end; bit++) ccl->members[ARITH_SHIFT_R_INT(bit)] |= 1U << bit % INTBITS; } void charclass_clrbit_range (int start, int end, charclass_t *ccl) { int bit; /* Do nothing if the range doesn't make sense. */ if (end < start) return; if (start >= NOTCHAR) return; /* Clip the range to be in the interval [-1..NOTCHAR - 1] */ start = MAX(start, -1); end = MAX(end, -1); /* We know start is < NOTCHAR from the test above. */ end = MIN(end, NOTCHAR - 1); /* ?? Could check that ccl is a valid class, but not at present. */ /* Okay, loop through the range, bit-by-bit, clearing members. */ for (bit = start; bit <= end; bit++) ccl->members[ARITH_SHIFT_R_INT(bit)] &= ~(1U << bit % INTBITS); } /* Define whole-set operations: Copy, clear, invert, compare and union */ void charclass_copyset (charclass_t const *src, charclass_t *dst) { memcpy (dst->members, src->members, sizeof(src->members)); } void charclass_zeroset (charclass_t *ccl) { memset (ccl->members, 0, sizeof(ccl->members)); } void charclass_notset (charclass_t *ccl) { int i; for (i = 0; i < CHARCLASS_INTS; ++i) ccl->members[i] = ~ccl->members[i]; } int _GL_ATTRIBUTE_PURE charclass_equal (charclass_t const *ccl1, charclass_t const *ccl2) { return memcmp (ccl1->members, ccl2->members, sizeof(ccl1->members)) == 0; } void charclass_unionset (charclass_t const *src, charclass_t *dst) { int i; for (i = 0; i < CHARCLASS_INTS; ++i) dst->members[i] |= src->members[i]; } void charclass_intersectset (charclass_t const *src, charclass_t *dst) { int i; for (i = 0; i < CHARCLASS_INTS; ++i) dst->members[i] &= src->members[i]; } /* #ifdef DEBUG */ /* Nybble (4bit)-to-char conversion array for little-bit-endian nybbles. */ static const char *disp_nybble = "084c2a6e195d3b7f"; /* Return a static string describing a class (Note: not reentrant). */ char * charclass_describe (charclass_t const *ccl) { /* The string should probably be less than 90 chars, but overcompensate for limited uncertainty introduced by the %p formatting operator. */ static char buf[256]; char *p_buf = buf; int i; p_buf += sprintf (p_buf, "0x%08lx:", (unsigned long) ccl); for (i = 0; i < CHARCLASS_INTS; i += 2) { int j = ccl->members[i]; *p_buf++ = ' '; *p_buf++ = disp_nybble[(j >> 0) & 0x0f]; *p_buf++ = disp_nybble[(j >> 4) & 0x0f]; *p_buf++ = disp_nybble[(j >> 8) & 0x0f]; *p_buf++ = disp_nybble[(j >> 12) & 0x0f]; *p_buf++ = disp_nybble[(j >> 16) & 0x0f]; *p_buf++ = disp_nybble[(j >> 20) & 0x0f]; *p_buf++ = disp_nybble[(j >> 24) & 0x0f]; *p_buf++ = disp_nybble[(j >> 28) & 0x0f]; j = ccl->members[i + 1]; *p_buf++ = disp_nybble[(j >> 0) & 0x0f]; *p_buf++ = disp_nybble[(j >> 4) & 0x0f]; *p_buf++ = disp_nybble[(j >> 8) & 0x0f]; *p_buf++ = disp_nybble[(j >> 12) & 0x0f]; *p_buf++ = disp_nybble[(j >> 16) & 0x0f]; *p_buf++ = disp_nybble[(j >> 20) & 0x0f]; *p_buf++ = disp_nybble[(j >> 24) & 0x0f]; *p_buf++ = disp_nybble[(j >> 28) & 0x0f]; } *p_buf++ = '\0'; return buf; } /* static */ void debug_pools (const char *label, bool class_contents) { pool_list_index_t pool_nr; size_t class_nr; printf ("\nPool %p debug(%s): [alloc, used: %ld %ld]\n", pool_list, label, pool_list_alloc, pool_list_used); for (pool_nr = 0; pool_nr < pool_list_used; pool_nr++) { pool_t *pool = &pool_list[pool_nr]; printf ( " %3ld: first_index, alloc, used, classes: %4ld %3lu %3lu %p\n", pool_nr, pool->first_index, pool->alloc, pool->used, pool->classes); printf (" class_states: "); for (class_nr = 0; class_nr < pool->alloc; class_nr++) switch (pool->class_state[class_nr]) { case STATE_UNUSED: putchar ('.'); break; case STATE_WORKING: putchar ('w'); break; case STATE_FINALISED: putchar ('F'); break; default: printf ("?%02x", pool->class_state[class_nr]); } putchar ('\n'); } /* If class contents requested, print them out as well. */ if (class_contents) for (pool_nr = 0; pool_nr < pool_list_used; pool_nr++) { pool_t *pool = &pool_list[pool_nr]; for (class_nr = 0; class_nr < pool->used; class_nr++) printf ("%s\n", charclass_describe (&pool->classes[class_nr])); } } /* #endif * DEBUG */ static pool_t * add_new_pool (void) { pool_t *prev, *pool; size_t pool_class_alloc; charclass_t *alloc_mem; /* If the pools list is full, use x2nrealloc to expand its size. */ if (pool_list_used == pool_list_alloc) pool_list = x2nrealloc (pool_list, &pool_list_alloc, sizeof (pool_t)); /* Find the size of the last charclass pool in the (old) list. Scale up the size so that malloc activity will decrease as the number of pools increases. Also, add 1 here as we knock off 1 to use as a gutter later. */ prev = &pool_list[pool_list_used - 1]; pool_class_alloc = (prev->alloc * 5 / 2) + 1; alloc_mem = XNMALLOC (pool_class_alloc, charclass_t); /* Set up the new pool, shifting the alloc pointer to create the gutter preceding the first class of the pool. */ pool = &pool_list[pool_list_used++]; pool->classes = alloc_mem + 1; pool->first_index = prev->first_index + prev->alloc; pool->alloc = pool_class_alloc - 1; pool->used = 0; pool->class_state = xzalloc (pool->alloc); return pool; } charclass_t * charclass_alloc (void) { pool_list_index_t pool_nr; charclass_t *class; pool_t *pool = NULL; size_t class_nr; size_t class_last_nr; int *gutter_preceding; /* Locate a pool with unused entries (if any). */ for (pool_nr = 0; pool_nr < pool_list_used; pool_nr++) { pool = &pool_list[pool_nr]; /* Try use the earliest pool possible, first by filling in a hole from a withdrawn class, or by grabbing an unused class from the end of the list. */ class_last_nr = MIN(pool->used + 1, pool->alloc); for (class_nr = 0; class_nr < class_last_nr; class_nr++) { if (pool->class_state[class_nr] == STATE_UNUSED) goto found_pool_and_class; } } /* No space found, so prepare a new pool and make this class its first element. */ pool = add_new_pool (); class_nr = 0; /* FALLTHROUGH */ found_pool_and_class: /* Mark the found class state as working, zero its elements, and return class pointer to caller. Zeroing is needed as this class may have been previously worked on, but then abandoned or withdrawn. */ pool->class_state[class_nr] = STATE_WORKING; if (class_nr >= pool->used) pool->used = class_nr + 1; class = &pool->classes[class_nr]; /* Zero out the class' members, and also the gutters on each side. */ memset (class, 0, sizeof (*class)); gutter_preceding = ((int *) class) - 1; *gutter_preceding = 0; return class; } pool_t * _GL_ATTRIBUTE_PURE find_class_pool (charclass_t const *ccl) { pool_list_index_t pool_nr; pool_t *pool = NULL; ptrdiff_t class_ptr_offset; /* Locate the pool whose memory address space covers this class. */ /* ?? Perhaps check &pool->classes[pool->alloc] in this first loop, and then check that the index is in the "used" portion later, so we can diagnose malformed pointers more exactly. */ for (pool_nr = 0; pool_nr < pool_list_used; pool_nr++) { pool = &pool_list[pool_nr]; if ((pool->classes <= ccl) && (ccl < &pool->classes[pool->alloc])) goto found_pool; } /* No credible pool candidate was found. */ assert ("find_class_pool: no pool found"); return NULL; found_pool: /* Make sure the class clearly lies on an array boundary within the pool's memory allocation. */ class_ptr_offset = (char *) ccl - (char *) pool->classes; if ((class_ptr_offset % sizeof (charclass_t)) != 0) { /* Pointer does not lie at the start of a pool member. */ assert ("find_class_pool: pointer not aligned."); return NULL; } return pool; } static void withdraw_class (charclass_t *ccl, pool_t *class_pool) { pool_t *pool; size_t class_nr; int *gutter_preceding; /* Use pool reference if given, otherwise work back from the class pointer to find the associated pool. */ pool = (class_pool != NULL) ? class_pool : find_class_pool (ccl); if (pool == NULL) assert (!"Could not locate a pool for this charclass"); /* Zero out the gutters each side of the class. */ ccl->gutter_following = 0; gutter_preceding = ((int *) ccl) - 1; *gutter_preceding = 0; /* Work out the class index in the pool. */ class_nr = ccl - pool->classes; pool->class_state[class_nr] = STATE_UNUSED; /* Is this the last item within the pool's class list? */ if (class_nr == pool->used - 1) { /* Yes, reduce the pool member count by 1. */ pool->used--; return; } } /* Finish off creating a class, and report an index that can be used to reference the class. */ charclass_index_t charclass_finalise (charclass_t *ccl) { int *gutter_preceding; pool_list_index_t pool_nr; pool_t *pool; charclass_t *found = NULL; size_t class_nr; pool_t *my_pool = NULL; size_t my_class_nr = 0; /* Search all pools for a finalised class matching this class, and, if found, use it in preference to the new one. While searching, also record where the work class is located. If we can't find ourselves, the pointer is invalid, and throw an assertion. */ for (pool_nr = 0; pool_nr < pool_list_used; pool_nr++) { pool = &pool_list[pool_nr]; for (class_nr = 0; class_nr < pool->used; class_nr++) { charclass_t *search = &pool->classes[class_nr]; /* Have we found ourselves in the list? */ if (search == ccl) { /* Yes, remember this place in case no duplicate is found. */ my_pool = pool; my_class_nr = class_nr; } if (pool->class_state[class_nr] != STATE_FINALISED) continue; if (charclass_equal (search, ccl)) { /* Another class, finalised, matches: Use it in preference to potentially creating a duplicate. */ withdraw_class (ccl, my_pool); found = search; goto found_matching_class; } } } /* No duplicate found... but make sure the search pointer is known. */ assert (my_pool != NULL); assert (my_pool->class_state[my_class_nr] == STATE_WORKING); /* Prepare to convert the search (work) class into a finalised class. */ pool = my_pool; class_nr = my_class_nr; found = &pool->classes[class_nr]; /* FALLTHROUGH */ found_matching_class: /* Clear out the gutter integers each side of the class entry. */ gutter_preceding = found->members - 1; *gutter_preceding = 0; found->gutter_following = 0; pool->class_state[class_nr] = STATE_FINALISED; /* Return the index of the class. */ return pool->first_index + class_nr; } void charclass_abandon (charclass_t *ccl) { withdraw_class (ccl, NULL); } /* Additional functions to help clients work with classes. */ charclass_t * _GL_ATTRIBUTE_PURE charclass_get_pointer (charclass_index_t const index) { pool_list_index_t pool_nr; pool_t *pool; /* Does this class match any class we've seen previously? */ for (pool_nr = 0; pool_nr < pool_list_used; pool_nr++) { /* Is the index inside this pool? */ pool = &pool_list[pool_nr]; if (pool->first_index <= index && index < (pool->first_index + pool->used)) { /* Yes, find the pointer within the pool and return it. */ return &pool->classes[index - pool->first_index]; } } /* The mapping above should never fail; we could return NULL, but we choose to abort instead. */ assert (!"index-to-charclass mapping failed"); return NULL; } charclass_index_t _GL_ATTRIBUTE_PURE charclass_get_index (charclass_t const *ccl) { pool_t *pool; /* This code is similar to charclass_finalise... perhaps merge? */ pool = find_class_pool (ccl); if (pool == NULL) return -1; /* Report the index to the caller. */ return pool->first_index + (ccl - pool->classes); } /* Functions to initialise module on startup, and to shut down and release acquired resources at exit. */ void charclass_initialise (size_t initial_pool_size) { size_t initial_alloc; charclass_t *alloc_mem; pool_t *pool; charclass_t *ccl; charclass_index_t zeroclass_index; /* Usually EOF = WEOF = -1, but the standard merely states that they must be a negative integer. We test for -1 here as it's a prime target for a "permitted" gutter value, and different values might be a problem. */ assert (EOF == -1); assert (WEOF == -1); /* First, set up the list-of-pools structure with initial storage. */ pool_list_alloc = 4; pool_list = (pool_t *) xnmalloc (pool_list_alloc, sizeof (pool_t)); /* If initial pool size is small, inflate it here as we prefer to waste a little memory, rather than issue many calls to xmalloc (). This minimum also ensures that our double-up pool size strategy has a sane starting point. */ initial_alloc = MAX(initial_pool_size, POOL_MINIMUM_INITIAL_SIZE); /* Set up the first pool using our chosen first alloc size. Allocate an extra class, and offset the pool by this amount, in order to accommodate the initial gutter integer. (Note for the future: If charclass alignment becomes significant, then sizeof (charclass) and this offset may need to be changed, perhaps for SIMD instructions.) */ pool_list_used = 1; pool = &pool_list[0]; pool->first_index = 0; pool->alloc = initial_alloc; pool->used = 0; alloc_mem = XNMALLOC (pool->alloc + 1, charclass_t); pool->classes = alloc_mem + 1; pool->class_state = xzalloc (pool->alloc); /* Enforce the all-zeroes class to be the first class. This is needed as "abandon" may leave a hole in a pool in some cases, and in these cases we need to ensure that no-one else picks it up by accident (as this would invalidate the guarantee that the module eliminates all duplicates, from the point of view of the user). So, we set the first class to all-zeroes, and also zero out abandoned classes where a hole is unavoidable. */ ccl = charclass_alloc (); /* Alloc delivers an all-zeroes class. */ zeroclass_index = charclass_finalise (ccl); assert (zeroclass_index == 0); /* debug_pools ("add_new_pool: zeroclass added"); */ } void charclass_destroy (void) { int i; int *alloc_mem; /* First, discard the charclass memory associated with each pool, including catering for the offset used upon creation. */ for (i = 0; i < pool_list_used; i++) { alloc_mem = (int *) pool_list[i].classes; free (alloc_mem - 1); } /* Second, free up the pool list itself. */ free (pool_list); } /* vim:set shiftwidth=2: */