bug-gperf
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [bug-gperf] Add machine-readable output to gperf


From: Sei Lisa
Subject: Re: [bug-gperf] Add machine-readable output to gperf
Date: Mon, 21 Nov 2016 14:21:52 +0100
User-agent: Mozilla/5.0 (X11; Linux i686; rv:24.0) Gecko/20100101 Icedove/24.7.0

Bruno Haible wrote, On 2016-11-21 02:15:
> 
> In which language is the project written?

Pascal. It's using the multi-platform Lazarus IDE, http://www.lazarus-ide.org/

If you're asking in regards to whether it allows binding to a C library, the 
answer is probably yes (not sure about the portability issues that would need 
to be dealt with, though). However, the aim is to avoid any recompilation every 
time there's a change in the keywords, which is often. Just a gperf run should 
suffice.

For reference, here's the current parser of the hash function and keywords 
table (GPL3 licensed; the latter is still WIP):

procedure ReadHashFunc(GperfName: String; BuiltinsName: String);
const
  ParseAssoValuesStr = ' asso_values[] =';
  ParseNumEntriesStr = '#define MAX_HASH_VALUE ';
  ParseAssoStr = 'str[';
  ParseUseLastStr = 'len - 1]]';
  ParseWordlistStr = 'wordlist[] =';
var
  F: Text;
  L: String;
  Len: Integer;
  i: Integer;
  start: Integer;
  NumAux: Integer;
  ReadingAssoValuesTable: Boolean;
begin
  If NumEntries <> 0 then
  begin
    FreeMem(HashTable);
  end;
  { Parse the gperf C output to determine the hash function's parameters
    (unfortunately, gperf doesn't have a machine-readable output mode) }

  NumAux := 0;
  NumKeys := 0;
  ReadingAssoValuesTable := False;
  IncludeLast := False;

  Assign(F, GperfName);
  Reset(F);
  try
    While not Eof(F) do
    begin
      ReadLn(F, L);
      Len := Length(L);

      if ReadingAssoValuesTable then
      begin
        { asso_values[] goes into HashAux (one entry per char) }
        i := 1;
        While i <= Len do
        begin
          While (i <= Len) and not (L[i] in ['0'..'9']) do
            Inc(i);
          If i > Len then
            break;
          start := i;
          While (i <= Len) and (L[i] in ['0'..'9']) do
            Inc(i);
          If NumAux >= 256 then
            raise EParseGperf.Create('Too many entries in asso_values');
          HashAux[NumAux] := StrToInt(Copy(L, start, i - start));
          Inc(NumAux);
        end;

        if Pos('}', L) <> 0 then
        begin
          If NumAux <> 256 then
            raise EParseGperf.Create('Too few entries in asso_values');
          ReadingAssoValuesTable := False;
        end;
      end
      else
      begin
        { MAX_HASH_VALUE goes into NumEntries }
        if Copy(L, 1, Length(ParseNumEntriesStr)) = ParseNumEntriesStr then
          NumEntries := StrToInt(Copy(L, Length(ParseNumEntriesStr) + 1, Len)) 
+ 1
        else if Pos(ParseAssoValuesStr, L) <> 0 then
          ReadingAssoValuesTable := True
        else if Pos(ParseWordlistStr, L) <> 0 then
          Break
        else
        begin
          { Search for snippets of the form asso_values[str[n]+m].
            The n goes into HashKeys (zero-terminated).
            The m goes into HashOffsets. When absent (i.e. most of the time),
            it means zero (it's how the gperf algorithm deals with anagrams).
            IncludeLast is set to True if n is the string 'len - 1'.
            Note asso_values[str[len - 1]] never has +m. }

          repeat
            // Dot indicates position under consideration.
            // XXX indicates any text.
            // .XXXstr[n]+m]
            start := Pos(ParseAssoStr, L);
            If start = 0 then
              Break;
            L := Copy(L, start + Length(ParseAssoStr), Len);
            Len := Length(L);
            // str[.n]+m]
            If Copy(L, 1, Length(ParseUseLastStr)) = ParseUseLastStr then
              // asso_values[str[.len - 1]]
              // Rest is ignored, not skipped - next iteration will skip it.
              IncludeLast := True
            else
            begin
              i := 1;
              // skip number
              While (i <= Len) and (L[i] in ['0'..'9']) do
                Inc(i);
              // parse number into HashKeys
              If NumKeys >= 256 then
                raise EParseGperf.Create('Too many keys');
              HashKeys[NumKeys] := StrToInt(Copy(L, 1, i - 1));
              // str[n.]+m]
              if (i >= Len - 1) or  (L[i] <> ']') then
                raise EParseGperf.Create('Invalid syntax in Gperf code (missing 
or misplaced '']'' after ''str['')');
              Inc(i);
              If L[i] = '+' then
              begin
                Inc(i);
                start := i;
                // str[n]+.m]
                While (i <= Len) and (L[i] in ['0'..'9']) do
                  Inc(i);
                HashOffsets[NumKeys] := StrToInt(Copy(L, start, i - start));
              end
              else
              begin
                // str[n].]
                HashOffsets[NumKeys] := 0;
              end;
              If (i >= Len) or (L[i] <> ']') then
                raise EParseGperf.Create('Unexpected syntax in Gperf code 
(missing last close bracket)');
              Inc(NumKeys);
            end;
          until False;
        end;
      end

    end;
  finally
    Close(F);
  end;

  If NumEntries = 0 then
    raise EParseGperf.Create('Initialization failed');

  // Read keywords data

  HashTable := GetMem(NumEntries * SizeOf(THashTableEntry));
  For i := 0 to NumEntries - 1 do
    HashTable^[i].Entry := '';

  Assign(F, BuiltinsName);
  Reset(F);
  try
    While not Eof(F) do
    begin
      ReadLn(F, L);
      { TODO: Parse builtins.txt into HashTableEntry }
      i := HashValue(PChar(L), Length(L));
      If (i < 0) or (i >= NumEntries) or (HashTable^[i].Entry <> '') then
        raise EParseBuiltins.Create('Mismatch between builtins.txt and 
builtins.gperf, found on keyword: ''' + L + '''');

      HashTable^[i].Entry := L;
    end;
  finally
    Close(F);
  end;
end;

Note parsing the C program needs elaborated code that could fail in future 
versions. It could probably be simplified by using regular expressions, but my 
main concern is the stability of the output.

The hash value calculator should be reasonably fast, even if not so much as a 
version coming from a code generator would be:

function HashValue(P: PChar; Len: Integer): Integer; inline;
var
  Key: Integer;
  Sum: Integer;
  i: Integer;
begin
  Assert((NumEntries <> 0) and (NumKeys <> 0), 'Not Initialized');

  Sum := Len;
  If Len <> 0 then
  begin
    i := NumKeys;
    Repeat
      Dec(i);
      Key := HashKeys[i];
      If Key >= Len then
        Break;
      Sum := Sum + HashAux[Ord(P[Key]) + HashOffsets[i]];
    until i = 0;
    If IncludeLast then
      Sum := Sum + HashAux[Ord(P[Len - 1])];
  end;
  Result := Sum;
end;

I'm going to add an optimization that would select a version without 
HashOffsets when they are all zero, which is the case for the current language 
keywords.

It would also be nice to be able to tell gperf to not consider the last 
character, and let it determine a set of suitable keys otherwise. That would 
let me avoid the last condition.

Regards,
Sei



reply via email to

[Prev in Thread] Current Thread [Next in Thread]