eliot-dev
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Eliot-dev] Changes to eliot/dic/compdic.c [antoine-1]


From: eliot-dev
Subject: [Eliot-dev] Changes to eliot/dic/compdic.c [antoine-1]
Date: Sun, 23 Oct 2005 13:13:58 -0400

Index: eliot/dic/compdic.c
diff -u /dev/null eliot/dic/compdic.c:1.5.2.1
--- /dev/null   Sun Oct 23 17:13:58 2005
+++ eliot/dic/compdic.c Sun Oct 23 17:13:56 2005
@@ -0,0 +1,347 @@
+/* Eliot                                                                     */
+/* Copyright (C) 1999  Antoine Fraboulet                                     */
+/*                                                                           */
+/* This file is part of Eliot.                                               */
+/*                                                                           */
+/* Eliot is free software; you can redistribute it and/or modify             */
+/* it under the terms of the GNU General Public License as published by      */
+/* the Free Software Foundation; either version 2 of the License, or         */
+/* (at your option) any later version.                                       */
+/*                                                                           */
+/* Elit is distributed in the hope that it will be useful,                   */
+/* but WITHOUT ANY WARRANTY; without even the implied warranty of            */
+/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             */
+/* GNU General Public License for more details.                              */
+/*                                                                           */
+/* You should have received a copy of the GNU General Public License         */
+/* along with this program; if not, write to the Free Software               */
+/* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA 
*/
+
+<<<<<<< compdic.c
+/* $Id: compdic.c,v 1.5.2.1 2005/10/23 17:13:56 afrab Exp $ */
+
+/**
+ *  \file   compdic.c
+ *  \brief  Program used to compress a dictionary
+ *  \author Antoine Fraboulet
+ *  \date   1999
+=======
+/* 
+ * $Id: compdic.c,v 1.5.2.1 2005/10/23 17:13:56 afrab Exp $ 
+>>>>>>> 1.5
+ */
+
+#include <time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+#include "hashtable.h"
+#include "dic_internals.h"
+
+//#define DEBUG_LIST
+//#define DEBUG_OUTPUT
+//#define DEBUG_OUTPUT_L2
+#define CHECK_RECURSION
+
+char*
+load_uncompressed(const char* file_name, unsigned int *dic_size)
+{
+  unsigned r;
+  char *uncompressed;
+  FILE* file_desc;
+
+  if ((file_desc = fopen (file_name, "r")) == NULL)
+    return NULL;
+
+  if ((uncompressed = (char*)malloc (sizeof(char)*(*dic_size))) == NULL)
+    return NULL;
+
+  r = fread (uncompressed, 1, *dic_size, file_desc);
+  if (r < *dic_size)
+    {
+      /* \n is 2 chars under MS OS */
+      printf("\n");
+      printf("** The number of bytes read is less than the size of the file 
**\n");
+      printf("** this may be OK if you run a Microsoft OS but not on Unix   
**\n");
+      printf("** please check the results.                                  
**\n");
+      printf("\n");
+      *dic_size = r;
+    }
+
+  fclose(file_desc);
+  return uncompressed;
+}
+
+
+int
+file_length(const char* file_name)
+{
+  struct stat stat_buf;
+  if (stat (file_name, &stat_buf) < 0)
+    return - 1;
+  return (int) stat_buf.st_size;
+}
+
+
+void
+skip_init_header(FILE* outfile, Dict_header *header)
+{
+  header->unused_1   = 0;
+  header->unused_2   = 0;
+  header->root       = 0;
+  header->nwords     = 0;
+  header->nodesused  = 1;
+  header->edgesused  = 1;
+  header->nodessaved = 0;
+  header->edgessaved = 0;
+
+  fwrite (header, sizeof(Dict_header), 1, outfile);
+}
+
+
+void
+fix_header(FILE* outfile, Dict_header* header)
+{
+  strcpy(header->ident,_COMPIL_KEYWORD_);
+  header->root = header->edgesused;
+  rewind (outfile);
+  fwrite (header, sizeof(Dict_header), 1, outfile);
+}
+
+
+void
+print_header_info(Dict_header *header)
+{
+  printf("============================\n");
+  printf("keyword length %d bytes\n",(int)strlen(_COMPIL_KEYWORD_));
+  printf("keyword size   %d bytes\n",sizeof(_COMPIL_KEYWORD_));
+  printf("header size    %d bytes\n",sizeof(Dict_header));
+  printf("\n");
+  printf("%d words\n",header->nwords);
+  printf("\n");
+  printf("root : %7d (edge)\n",header->root);
+  printf("root : %7d (byte)\n",header->root * sizeof(Dawg_edge));
+  printf("\n");
+  printf("nodes : %d+%d\n",header->nodesused, header->nodessaved);
+  printf("edges : %d+%d\n",header->edgesused, header->edgessaved);
+  printf("============================\n");
+}
+
+
+void
+write_node(Dawg_edge *edges, int size, int num, FILE* outfile)
+{
+#ifdef DEBUG_OUTPUT
+  int i;
+  printf("writing %d edges\n",num);
+  for(i=0; i<num; i++)
+    {
+#ifdef DEBUG_OUTPUT_L2
+      printf("ptr=%2d t=%d l=%d f=%d chr=%2d (%c)\n",
+            edges[i].ptr, edges[i].term, edges[i].last,
+            edges[i].fill, edges[i].chr, edges[i].chr -1 +'a');
+#endif
+      fwrite (edges+i, sizeof(Dawg_edge), 1, outfile);
+    }
+#else
+  fwrite (edges, size, num, outfile);
+#endif
+}
+
+#define MAX_STRING_LENGTH 200
+
+
+#define MAX_EDGES 2000
+/* ods3: ??   */
+/* ods4: 1746 */
+
+/* global variables */
+FILE*       global_outfile;
+Dict_header global_header;
+Hash_table  global_hashtable;
+
+char        global_stringbuf[MAX_STRING_LENGTH]; /* Space for current string */
+char*       global_endstring;                    /* Marks END of current 
string */
+char*       global_input;
+char*       global_endofinput;
+
+/*
+ * Makenode takes a prefix (as position relative to stringbuf) and
+ * returns an index of the start node of a dawg that recognizes all
+ * words beginning with that prefix.  String is a pointer (relative
+ * to stringbuf) indicating how much of prefix is matched in the
+ * input.
+ */
+#ifdef CHECK_RECURSION
+int current_rec =0;
+int max_rec = 0;
+#endif
+
+unsigned int
+makenode(char *prefix)
+{
+  int    numedges;
+  Dawg_edge  edges[MAX_EDGES];
+  Dawg_edge *edgeptr = edges;
+  unsigned  *saved_position;
+
+#ifdef CHECK_RECURSION
+  current_rec++;
+  if (current_rec > max_rec)
+    max_rec = current_rec;
+#endif
+
+  while (prefix == global_endstring)
+    {
+      /* More edges out of node */
+      edgeptr->ptr  = 0;
+      edgeptr->term = 0;
+      edgeptr->last = 0;
+      edgeptr->fill = 0;
+      edgeptr->chr  = 0;
+
+      (*(edgeptr++)).chr = (*global_endstring++ = *global_input++) & 
DIC_CHAR_MASK;
+      if (*global_input == '\n')                 /* End of a word */
+        {
+          global_header.nwords++;
+          edgeptr[-1].term = 1;                  /* Mark edge as word */
+          *global_endstring++ = *global_input++; /* Skip \n */
+          if (global_input == global_endofinput) /* At end of input? */
+            break;
+
+         global_endstring = global_stringbuf;
+         while(*global_endstring == *global_input)
+           {
+              global_endstring++;
+              global_input++;
+           }
+        }
+      /* make dawg pointed to by this edge */
+      edgeptr[-1].ptr = makenode(prefix + 1);
+    }
+
+  numedges = edgeptr - edges;
+  if (numedges == 0)
+    {
+#ifdef CHECK_RECURSION
+      current_rec --;
+#endif
+      return 0;             /* Special node zero - no edges */
+    }
+
+  edgeptr[-1].last = 1;     /* Mark the last edge */
+
+  saved_position = (unsigned int*) hash_find (global_hashtable,
+                                             (void*)edges,
+                                             numedges*sizeof(Dawg_edge));
+  if (saved_position)
+    {
+      global_header.edgessaved += numedges;
+      global_header.nodessaved++;
+
+#ifdef CHECK_RECURSION
+      current_rec --;
+#endif
+      return *saved_position;
+    }
+  else
+    {
+      unsigned int node_pos;
+
+      node_pos = global_header.edgesused;
+      hash_add(global_hashtable,
+              (void*)edges,numedges*sizeof(Dawg_edge),
+              
(void*)(&global_header.edgesused),sizeof(global_header.edgesused));
+      global_header.edgesused += numedges;
+      global_header.nodesused++;
+      write_node (edges, sizeof(Dawg_edge), numedges, global_outfile);
+
+#ifdef CHECK_RECURSION
+      current_rec --;
+#endif
+      return node_pos;
+    }
+}
+
+
+
+
+int
+main(int argc, char* argv[])
+{
+  unsigned int dicsize;
+  char *uncompressed;
+  Dawg_edge rootnode = {0,0,0,0,0};
+  Dawg_edge specialnode = {0,0,0,0,0};
+
+  char* outfilename;
+  char outfilenamedefault[] = "dict.daw";
+  clock_t starttime, endtime;
+
+  if (argc < 2)
+    {
+      fprintf(stderr,"usage: %s uncompressed_dic [compressed_dic]\n",argv[0]);
+      exit(1);
+    }
+
+  dicsize = file_length (argv[1]);
+  if (dicsize < 0)
+    {
+      fprintf(stderr,"Cannot stat uncompressed dictionary %s\n",argv[1]);
+      exit(1);
+    }
+
+  outfilename = (argc == 3) ? argv[2] : outfilenamedefault;
+
+  if ((global_outfile = fopen (outfilename,"wb")) == NULL)
+    {
+      fprintf(stderr,"Cannot open output file %s\n",outfilename);
+      exit(1);
+    }
+
+  if ((uncompressed = load_uncompressed(argv[1], &dicsize)) == NULL)
+    {
+      fprintf(stderr,"Cannot load uncompressed dictionary into memory\n");
+      exit(1);
+    }
+
+  global_input = uncompressed;
+  global_endofinput = global_input + dicsize;
+
+#define SCALE 0.6
+  global_hashtable = hash_init((unsigned int)(dicsize * SCALE));
+#undef SCALE
+
+  skip_init_header(global_outfile,&global_header);
+
+  specialnode.last = 1;
+  write_node(&specialnode,sizeof(specialnode),1,global_outfile);
+  /*
+   * Call makenode with null (relative to stringbuf) prefix;
+   * Initialize string to null; Put index of start node on output
+   */
+  starttime=clock();
+  rootnode.ptr = makenode(global_endstring = global_stringbuf);
+  endtime=clock();
+  write_node(&rootnode,sizeof(rootnode),1,global_outfile);
+
+  fix_header(global_outfile,&global_header);
+
+  print_header_info(&global_header);
+  hash_destroy(global_hashtable);
+  free(uncompressed);
+  fclose(global_outfile);
+
+  printf(" Elapsed time is                 : %f s\n", 1.0*(endtime-starttime) 
/ CLOCKS_PER_SEC);
+#ifdef CHECK_RECURSION
+  printf(" Maximum recursion level reached : %d\n",max_rec);
+#endif
+  return 0;
+}
+
+




reply via email to

[Prev in Thread] Current Thread [Next in Thread]