Initial revision

author: Greg McGary <greg@mcgary.org> 1997-04-18 06:32:28 +0000
committer: Greg McGary <greg@mcgary.org> 1997-04-18 06:32:28 +0000
commit: a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb (patch)
tree: 71fa757f5a4f098864af027e41f3189b3613390f /mkid.c
parent: 017635dde1deb6e6fb5b6ab83f0c73727f9eb49b (diff)
download: idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.tar.gz
idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.tar.bz2
idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.zip
1 files changed, 1091 insertions, 0 deletions
diff --git a/mkid.c b/mkid.c
new file mode 100644
index 0000000..39da994
--- /dev/null
+++ b/mkid.c
@@ -0,0 +1,1091 @@
+/* mkid.c -- build an identifer database
+   Copyright (C) 1986, 1995 Greg McGary
+  
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+  
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+  
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to the
+   Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include "config.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <assert.h>
+#include <stdio.h>
+#define fileno(FP) ((FP)->_fileno)
+#include <string.h>
+#include "strxtra.h"
+#include <ctype.h>
+#include <errno.h>
+#include "alloc.h"
+#include "idfile.h"
+#include "idarg.h"
+#include "token.h"
+#include "bitops.h"
+#include "misc.h"
+#include "filenames.h"
+#include "scanners.h"
+
+struct summary
+{
+  struct token **sum_tokens;
+  unsigned char const *sum_hits;
+  struct summary *sum_parent;
+  struct summary *sum_kids[8];
+  long sum_tokens_size;
+  long sum_hits_count;
+  int sum_free_index;
+  int sum_level;
+};
+
+#define MAX_LEVELS 5		/* log_8 of the max # of files log_8 32768 == 5 */
+
+struct token
+{
+  unsigned short tok_count;
+  unsigned char tok_flags;
+  unsigned char tok_hits[MAX_LEVELS];
+  char tok_name[1];
+};
+
+struct token **hash_table;	/* Vector of token pointers */
+
+char *bitsset (char *s1, char const *s2, int n);
+char *bitsclr (char *s1, char const *s2, int n);
+char *bitsand (char *s1, char const *s2, int n);
+char *bitsxor (char *s1, char const *s2, int n);
+int bitstst (char const *s1, char const *s2, int n);
+int bitsany (char const *s, int n);
+int round2(int rough);
+struct token *make_token (char const *name, int);
+void scan_1_file (char const *(*get_token) (FILE*, int*), FILE *source_FILE);
+struct idarg *parse_idargs (int argc, char **argv);
+struct idarg *parse_idargs_from_FILE (FILE *arg_FILE, struct idarg *idarg);
+void init_hash_table (int file_name_count);
+void scan_files (struct idarg *idarg);
+void report_statistics (void);
+
+void init_hash (long size);
+struct token **hash_lookup (char const *key);
+unsigned int string_hash_1 (char const *key);
+unsigned int string_hash_2 (char const *key);
+void rehash (void);
+
+void write_idfile (char const *id_file, struct idarg *idargs);
+void bump_current_hits_signature (void);
+void init_hits_signature (int index);
+int bit_to_index (int bit);
+int compare_tokens (void const *x, void const *y);
+void free_summary_tokens (void);
+void summarize (void);
+void assert_hits (struct summary *summary);
+void write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits);
+void sign_token (struct token *token);
+void add_token_to_summary (struct summary *summary, struct token *token);
+void init_summary (void);
+struct summary *make_sibling_summary (struct summary *summary);
+int count_vec_size (struct summary *summary, unsigned char const *tail_hits);
+int count_buf_size (struct summary *summary, unsigned char const *tail_hits);
+void usage (void);
+
+/* Miscellaneous statistics */
+long name_tokens;
+long number_tokens;
+long string_tokens;
+long literal_tokens;
+long comment_tokens;
+long occurrences;
+long heap_size;
+long hits_length = 0;
+long tokens_length = 0;
+long output_length = 0;
+
+/* Hash table maintenance */
+long hash_size;			/* # of slots */
+long hash_capacity;		/* # of usable slots */
+long hash_fill;			/* # of keys inserted in table */
+long hash_probes = 0;
+long hash_lookups = 0;
+int hash_rehashes = 0;
+
+int verbose_flag = 0;
+int statistics_flag = 1;
+
+int args_count = 0;		/* # of args to save */
+int scan_count = 0;		/* # of files to scan */
+int file_name_count = 0;	/* # of files in database */
+int levels = 0;			/* ceil(log(8)) of file_name_count */
+
+unsigned char current_hits_signature[MAX_LEVELS];
+#define INIT_TOKENS_SIZE(level) (1 << ((level) + 13))
+struct summary *summary_root;
+struct summary *summary_leaf;
+
+char PWD_name[BUFSIZ];		/* The current working directory */
+char absolute_idfile_name[BUFSIZ]; /* The absolute name of the database */
+char const *id_file = IDFILE;
+
+char const *program_name;
+
+void
+usage (void)
+{
+  fprintf (stderr, "\
+Usage: %s [-v] [-f<idfile>] [(+|-)l[<lang>]] [(+|-)S<scanarg>] [-a<argfile>] [-] [-u] [files...]\n\
+	-v	 Verbose: print reports of progress\n\
+	-a<file> Open file for arguments\n\
+	-	 Read newline-separated args from stdin\n\
+	-l<lang> Force files to be scanned as <lang> until +l<lang>\n\
+	-S<lang>-<arg> Pass arg to <lang> scanner\n\
+	-S.<suffix>=<lang> Scan files with .<suffix> as <lang>\n\
+	-S<lang>? Print usage documentation for <lang>\n\
+	-u	 Update an existing database (unimplemented)\n\
+\n\
+Version %s; Made %s %s\n",
+	   program_name, FULL_VERSION, __DATE__, __TIME__);
+
+  exit (1);
+}
+
+int
+main (int argc, char **argv)
+{
+  struct idarg *idarg_0;
+  char const *sbrk0;
+
+  program_name = basename (GETARG (argc, argv));
+  if (kshgetwd (PWD_name) == NULL)
+    {
+      fprintf (stderr, "%s: cannot get current working directory name.\n", program_name);
+      return 1;
+    }
+  strcat (PWD_name, "/");
+
+  idarg_0 = parse_idargs (argc, argv);
+  if (idarg_0 == NULL)
+    {
+      fprintf (stderr, "Nothing to do...\n");
+      return 0;
+    }
+
+  sbrk0 = (char const *)sbrk (0);
+  init_hash (scan_count * 64);
+
+  strcpy (absolute_idfile_name, span_file_name (PWD_name, id_file));
+  if (access (id_file, 06) < 0
+      && (errno != ENOENT || access (dirname (id_file), 06) < 0))
+    {
+      filerr ("modify", id_file);
+      return 1;
+    }
+  
+  init_hits_signature (0);
+  init_summary ();
+  init_scanners ();
+  
+  scan_files (idarg_0);
+
+  if (hash_fill == 0)
+    return 0;
+
+  free_summary_tokens ();
+  free (hash_table);
+
+  write_idfile (id_file, idarg_0);
+  heap_size = (char const *)sbrk (0) - sbrk0;
+
+  if (statistics_flag)
+    report_statistics ();
+  return 0;
+}
+
+void
+scan_files (struct idarg *idarg)
+{
+  int keep_lang = 0;
+
+  for ( ; idarg->ida_next; idarg = idarg->ida_next)
+    {
+      char const *(*scanner) (FILE*, int*);
+      FILE *source_FILE;
+      char *arg = idarg->ida_arg;
+      char const *lang_name = NULL;
+      char const *suff;
+      char const *filter;
+
+      if (idarg->ida_index < 0)
+	{
+	  int op = *arg++;
+	  switch (*arg++)
+	    {
+	    case 'l':
+	      if (*arg == '\0')
+		{
+		  keep_lang = 0;
+		  lang_name = NULL;
+		  break;
+		}
+	      if (op == '+')
+		keep_lang = 1;
+	      lang_name = arg;
+	      break;
+	    case 'S':
+	      set_scan_args (op, strdup (arg));
+	      break;
+	    default:
+	      usage ();
+	    }
+	  continue;
+	}
+      if (!(idarg->ida_flags & IDA_SCAN_ME))
+	goto skip;
+
+      suff = strrchr (arg, '.');
+      if (lang_name == NULL)
+	{
+	  if (suff == NULL)
+	    suff = "";
+	  lang_name = get_lang_name (suff);
+	  if (lang_name == NULL)
+	    lang_name = get_lang_name ("");
+	  if (lang_name == NULL)
+	    {
+	      fprintf (stderr, "%s: No language assigned to suffix: `%s'\n", program_name, suff);
+	      goto skip;
+	    }
+	}
+      scanner = get_scanner (lang_name);
+      if (scanner == NULL)
+	{
+	  fprintf (stderr, "%s: No scanner for language: `%s'\n", program_name, lang_name);
+	  goto skip;
+	}
+      filter = get_filter (suff);
+      source_FILE = open_source_FILE (arg, filter);
+      if (source_FILE == NULL)
+	goto skip;
+      if (verbose_flag)
+	{
+	  if (filter)
+	    {
+	      printf ("%s: ", lang_name);
+	      printf (filter, arg);
+	      putchar ('\n');
+	    }
+	  else
+	    printf ("%s: %s\n", lang_name, arg);
+	}
+      scan_1_file (scanner, source_FILE);
+      close_source_FILE (source_FILE, filter);
+    skip:
+      if (!keep_lang)
+	lang_name = NULL;
+      if (idarg->ida_index < file_name_count)
+	{
+	  if (current_hits_signature[0] & 0x80)
+	    summarize ();
+	  bump_current_hits_signature ();
+	}
+    }
+}
+
+void
+report_statistics (void)
+{
+  printf ("Name=%ld, ", name_tokens);
+  printf ("Number=%ld, ", number_tokens);
+  printf ("String=%ld, ", string_tokens);
+  printf ("Literal=%ld, ", literal_tokens);
+  printf ("Comment=%ld\n", comment_tokens);
+
+  printf ("Files=%d, ", scan_count);
+  printf ("Tokens=%ld, ", occurrences);
+  printf ("Bytes=%ld Kb, ", input_chars / 1024);
+  printf ("Heap=%ld Kb, ", heap_size / 1024);
+  printf ("Output=%ld (%ld tok, %ld hit)\n", output_length, tokens_length, hits_length);
+
+  printf ("Load=%ld/%ld=%.2f, ", hash_fill, hash_size,
+	  (double) hash_fill / (double) hash_size);
+  printf ("Rehash=%d, ", hash_rehashes);
+  printf ("Probes=%ld/%ld=%.2f, ", hash_probes, hash_lookups,
+	  (double) hash_probes / (double) hash_lookups);
+  printf ("Freq=%ld/%ld=%.2f\n", occurrences, hash_fill,
+	  (double) occurrences / (double) hash_fill);
+}
+
+struct idarg *
+parse_idargs (int argc, char **argv)
+{
+  struct idarg *idarg;
+  struct idarg *idarg_0;
+  char *arg;
+  int op;
+  FILE *arg_FILE = NULL; 
+  int args_from = 0;
+  enum {
+      AF_CMDLINE = 0x1,		/* file args came on command line */
+      AF_FILE = 0x2,		/* file args came from a file (-f<file>) */
+      AF_IDFILE = 0x4,		/* file args came from an old ID file (-u) */
+      AF_QUERY = 0x8
+  };				/* no file args necessary: usage query */
+
+  idarg = idarg_0 = CALLOC(struct idarg, 1);
+
+  /* Process some arguments, and snarf-up some others for processing
+     later.  */
+  while (argc)
+    {
+      arg = GETARG (argc, argv);
+      if (*arg != '-' && *arg != '+')
+	{
+	  /* arguments are from command line (not pipe) */
+	  args_from |= AF_CMDLINE;
+	  idarg->ida_arg = arg;
+	  idarg->ida_flags = IDA_SCAN_ME;
+	  idarg->ida_index = file_name_count++;
+	  scan_count++;
+	  idarg = (idarg->ida_next = CALLOC(struct idarg, 1));
+
+	  continue;
+	}
+      op = *arg++;
+      switch (*arg++)
+	{
+	case 'u':
+#if 0
+	  args_from |= AF_IDFILE;
+	  old_idarg (id_file, &idarg);
+	  break;
+#endif
+	case '\0':
+	  args_from |= AF_FILE;
+	  idarg = parse_idargs_from_FILE (stdin, idarg);
+	  break;
+	case 'a':
+	  arg_FILE = fopen (arg, "r");
+	  if (arg_FILE == NULL)
+	    filerr ("open", arg);
+	  else
+	    {
+	      args_from |= AF_FILE;
+	      idarg = parse_idargs_from_FILE (arg_FILE, idarg);
+	    }
+	  break;
+	case 'f':
+	  id_file = arg;
+	  break;
+	case 'v':
+	  verbose_flag = 1;
+	  break;
+	case 'S':
+	  if (strchr (&arg[-2], '?'))
+	    {
+	      set_scan_args (op, arg);
+	      args_from |= AF_QUERY;
+	    }
+	  /*FALLTHROUGH */
+	case 'l':
+	case 's':
+	case 'r':
+	  idarg->ida_arg = &arg[-2];
+	  idarg->ida_index = -1;
+	  idarg = (idarg->ida_next = CALLOC(struct idarg, 1));
+
+	  args_count++;
+	  break;
+	default:
+	  usage ();
+	}
+    }
+
+  if (args_from & AF_QUERY)
+    exit (0);
+  /* File args should only come from one place.  Ding the user if
+     arguments came from multiple places, or if none were supplied at
+     all.  */
+  switch (args_from)
+    {
+    case AF_CMDLINE:
+    case AF_FILE:
+    case AF_IDFILE:
+      if (file_name_count > 0)
+	break;
+      /*FALLTHROUGH */
+    case 0:
+      fprintf (stderr, "%s: Use -u, -f<file>, or cmd-line for file args!\n", program_name);
+      usage ();
+    default:
+      fprintf (stderr, "%s: Use only one of: -u, -f<file>, or cmd-line for file args!\n", program_name);
+      usage ();
+    }
+
+  if (scan_count == 0)
+    return NULL;
+
+  return idarg_0;
+}
+
+
+/* Cons up a list of idarg as supplied in a file. */
+struct idarg *
+parse_idargs_from_FILE (FILE *arg_FILE, struct idarg *idarg)
+{
+  int file_count;
+  char buf[BUFSIZ];
+  char *arg;
+
+  file_count = 0;
+  while (fgets (buf, sizeof (buf), arg_FILE))
+    {
+      idarg->ida_arg = arg = strndup (buf, strlen (buf) - 1);
+      if (*arg == '+' || *arg == '-')
+	idarg->ida_index = -1;
+      else
+	{
+	  idarg->ida_flags = IDA_SCAN_ME;
+	  idarg->ida_index = file_name_count++;
+	  scan_count++;
+	}
+      idarg = idarg->ida_next = CALLOC (struct idarg, 1);
+    }
+  return idarg;
+}
+
+void
+scan_1_file (char const *(*get_token) (FILE*, int*), FILE *source_FILE)
+{
+  struct token **slot;
+  char const *key;
+  int flags;
+
+  while ((key = (*get_token) (source_FILE, &flags)) != NULL)
+    {
+      struct token *token = *(slot = hash_lookup (key));
+
+      if (token)
+	{
+	  token->tok_flags |= flags;
+	  if (token->tok_count < USHRT_MAX)
+	    token->tok_count++;
+	  if (!(token->tok_hits[0] & current_hits_signature[0]))
+	    sign_token (token);
+	} else {
+	  *slot = token = make_token (key, flags);
+	  sign_token (token);
+	  if (hash_fill++ >= hash_capacity)
+	    rehash ();
+	}
+    }
+}
+
+/* As the database is written, may need to adjust the file names.  If
+   we are generating the ID file in a remote directory, then adjust
+   the file names to be relative to the location of the ID database.
+
+   (This would be a common useage if you want to make a database for a
+   directory which you have no write access to, so you cannot create
+   the ID file.)  */
+void 
+write_idfile (char const *id_file, struct idarg *idarg)
+{
+  struct token **tokens;
+  int i;
+  FILE *id_FILE;
+  int lasti;
+  struct idhead idh;
+  int fixup_names;
+  char *lsl;
+  int buf_size;
+  int vec_size;
+  int tok_size;
+  int max_buf_size = 0;
+  int max_vec_size = 0;
+
+  if (verbose_flag)
+    printf ("Writing `%s'...\n", id_file);
+  lsl = strrchr (relative_file_name (PWD_name, absolute_idfile_name), '/');
+  if (lsl == NULL)
+    {
+      /* The database is in the cwd, don't adjust the names */
+      fixup_names = 0;
+    }
+  else
+    {
+      /* The database is not in cwd, adjust names so they are relative
+	 to the location of the database, make absolute_idfile_name just be the
+	 directory path to ID.  */
+      fixup_names = 1;
+      *(lsl + 1) = '\0';
+    }
+  id_FILE = fopen (id_file, "w+b");
+  if (id_FILE == NULL)
+    {
+      filerr ("create", id_file);
+      exit (1);
+    }
+  strncpy (idh.idh_magic, IDH_MAGIC, sizeof (idh.idh_magic));
+  idh.idh_version = IDH_VERSION;
+  idh.idh_pad_1 = 0;
+  idh.idh_flags = IDH_COUNTS;
+
+  /* write out the list of pathnames */
+  fseek (id_FILE, sizeof_idhead (), 0);
+  idh.idh_args_offset = ftell (id_FILE);
+  for (i = lasti = 0; idarg->ida_next; idarg = idarg->ida_next)
+    {
+      if (idarg->ida_index > 0)
+	while (++lasti < idarg->ida_index)
+	  {
+	    i++;
+	    putc ('\0', id_FILE);
+	  }
+      if (fixup_names)
+	fputs (relative_file_name (absolute_idfile_name, span_file_name (PWD_name, idarg->ida_arg)), id_FILE);
+      else
+	fputs (idarg->ida_arg, id_FILE);
+      i++;
+      putc ('\0', id_FILE);
+    }
+  idh.idh_args = i;
+  idh.idh_paths = file_name_count;
+
+  /* write out the list of identifiers */
+
+  putc ('\0', id_FILE);
+  putc ('\0', id_FILE);
+  idh.idh_tokens_offset = ftell(id_FILE);
+  
+  assert (summary_root->sum_hits_count == hash_fill);
+  tokens = REALLOC (summary_root->sum_tokens, struct token *, hash_fill);
+  qsort (tokens, hash_fill, sizeof (struct token *), compare_tokens);
+  for (i = 0; i < hash_fill; i++, tokens++)
+    {
+      struct token *token = *tokens;
+      if (*token->tok_name == '\0')
+	{
+	  fprintf (stderr, "Blank token!\n");
+	  hash_fill--;
+	  i--;
+	  continue;
+	}
+      occurrences += token->tok_count;
+      if (token->tok_flags & TOK_NUMBER)
+	number_tokens++;
+      if (token->tok_flags & TOK_NAME)
+	name_tokens++;
+      if (token->tok_flags & TOK_STRING)
+	string_tokens++;
+      if (token->tok_flags & TOK_LITERAL)
+	literal_tokens++;
+      if (token->tok_flags & TOK_COMMENT)
+	comment_tokens++;
+
+      fputs (token->tok_name, id_FILE);
+      putc ('\0', id_FILE);
+      if (token->tok_count > 0xff)
+	token->tok_flags |= TOK_SHORT_COUNT;
+      putc (token->tok_flags, id_FILE);
+      putc (token->tok_count & 0xff, id_FILE);
+      if (token->tok_flags & TOK_SHORT_COUNT)
+	putc (token->tok_count >> 8, id_FILE);
+
+      vec_size = count_vec_size (summary_root, token->tok_hits + levels);
+      buf_size = count_buf_size (summary_root, token->tok_hits + levels);
+      hits_length += buf_size;
+      tok_size = strlen (token->tok_name) + 1;
+      tokens_length += tok_size;
+      buf_size += tok_size + sizeof (token->tok_flags) + sizeof (token->tok_count) + 2;
+      if (buf_size > max_buf_size)
+	  max_buf_size = buf_size;
+      if (vec_size > max_vec_size)
+	  max_vec_size = vec_size;
+
+      write_hits (id_FILE, summary_root, token->tok_hits + levels);
+      putc ('\0', id_FILE);
+      putc ('\0', id_FILE);
+    }
+  assert_hits (summary_root);
+  idh.idh_tokens = hash_fill;
+  output_length = ftell (id_FILE);
+  idh.idh_end_offset = output_length - 2;
+  idh.idh_buf_size = max_buf_size;
+  idh.idh_vec_size = max_vec_size;
+
+  write_idhead (id_FILE, &idh);
+  fclose (id_FILE);
+}
+
+void
+init_hash (long size)
+{
+  hash_size = round2 (size);
+  if (hash_size < 512)
+    hash_size = 512;
+#if 0
+  if (hash_size > 0x8000)
+    hash_size = 0x8000;
+#endif
+  if (hash_size > (128 * 1024))
+      hash_size /= 2;
+  hash_capacity = hash_size - (hash_size >> 4);	/* about 94% */
+  hash_table =  CALLOC (struct token *, hash_size);
+  assert (hash_table);
+}
+
+/* Use double hashing with open addressing.
+   The table size is always a power of two.  */
+struct token **
+hash_lookup (char const *key)
+{
+  unsigned int hash1 = string_hash_1 (key) % hash_size;
+  unsigned int hash2 = 0;
+  struct token **slot = &hash_table[hash1];
+
+  hash_lookups++;
+  for (;;)
+    {
+      hash_probes++;
+      if (*slot == NULL || strcmp (key, (*slot)->tok_name) == 0)
+	return slot;
+
+      if (!hash2)
+	  hash2 = string_hash_2 (key);
+      hash1 = (hash1 + hash2) % hash_size;
+      slot = &hash_table[hash1];
+    }
+}
+
+/* Primary hash function for string keys. */
+unsigned int
+string_hash_1 (char const *key)
+{
+  unsigned int sum = 0;
+
+  key--;
+  while (*++key)
+    sum += (*key << (key[1] & 0xf));
+
+  return sum;
+}
+
+/* Secondary hash function for string keys.  The result must be
+   relatively prime to the table size, which is a power of two,
+   so any odd number will do.  */
+unsigned int
+string_hash_2 (char const *key)
+{
+  unsigned int sum = 0;
+
+  key--;
+  while (*++key)
+    sum += (*key << (key[1] & 0x7));
+
+  return sum | 1;
+}
+
+/* Double the size of the hash table in the event of overflow... */
+void 
+rehash (void)
+{
+  long old_hash_size = hash_size;
+  struct token **old_hash_table = hash_table;
+  struct token **htp;
+  struct token **slot;
+
+  hash_size *= 2;
+  if (verbose_flag)
+    printf ("Rehashing... (doubling size to %ld)\n", hash_size);
+  hash_rehashes++;
+  hash_capacity = hash_size - (hash_size >> 4);
+  hash_table =  CALLOC (struct token *, hash_size);
+
+  for (htp = old_hash_table; htp < &old_hash_table[old_hash_size]; htp++)
+    {
+      if (*htp == NULL)
+	continue;
+      slot = hash_lookup ((*htp)->tok_name);
+
+      if (*slot)
+	{
+	  fprintf (stderr, "%s: Duplicate hash entry!\n", (*slot)->tok_name);
+	  exit (1);
+	}
+      *slot = *htp;
+    }
+  free (old_hash_table);
+}
+
+/* Round a given number up to the nearest power of 2. */
+int 
+round2 (int rough)
+{
+  int round;
+
+  round = 1;
+  while (rough)
+    {
+      round <<= 1;
+      rough >>= 1;
+    }
+  return round;
+}
+
+/* Allocate a new token struct and fill in the name field.  We
+   allocate memory in large chunks to avoid frequent calls to malloc ()
+   which is a major pig.  */
+struct token *
+make_token (char const *name, int flags)
+{
+  struct token *token = (struct token *) malloc (sizeof (struct token) + strlen (name));
+
+  if (!token)
+    {
+      fprintf (stderr, "malloc failure! \n");
+      exit (1);
+    }
+  token->tok_count = 1;
+  token->tok_flags = flags;
+  memset (token->tok_hits, 0, sizeof (token->tok_hits));
+  strcpy (token->tok_name, name);
+
+  return token;
+}
+
+/* ///////////// summary stuff //////////////////////////////////////////// */
+
+void
+bump_current_hits_signature (void)
+{
+  unsigned char *hits = current_hits_signature;
+  while (*hits & 0x80)
+    *hits++ = 1;
+  *hits <<= 1;
+}
+
+void
+init_hits_signature (int index)
+{
+  unsigned char *hits = current_hits_signature;
+  unsigned char const *end = &current_hits_signature[MAX_LEVELS];
+  while (hits < end)
+    {
+      *hits = 1 << (index & 7);
+      index >>= 3;
+      hits++;
+    }
+}
+
+int
+bit_to_index (int bit)
+{
+  int i = 0;
+  while (bit >>= 1)
+    i++;
+  return i;
+}
+
+int
+compare_tokens (void const *x, void const *y)
+{
+  return strcmp ((*(struct token const *const *)x)->tok_name,
+		 (*(struct token const *const *)y)->tok_name);
+}
+
+void
+free_summary_tokens (void)
+{
+  struct summary *summary = summary_leaf;
+  while (summary != summary_root)
+    {
+      free (summary->sum_tokens);
+      summary = summary->sum_parent;
+    }
+}
+
+void
+summarize (void)
+{
+  unsigned char const *hits_sig = current_hits_signature;
+  struct summary *summary = summary_leaf;
+
+  do
+    {
+      long count = summary->sum_hits_count;
+      unsigned char *hits = MALLOC (unsigned char, count + 1);
+      int level = summary->sum_level;
+      struct token **tokens = summary->sum_tokens;
+      long init_size = INIT_TOKENS_SIZE (summary->sum_level);
+
+      if (verbose_flag)
+	{
+	  char const *fmt;
+	  if (count < init_size / 2)
+	    fmt = "level %d: %ld < %ld/2\n";
+	  else if (count > init_size * 2)
+	    fmt = "level %d: %ld > %ld*2\n";
+	  else if (count < init_size)
+	    fmt = "level %d: %ld < %ld\n";
+	  else if (count > init_size)
+	    fmt = "level %d: %ld > %ld\n";
+	  else
+	    fmt = "level %d: %ld == %ld\n";
+	  printf (fmt, summary->sum_level, count, init_size);
+	}
+
+      qsort (tokens, count, sizeof (struct token *), compare_tokens);
+      summary->sum_hits = hits;
+      while (count--)
+	{
+	  unsigned char *hit = &(*tokens++)->tok_hits[level];
+	  *hits++ = *hit;
+	  *hit = 0;
+	}
+      *hits++ = 0;
+      if (summary->sum_parent)
+	{
+	  free (summary->sum_tokens);
+	  summary->sum_tokens = 0;
+	}
+      summary = summary->sum_parent;
+    }
+  while (*++hits_sig & 0x80);
+  summary_leaf = make_sibling_summary (summary_leaf);
+}
+
+void
+init_summary (void)
+{
+  long size = INIT_TOKENS_SIZE (0);
+  summary_root = summary_leaf = CALLOC (struct summary, 1);
+  summary_root->sum_tokens_size = size;
+  summary_root->sum_tokens = MALLOC (struct token *, size);
+}
+
+struct summary *
+make_sibling_summary (struct summary *summary)
+{
+  struct summary *parent = summary->sum_parent;
+  long size;
+
+  if (parent == NULL)
+    {
+      levels++;
+      size = INIT_TOKENS_SIZE (levels);
+      summary_root = summary->sum_parent = parent = CALLOC (struct summary, 1);
+      parent->sum_level = levels;
+      parent->sum_kids[0] = summary;
+      parent->sum_hits_count = summary->sum_hits_count;
+      parent->sum_free_index = 1;
+      parent->sum_tokens_size = size;
+      parent->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size);
+      summary->sum_tokens = 0;
+    }
+  if (parent->sum_free_index == 8)
+    parent = make_sibling_summary (parent);
+  summary = CALLOC (struct summary, 1);
+  summary->sum_level = parent->sum_level - 1;
+  parent->sum_kids[parent->sum_free_index++] = summary;
+  summary->sum_parent = parent;
+  size = INIT_TOKENS_SIZE (summary->sum_level);
+  summary->sum_tokens_size = size;
+  summary->sum_tokens = MALLOC (struct token *, size);
+  return summary;
+}
+
+int
+count_vec_size (struct summary *summary, unsigned char const *tail_hits)
+{
+  struct summary **kids;
+  unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
+
+  kids = summary->sum_kids;
+  if (*kids == NULL)
+    {
+      static char bits_per_nybble[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 };
+      return bits_per_nybble[hits & 0xf] + bits_per_nybble[hits >> 4];
+    }
+  else
+    {
+      int bit;
+      int count = 0;
+      --tail_hits;
+      for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
+	if (bit & hits)
+	  count += count_vec_size (*kids, tail_hits);
+      return count;
+    }
+}
+
+int
+count_buf_size (struct summary *summary, unsigned char const *tail_hits)
+{
+  struct summary **kids;
+  unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
+
+  kids = summary->sum_kids;
+  if (*kids == NULL)
+    return 1;
+  else
+    {
+      int bit;
+      int count = 1;
+      --tail_hits;
+      for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
+	if (bit & hits)
+	  count += count_buf_size (*kids, tail_hits);
+      return count;
+    }
+}
+
+void
+assert_hits (struct summary* summary)
+{
+  struct summary **kids = summary->sum_kids;
+  struct summary **end = &kids[8];
+
+  if (summary == summary_root)
+    assert (summary->sum_hits == 0);
+  else
+    assert (summary->sum_hits && *summary->sum_hits == 0);
+  
+  if (end[-1] == 0)
+    while (*--end == 0)
+      ;
+  while (kids < end)
+    assert_hits (*kids++);
+}
+
+void
+write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits)
+{
+  struct summary **kids;
+  unsigned int hits = (summary->sum_hits ? *summary->sum_hits++ : *tail_hits);
+
+  assert (hits);
+  putc (hits, fp);
+
+  kids = summary->sum_kids;
+  if (*kids)
+    {
+      int bit;
+      --tail_hits;
+      for (bit = 1; (bit & 0xff) && *kids; bit <<= 1, ++kids)
+	if (bit & hits)
+	  write_hits (fp, *kids, tail_hits);
+    }
+}
+
+void
+sign_token (struct token *token)
+{
+  unsigned char *tok_hits = token->tok_hits;
+  unsigned char *hits_sig = current_hits_signature;
+  unsigned char *end = &current_hits_signature[MAX_LEVELS];
+  struct summary *summary = summary_leaf;
+
+  while (summary)
+    {
+      if (*tok_hits == 0)
+	add_token_to_summary (summary, token);
+      if (*tok_hits & *hits_sig)
+	break;
+      *tok_hits |= *hits_sig;
+      summary = summary->sum_parent;
+      tok_hits++;
+      hits_sig++;
+    }
+  while (hits_sig < end)
+    {
+      if (*tok_hits & *hits_sig)
+	break;
+      *tok_hits |= *hits_sig;
+      tok_hits++;
+      hits_sig++;
+    }
+}
+
+void
+add_token_to_summary (struct summary *summary, struct token *token)
+{
+  long size = summary->sum_tokens_size;
+
+  if (summary->sum_hits_count >= size)
+    {
+      size *= 2;
+      summary->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size);
+      summary->sum_tokens_size = size;
+    }
+  summary->sum_tokens[summary->sum_hits_count++] = token;
+}
+
+int
+bitsany (char const *s, int n)
+{
+  while (n--)
+    if (*s++)
+      return 1;
+
+  return 0;
+}
+
+char *
+bitsset (char *s1, char const *s2, int n)
+{
+  while (n--)
+    *s1++ |= *s2++;
+
+  return s1;
+}
+
+char *
+bitsclr (char *s1, char const *s2, int n)
+{
+  while (n--)
+    *s1++ &= ~*s2++;
+
+  return s1;
+}
+
+#if 0
+
+char *
+bitsand (char *s1, char const *s2, int n)
+{
+  while (n--)
+    *s1++ &= *s2++;
+
+  return s1;
+}
+
+char *
+bitsxor (char *s1, char const *s2, int n)
+{
+  while (n--)
+    *s1++ ^= *s2++;
+
+  return s1;
+}
+
+int
+bitstst (char const *s1, char const *s2, int n)
+{
+  while (n--)
+    if (*s1++ & *s2++)
+      return 1;
+
+  return 0;
+}
+
+#endif
author	Greg McGary <greg@mcgary.org>	1997-04-18 06:32:28 +0000
committer	Greg McGary <greg@mcgary.org>	1997-04-18 06:32:28 +0000
commit	a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb (patch)
tree	71fa757f5a4f098864af027e41f3189b3613390f /mkid.c
parent	017635dde1deb6e6fb5b6ab83f0c73727f9eb49b (diff)
download	idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.tar.gz idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.tar.bz2 idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.zip