summaryrefslogtreecommitdiffstats
path: root/mkid.c
diff options
context:
space:
mode:
authorGreg McGary <greg@mcgary.org>1997-04-18 06:32:28 +0000
committerGreg McGary <greg@mcgary.org>1997-04-18 06:32:28 +0000
commita0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb (patch)
tree71fa757f5a4f098864af027e41f3189b3613390f /mkid.c
parent017635dde1deb6e6fb5b6ab83f0c73727f9eb49b (diff)
downloadidutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.tar.gz
idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.tar.bz2
idutils-a0f7cb2db4c51a4a0fed37fde7f1992aec0cc9bb.zip
Initial revision
Diffstat (limited to 'mkid.c')
-rw-r--r--mkid.c1091
1 files changed, 1091 insertions, 0 deletions
diff --git a/mkid.c b/mkid.c
new file mode 100644
index 0000000..39da994
--- /dev/null
+++ b/mkid.c
@@ -0,0 +1,1091 @@
+/* mkid.c -- build an identifer database
+ Copyright (C) 1986, 1995 Greg McGary
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include "config.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <limits.h>
+#include <assert.h>
+#include <stdio.h>
+#define fileno(FP) ((FP)->_fileno)
+#include <string.h>
+#include "strxtra.h"
+#include <ctype.h>
+#include <errno.h>
+#include "alloc.h"
+#include "idfile.h"
+#include "idarg.h"
+#include "token.h"
+#include "bitops.h"
+#include "misc.h"
+#include "filenames.h"
+#include "scanners.h"
+
+struct summary
+{
+ struct token **sum_tokens;
+ unsigned char const *sum_hits;
+ struct summary *sum_parent;
+ struct summary *sum_kids[8];
+ long sum_tokens_size;
+ long sum_hits_count;
+ int sum_free_index;
+ int sum_level;
+};
+
+#define MAX_LEVELS 5 /* log_8 of the max # of files log_8 32768 == 5 */
+
+struct token
+{
+ unsigned short tok_count;
+ unsigned char tok_flags;
+ unsigned char tok_hits[MAX_LEVELS];
+ char tok_name[1];
+};
+
+struct token **hash_table; /* Vector of token pointers */
+
+char *bitsset (char *s1, char const *s2, int n);
+char *bitsclr (char *s1, char const *s2, int n);
+char *bitsand (char *s1, char const *s2, int n);
+char *bitsxor (char *s1, char const *s2, int n);
+int bitstst (char const *s1, char const *s2, int n);
+int bitsany (char const *s, int n);
+int round2(int rough);
+struct token *make_token (char const *name, int);
+void scan_1_file (char const *(*get_token) (FILE*, int*), FILE *source_FILE);
+struct idarg *parse_idargs (int argc, char **argv);
+struct idarg *parse_idargs_from_FILE (FILE *arg_FILE, struct idarg *idarg);
+void init_hash_table (int file_name_count);
+void scan_files (struct idarg *idarg);
+void report_statistics (void);
+
+void init_hash (long size);
+struct token **hash_lookup (char const *key);
+unsigned int string_hash_1 (char const *key);
+unsigned int string_hash_2 (char const *key);
+void rehash (void);
+
+void write_idfile (char const *id_file, struct idarg *idargs);
+void bump_current_hits_signature (void);
+void init_hits_signature (int index);
+int bit_to_index (int bit);
+int compare_tokens (void const *x, void const *y);
+void free_summary_tokens (void);
+void summarize (void);
+void assert_hits (struct summary *summary);
+void write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits);
+void sign_token (struct token *token);
+void add_token_to_summary (struct summary *summary, struct token *token);
+void init_summary (void);
+struct summary *make_sibling_summary (struct summary *summary);
+int count_vec_size (struct summary *summary, unsigned char const *tail_hits);
+int count_buf_size (struct summary *summary, unsigned char const *tail_hits);
+void usage (void);
+
+/* Miscellaneous statistics */
+long name_tokens;
+long number_tokens;
+long string_tokens;
+long literal_tokens;
+long comment_tokens;
+long occurrences;
+long heap_size;
+long hits_length = 0;
+long tokens_length = 0;
+long output_length = 0;
+
+/* Hash table maintenance */
+long hash_size; /* # of slots */
+long hash_capacity; /* # of usable slots */
+long hash_fill; /* # of keys inserted in table */
+long hash_probes = 0;
+long hash_lookups = 0;
+int hash_rehashes = 0;
+
+int verbose_flag = 0;
+int statistics_flag = 1;
+
+int args_count = 0; /* # of args to save */
+int scan_count = 0; /* # of files to scan */
+int file_name_count = 0; /* # of files in database */
+int levels = 0; /* ceil(log(8)) of file_name_count */
+
+unsigned char current_hits_signature[MAX_LEVELS];
+#define INIT_TOKENS_SIZE(level) (1 << ((level) + 13))
+struct summary *summary_root;
+struct summary *summary_leaf;
+
+char PWD_name[BUFSIZ]; /* The current working directory */
+char absolute_idfile_name[BUFSIZ]; /* The absolute name of the database */
+char const *id_file = IDFILE;
+
+char const *program_name;
+
+void
+usage (void)
+{
+ fprintf (stderr, "\
+Usage: %s [-v] [-f<idfile>] [(+|-)l[<lang>]] [(+|-)S<scanarg>] [-a<argfile>] [-] [-u] [files...]\n\
+ -v Verbose: print reports of progress\n\
+ -a<file> Open file for arguments\n\
+ - Read newline-separated args from stdin\n\
+ -l<lang> Force files to be scanned as <lang> until +l<lang>\n\
+ -S<lang>-<arg> Pass arg to <lang> scanner\n\
+ -S.<suffix>=<lang> Scan files with .<suffix> as <lang>\n\
+ -S<lang>? Print usage documentation for <lang>\n\
+ -u Update an existing database (unimplemented)\n\
+\n\
+Version %s; Made %s %s\n",
+ program_name, FULL_VERSION, __DATE__, __TIME__);
+
+ exit (1);
+}
+
+int
+main (int argc, char **argv)
+{
+ struct idarg *idarg_0;
+ char const *sbrk0;
+
+ program_name = basename (GETARG (argc, argv));
+ if (kshgetwd (PWD_name) == NULL)
+ {
+ fprintf (stderr, "%s: cannot get current working directory name.\n", program_name);
+ return 1;
+ }
+ strcat (PWD_name, "/");
+
+ idarg_0 = parse_idargs (argc, argv);
+ if (idarg_0 == NULL)
+ {
+ fprintf (stderr, "Nothing to do...\n");
+ return 0;
+ }
+
+ sbrk0 = (char const *)sbrk (0);
+ init_hash (scan_count * 64);
+
+ strcpy (absolute_idfile_name, span_file_name (PWD_name, id_file));
+ if (access (id_file, 06) < 0
+ && (errno != ENOENT || access (dirname (id_file), 06) < 0))
+ {
+ filerr ("modify", id_file);
+ return 1;
+ }
+
+ init_hits_signature (0);
+ init_summary ();
+ init_scanners ();
+
+ scan_files (idarg_0);
+
+ if (hash_fill == 0)
+ return 0;
+
+ free_summary_tokens ();
+ free (hash_table);
+
+ write_idfile (id_file, idarg_0);
+ heap_size = (char const *)sbrk (0) - sbrk0;
+
+ if (statistics_flag)
+ report_statistics ();
+ return 0;
+}
+
+void
+scan_files (struct idarg *idarg)
+{
+ int keep_lang = 0;
+
+ for ( ; idarg->ida_next; idarg = idarg->ida_next)
+ {
+ char const *(*scanner) (FILE*, int*);
+ FILE *source_FILE;
+ char *arg = idarg->ida_arg;
+ char const *lang_name = NULL;
+ char const *suff;
+ char const *filter;
+
+ if (idarg->ida_index < 0)
+ {
+ int op = *arg++;
+ switch (*arg++)
+ {
+ case 'l':
+ if (*arg == '\0')
+ {
+ keep_lang = 0;
+ lang_name = NULL;
+ break;
+ }
+ if (op == '+')
+ keep_lang = 1;
+ lang_name = arg;
+ break;
+ case 'S':
+ set_scan_args (op, strdup (arg));
+ break;
+ default:
+ usage ();
+ }
+ continue;
+ }
+ if (!(idarg->ida_flags & IDA_SCAN_ME))
+ goto skip;
+
+ suff = strrchr (arg, '.');
+ if (lang_name == NULL)
+ {
+ if (suff == NULL)
+ suff = "";
+ lang_name = get_lang_name (suff);
+ if (lang_name == NULL)
+ lang_name = get_lang_name ("");
+ if (lang_name == NULL)
+ {
+ fprintf (stderr, "%s: No language assigned to suffix: `%s'\n", program_name, suff);
+ goto skip;
+ }
+ }
+ scanner = get_scanner (lang_name);
+ if (scanner == NULL)
+ {
+ fprintf (stderr, "%s: No scanner for language: `%s'\n", program_name, lang_name);
+ goto skip;
+ }
+ filter = get_filter (suff);
+ source_FILE = open_source_FILE (arg, filter);
+ if (source_FILE == NULL)
+ goto skip;
+ if (verbose_flag)
+ {
+ if (filter)
+ {
+ printf ("%s: ", lang_name);
+ printf (filter, arg);
+ putchar ('\n');
+ }
+ else
+ printf ("%s: %s\n", lang_name, arg);
+ }
+ scan_1_file (scanner, source_FILE);
+ close_source_FILE (source_FILE, filter);
+ skip:
+ if (!keep_lang)
+ lang_name = NULL;
+ if (idarg->ida_index < file_name_count)
+ {
+ if (current_hits_signature[0] & 0x80)
+ summarize ();
+ bump_current_hits_signature ();
+ }
+ }
+}
+
+void
+report_statistics (void)
+{
+ printf ("Name=%ld, ", name_tokens);
+ printf ("Number=%ld, ", number_tokens);
+ printf ("String=%ld, ", string_tokens);
+ printf ("Literal=%ld, ", literal_tokens);
+ printf ("Comment=%ld\n", comment_tokens);
+
+ printf ("Files=%d, ", scan_count);
+ printf ("Tokens=%ld, ", occurrences);
+ printf ("Bytes=%ld Kb, ", input_chars / 1024);
+ printf ("Heap=%ld Kb, ", heap_size / 1024);
+ printf ("Output=%ld (%ld tok, %ld hit)\n", output_length, tokens_length, hits_length);
+
+ printf ("Load=%ld/%ld=%.2f, ", hash_fill, hash_size,
+ (double) hash_fill / (double) hash_size);
+ printf ("Rehash=%d, ", hash_rehashes);
+ printf ("Probes=%ld/%ld=%.2f, ", hash_probes, hash_lookups,
+ (double) hash_probes / (double) hash_lookups);
+ printf ("Freq=%ld/%ld=%.2f\n", occurrences, hash_fill,
+ (double) occurrences / (double) hash_fill);
+}
+
+struct idarg *
+parse_idargs (int argc, char **argv)
+{
+ struct idarg *idarg;
+ struct idarg *idarg_0;
+ char *arg;
+ int op;
+ FILE *arg_FILE = NULL;
+ int args_from = 0;
+ enum {
+ AF_CMDLINE = 0x1, /* file args came on command line */
+ AF_FILE = 0x2, /* file args came from a file (-f<file>) */
+ AF_IDFILE = 0x4, /* file args came from an old ID file (-u) */
+ AF_QUERY = 0x8
+ }; /* no file args necessary: usage query */
+
+ idarg = idarg_0 = CALLOC(struct idarg, 1);
+
+ /* Process some arguments, and snarf-up some others for processing
+ later. */
+ while (argc)
+ {
+ arg = GETARG (argc, argv);
+ if (*arg != '-' && *arg != '+')
+ {
+ /* arguments are from command line (not pipe) */
+ args_from |= AF_CMDLINE;
+ idarg->ida_arg = arg;
+ idarg->ida_flags = IDA_SCAN_ME;
+ idarg->ida_index = file_name_count++;
+ scan_count++;
+ idarg = (idarg->ida_next = CALLOC(struct idarg, 1));
+
+ continue;
+ }
+ op = *arg++;
+ switch (*arg++)
+ {
+ case 'u':
+#if 0
+ args_from |= AF_IDFILE;
+ old_idarg (id_file, &idarg);
+ break;
+#endif
+ case '\0':
+ args_from |= AF_FILE;
+ idarg = parse_idargs_from_FILE (stdin, idarg);
+ break;
+ case 'a':
+ arg_FILE = fopen (arg, "r");
+ if (arg_FILE == NULL)
+ filerr ("open", arg);
+ else
+ {
+ args_from |= AF_FILE;
+ idarg = parse_idargs_from_FILE (arg_FILE, idarg);
+ }
+ break;
+ case 'f':
+ id_file = arg;
+ break;
+ case 'v':
+ verbose_flag = 1;
+ break;
+ case 'S':
+ if (strchr (&arg[-2], '?'))
+ {
+ set_scan_args (op, arg);
+ args_from |= AF_QUERY;
+ }
+ /*FALLTHROUGH */
+ case 'l':
+ case 's':
+ case 'r':
+ idarg->ida_arg = &arg[-2];
+ idarg->ida_index = -1;
+ idarg = (idarg->ida_next = CALLOC(struct idarg, 1));
+
+ args_count++;
+ break;
+ default:
+ usage ();
+ }
+ }
+
+ if (args_from & AF_QUERY)
+ exit (0);
+ /* File args should only come from one place. Ding the user if
+ arguments came from multiple places, or if none were supplied at
+ all. */
+ switch (args_from)
+ {
+ case AF_CMDLINE:
+ case AF_FILE:
+ case AF_IDFILE:
+ if (file_name_count > 0)
+ break;
+ /*FALLTHROUGH */
+ case 0:
+ fprintf (stderr, "%s: Use -u, -f<file>, or cmd-line for file args!\n", program_name);
+ usage ();
+ default:
+ fprintf (stderr, "%s: Use only one of: -u, -f<file>, or cmd-line for file args!\n", program_name);
+ usage ();
+ }
+
+ if (scan_count == 0)
+ return NULL;
+
+ return idarg_0;
+}
+
+
+/* Cons up a list of idarg as supplied in a file. */
+struct idarg *
+parse_idargs_from_FILE (FILE *arg_FILE, struct idarg *idarg)
+{
+ int file_count;
+ char buf[BUFSIZ];
+ char *arg;
+
+ file_count = 0;
+ while (fgets (buf, sizeof (buf), arg_FILE))
+ {
+ idarg->ida_arg = arg = strndup (buf, strlen (buf) - 1);
+ if (*arg == '+' || *arg == '-')
+ idarg->ida_index = -1;
+ else
+ {
+ idarg->ida_flags = IDA_SCAN_ME;
+ idarg->ida_index = file_name_count++;
+ scan_count++;
+ }
+ idarg = idarg->ida_next = CALLOC (struct idarg, 1);
+ }
+ return idarg;
+}
+
+void
+scan_1_file (char const *(*get_token) (FILE*, int*), FILE *source_FILE)
+{
+ struct token **slot;
+ char const *key;
+ int flags;
+
+ while ((key = (*get_token) (source_FILE, &flags)) != NULL)
+ {
+ struct token *token = *(slot = hash_lookup (key));
+
+ if (token)
+ {
+ token->tok_flags |= flags;
+ if (token->tok_count < USHRT_MAX)
+ token->tok_count++;
+ if (!(token->tok_hits[0] & current_hits_signature[0]))
+ sign_token (token);
+ } else {
+ *slot = token = make_token (key, flags);
+ sign_token (token);
+ if (hash_fill++ >= hash_capacity)
+ rehash ();
+ }
+ }
+}
+
+/* As the database is written, may need to adjust the file names. If
+ we are generating the ID file in a remote directory, then adjust
+ the file names to be relative to the location of the ID database.
+
+ (This would be a common useage if you want to make a database for a
+ directory which you have no write access to, so you cannot create
+ the ID file.) */
+void
+write_idfile (char const *id_file, struct idarg *idarg)
+{
+ struct token **tokens;
+ int i;
+ FILE *id_FILE;
+ int lasti;
+ struct idhead idh;
+ int fixup_names;
+ char *lsl;
+ int buf_size;
+ int vec_size;
+ int tok_size;
+ int max_buf_size = 0;
+ int max_vec_size = 0;
+
+ if (verbose_flag)
+ printf ("Writing `%s'...\n", id_file);
+ lsl = strrchr (relative_file_name (PWD_name, absolute_idfile_name), '/');
+ if (lsl == NULL)
+ {
+ /* The database is in the cwd, don't adjust the names */
+ fixup_names = 0;
+ }
+ else
+ {
+ /* The database is not in cwd, adjust names so they are relative
+ to the location of the database, make absolute_idfile_name just be the
+ directory path to ID. */
+ fixup_names = 1;
+ *(lsl + 1) = '\0';
+ }
+ id_FILE = fopen (id_file, "w+b");
+ if (id_FILE == NULL)
+ {
+ filerr ("create", id_file);
+ exit (1);
+ }
+ strncpy (idh.idh_magic, IDH_MAGIC, sizeof (idh.idh_magic));
+ idh.idh_version = IDH_VERSION;
+ idh.idh_pad_1 = 0;
+ idh.idh_flags = IDH_COUNTS;
+
+ /* write out the list of pathnames */
+ fseek (id_FILE, sizeof_idhead (), 0);
+ idh.idh_args_offset = ftell (id_FILE);
+ for (i = lasti = 0; idarg->ida_next; idarg = idarg->ida_next)
+ {
+ if (idarg->ida_index > 0)
+ while (++lasti < idarg->ida_index)
+ {
+ i++;
+ putc ('\0', id_FILE);
+ }
+ if (fixup_names)
+ fputs (relative_file_name (absolute_idfile_name, span_file_name (PWD_name, idarg->ida_arg)), id_FILE);
+ else
+ fputs (idarg->ida_arg, id_FILE);
+ i++;
+ putc ('\0', id_FILE);
+ }
+ idh.idh_args = i;
+ idh.idh_paths = file_name_count;
+
+ /* write out the list of identifiers */
+
+ putc ('\0', id_FILE);
+ putc ('\0', id_FILE);
+ idh.idh_tokens_offset = ftell(id_FILE);
+
+ assert (summary_root->sum_hits_count == hash_fill);
+ tokens = REALLOC (summary_root->sum_tokens, struct token *, hash_fill);
+ qsort (tokens, hash_fill, sizeof (struct token *), compare_tokens);
+ for (i = 0; i < hash_fill; i++, tokens++)
+ {
+ struct token *token = *tokens;
+ if (*token->tok_name == '\0')
+ {
+ fprintf (stderr, "Blank token!\n");
+ hash_fill--;
+ i--;
+ continue;
+ }
+ occurrences += token->tok_count;
+ if (token->tok_flags & TOK_NUMBER)
+ number_tokens++;
+ if (token->tok_flags & TOK_NAME)
+ name_tokens++;
+ if (token->tok_flags & TOK_STRING)
+ string_tokens++;
+ if (token->tok_flags & TOK_LITERAL)
+ literal_tokens++;
+ if (token->tok_flags & TOK_COMMENT)
+ comment_tokens++;
+
+ fputs (token->tok_name, id_FILE);
+ putc ('\0', id_FILE);
+ if (token->tok_count > 0xff)
+ token->tok_flags |= TOK_SHORT_COUNT;
+ putc (token->tok_flags, id_FILE);
+ putc (token->tok_count & 0xff, id_FILE);
+ if (token->tok_flags & TOK_SHORT_COUNT)
+ putc (token->tok_count >> 8, id_FILE);
+
+ vec_size = count_vec_size (summary_root, token->tok_hits + levels);
+ buf_size = count_buf_size (summary_root, token->tok_hits + levels);
+ hits_length += buf_size;
+ tok_size = strlen (token->tok_name) + 1;
+ tokens_length += tok_size;
+ buf_size += tok_size + sizeof (token->tok_flags) + sizeof (token->tok_count) + 2;
+ if (buf_size > max_buf_size)
+ max_buf_size = buf_size;
+ if (vec_size > max_vec_size)
+ max_vec_size = vec_size;
+
+ write_hits (id_FILE, summary_root, token->tok_hits + levels);
+ putc ('\0', id_FILE);
+ putc ('\0', id_FILE);
+ }
+ assert_hits (summary_root);
+ idh.idh_tokens = hash_fill;
+ output_length = ftell (id_FILE);
+ idh.idh_end_offset = output_length - 2;
+ idh.idh_buf_size = max_buf_size;
+ idh.idh_vec_size = max_vec_size;
+
+ write_idhead (id_FILE, &idh);
+ fclose (id_FILE);
+}
+
+void
+init_hash (long size)
+{
+ hash_size = round2 (size);
+ if (hash_size < 512)
+ hash_size = 512;
+#if 0
+ if (hash_size > 0x8000)
+ hash_size = 0x8000;
+#endif
+ if (hash_size > (128 * 1024))
+ hash_size /= 2;
+ hash_capacity = hash_size - (hash_size >> 4); /* about 94% */
+ hash_table = CALLOC (struct token *, hash_size);
+ assert (hash_table);
+}
+
+/* Use double hashing with open addressing.
+ The table size is always a power of two. */
+struct token **
+hash_lookup (char const *key)
+{
+ unsigned int hash1 = string_hash_1 (key) % hash_size;
+ unsigned int hash2 = 0;
+ struct token **slot = &hash_table[hash1];
+
+ hash_lookups++;
+ for (;;)
+ {
+ hash_probes++;
+ if (*slot == NULL || strcmp (key, (*slot)->tok_name) == 0)
+ return slot;
+
+ if (!hash2)
+ hash2 = string_hash_2 (key);
+ hash1 = (hash1 + hash2) % hash_size;
+ slot = &hash_table[hash1];
+ }
+}
+
+/* Primary hash function for string keys. */
+unsigned int
+string_hash_1 (char const *key)
+{
+ unsigned int sum = 0;
+
+ key--;
+ while (*++key)
+ sum += (*key << (key[1] & 0xf));
+
+ return sum;
+}
+
+/* Secondary hash function for string keys. The result must be
+ relatively prime to the table size, which is a power of two,
+ so any odd number will do. */
+unsigned int
+string_hash_2 (char const *key)
+{
+ unsigned int sum = 0;
+
+ key--;
+ while (*++key)
+ sum += (*key << (key[1] & 0x7));
+
+ return sum | 1;
+}
+
+/* Double the size of the hash table in the event of overflow... */
+void
+rehash (void)
+{
+ long old_hash_size = hash_size;
+ struct token **old_hash_table = hash_table;
+ struct token **htp;
+ struct token **slot;
+
+ hash_size *= 2;
+ if (verbose_flag)
+ printf ("Rehashing... (doubling size to %ld)\n", hash_size);
+ hash_rehashes++;
+ hash_capacity = hash_size - (hash_size >> 4);
+ hash_table = CALLOC (struct token *, hash_size);
+
+ for (htp = old_hash_table; htp < &old_hash_table[old_hash_size]; htp++)
+ {
+ if (*htp == NULL)
+ continue;
+ slot = hash_lookup ((*htp)->tok_name);
+
+ if (*slot)
+ {
+ fprintf (stderr, "%s: Duplicate hash entry!\n", (*slot)->tok_name);
+ exit (1);
+ }
+ *slot = *htp;
+ }
+ free (old_hash_table);
+}
+
+/* Round a given number up to the nearest power of 2. */
+int
+round2 (int rough)
+{
+ int round;
+
+ round = 1;
+ while (rough)
+ {
+ round <<= 1;
+ rough >>= 1;
+ }
+ return round;
+}
+
+/* Allocate a new token struct and fill in the name field. We
+ allocate memory in large chunks to avoid frequent calls to malloc ()
+ which is a major pig. */
+struct token *
+make_token (char const *name, int flags)
+{
+ struct token *token = (struct token *) malloc (sizeof (struct token) + strlen (name));
+
+ if (!token)
+ {
+ fprintf (stderr, "malloc failure! \n");
+ exit (1);
+ }
+ token->tok_count = 1;
+ token->tok_flags = flags;
+ memset (token->tok_hits, 0, sizeof (token->tok_hits));
+ strcpy (token->tok_name, name);
+
+ return token;
+}
+
+/* ///////////// summary stuff //////////////////////////////////////////// */
+
+void
+bump_current_hits_signature (void)
+{
+ unsigned char *hits = current_hits_signature;
+ while (*hits & 0x80)
+ *hits++ = 1;
+ *hits <<= 1;
+}
+
+void
+init_hits_signature (int index)
+{
+ unsigned char *hits = current_hits_signature;
+ unsigned char const *end = &current_hits_signature[MAX_LEVELS];
+ while (hits < end)
+ {
+ *hits = 1 << (index & 7);
+ index >>= 3;
+ hits++;
+ }
+}
+
+int
+bit_to_index (int bit)
+{
+ int i = 0;
+ while (bit >>= 1)
+ i++;
+ return i;
+}
+
+int
+compare_tokens (void const *x, void const *y)
+{
+ return strcmp ((*(struct token const *const *)x)->tok_name,
+ (*(struct token const *const *)y)->tok_name);
+}
+
+void
+free_summary_tokens (void)
+{
+ struct summary *summary = summary_leaf;
+ while (summary != summary_root)
+ {
+ free (summary->sum_tokens);
+ summary = summary->sum_parent;
+ }
+}
+
+void
+summarize (void)
+{
+ unsigned char const *hits_sig = current_hits_signature;
+ struct summary *summary = summary_leaf;
+
+ do
+ {
+ long count = summary->sum_hits_count;
+ unsigned char *hits = MALLOC (unsigned char, count + 1);
+ int level = summary->sum_level;
+ struct token **tokens = summary->sum_tokens;
+ long init_size = INIT_TOKENS_SIZE (summary->sum_level);
+
+ if (verbose_flag)
+ {
+ char const *fmt;
+ if (count < init_size / 2)
+ fmt = "level %d: %ld < %ld/2\n";
+ else if (count > init_size * 2)
+ fmt = "level %d: %ld > %ld*2\n";
+ else if (count < init_size)
+ fmt = "level %d: %ld < %ld\n";
+ else if (count > init_size)
+ fmt = "level %d: %ld > %ld\n";
+ else
+ fmt = "level %d: %ld == %ld\n";
+ printf (fmt, summary->sum_level, count, init_size);
+ }
+
+ qsort (tokens, count, sizeof (struct token *), compare_tokens);
+ summary->sum_hits = hits;
+ while (count--)
+ {
+ unsigned char *hit = &(*tokens++)->tok_hits[level];
+ *hits++ = *hit;
+ *hit = 0;
+ }
+ *hits++ = 0;
+ if (summary->sum_parent)
+ {
+ free (summary->sum_tokens);
+ summary->sum_tokens = 0;
+ }
+ summary = summary->sum_parent;
+ }
+ while (*++hits_sig & 0x80);
+ summary_leaf = make_sibling_summary (summary_leaf);
+}
+
+void
+init_summary (void)
+{
+ long size = INIT_TOKENS_SIZE (0);
+ summary_root = summary_leaf = CALLOC (struct summary, 1);
+ summary_root->sum_tokens_size = size;
+ summary_root->sum_tokens = MALLOC (struct token *, size);
+}
+
+struct summary *
+make_sibling_summary (struct summary *summary)
+{
+ struct summary *parent = summary->sum_parent;
+ long size;
+
+ if (parent == NULL)
+ {
+ levels++;
+ size = INIT_TOKENS_SIZE (levels);
+ summary_root = summary->sum_parent = parent = CALLOC (struct summary, 1);
+ parent->sum_level = levels;
+ parent->sum_kids[0] = summary;
+ parent->sum_hits_count = summary->sum_hits_count;
+ parent->sum_free_index = 1;
+ parent->sum_tokens_size = size;
+ parent->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size);
+ summary->sum_tokens = 0;
+ }
+ if (parent->sum_free_index == 8)
+ parent = make_sibling_summary (parent);
+ summary = CALLOC (struct summary, 1);
+ summary->sum_level = parent->sum_level - 1;
+ parent->sum_kids[parent->sum_free_index++] = summary;
+ summary->sum_parent = parent;
+ size = INIT_TOKENS_SIZE (summary->sum_level);
+ summary->sum_tokens_size = size;
+ summary->sum_tokens = MALLOC (struct token *, size);
+ return summary;
+}
+
+int
+count_vec_size (struct summary *summary, unsigned char const *tail_hits)
+{
+ struct summary **kids;
+ unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
+
+ kids = summary->sum_kids;
+ if (*kids == NULL)
+ {
+ static char bits_per_nybble[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 };
+ return bits_per_nybble[hits & 0xf] + bits_per_nybble[hits >> 4];
+ }
+ else
+ {
+ int bit;
+ int count = 0;
+ --tail_hits;
+ for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
+ if (bit & hits)
+ count += count_vec_size (*kids, tail_hits);
+ return count;
+ }
+}
+
+int
+count_buf_size (struct summary *summary, unsigned char const *tail_hits)
+{
+ struct summary **kids;
+ unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits);
+
+ kids = summary->sum_kids;
+ if (*kids == NULL)
+ return 1;
+ else
+ {
+ int bit;
+ int count = 1;
+ --tail_hits;
+ for (bit = 1; bit & 0xff; bit <<= 1, ++kids)
+ if (bit & hits)
+ count += count_buf_size (*kids, tail_hits);
+ return count;
+ }
+}
+
+void
+assert_hits (struct summary* summary)
+{
+ struct summary **kids = summary->sum_kids;
+ struct summary **end = &kids[8];
+
+ if (summary == summary_root)
+ assert (summary->sum_hits == 0);
+ else
+ assert (summary->sum_hits && *summary->sum_hits == 0);
+
+ if (end[-1] == 0)
+ while (*--end == 0)
+ ;
+ while (kids < end)
+ assert_hits (*kids++);
+}
+
+void
+write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits)
+{
+ struct summary **kids;
+ unsigned int hits = (summary->sum_hits ? *summary->sum_hits++ : *tail_hits);
+
+ assert (hits);
+ putc (hits, fp);
+
+ kids = summary->sum_kids;
+ if (*kids)
+ {
+ int bit;
+ --tail_hits;
+ for (bit = 1; (bit & 0xff) && *kids; bit <<= 1, ++kids)
+ if (bit & hits)
+ write_hits (fp, *kids, tail_hits);
+ }
+}
+
+void
+sign_token (struct token *token)
+{
+ unsigned char *tok_hits = token->tok_hits;
+ unsigned char *hits_sig = current_hits_signature;
+ unsigned char *end = &current_hits_signature[MAX_LEVELS];
+ struct summary *summary = summary_leaf;
+
+ while (summary)
+ {
+ if (*tok_hits == 0)
+ add_token_to_summary (summary, token);
+ if (*tok_hits & *hits_sig)
+ break;
+ *tok_hits |= *hits_sig;
+ summary = summary->sum_parent;
+ tok_hits++;
+ hits_sig++;
+ }
+ while (hits_sig < end)
+ {
+ if (*tok_hits & *hits_sig)
+ break;
+ *tok_hits |= *hits_sig;
+ tok_hits++;
+ hits_sig++;
+ }
+}
+
+void
+add_token_to_summary (struct summary *summary, struct token *token)
+{
+ long size = summary->sum_tokens_size;
+
+ if (summary->sum_hits_count >= size)
+ {
+ size *= 2;
+ summary->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size);
+ summary->sum_tokens_size = size;
+ }
+ summary->sum_tokens[summary->sum_hits_count++] = token;
+}
+
+int
+bitsany (char const *s, int n)
+{
+ while (n--)
+ if (*s++)
+ return 1;
+
+ return 0;
+}
+
+char *
+bitsset (char *s1, char const *s2, int n)
+{
+ while (n--)
+ *s1++ |= *s2++;
+
+ return s1;
+}
+
+char *
+bitsclr (char *s1, char const *s2, int n)
+{
+ while (n--)
+ *s1++ &= ~*s2++;
+
+ return s1;
+}
+
+#if 0
+
+char *
+bitsand (char *s1, char const *s2, int n)
+{
+ while (n--)
+ *s1++ &= *s2++;
+
+ return s1;
+}
+
+char *
+bitsxor (char *s1, char const *s2, int n)
+{
+ while (n--)
+ *s1++ ^= *s2++;
+
+ return s1;
+}
+
+int
+bitstst (char const *s1, char const *s2, int n)
+{
+ while (n--)
+ if (*s1++ & *s2++)
+ return 1;
+
+ return 0;
+}
+
+#endif