summaryrefslogtreecommitdiffstats
path: root/mkid.c
diff options
context:
space:
mode:
Diffstat (limited to 'mkid.c')
-rw-r--r--mkid.c256
1 files changed, 70 insertions, 186 deletions
diff --git a/mkid.c b/mkid.c
index e3f7e70..daee166 100644
--- a/mkid.c
+++ b/mkid.c
@@ -19,6 +19,7 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <stdlib.h>
+#include <stddef.h>
#include <unistd.h>
#include <limits.h>
#include <assert.h>
@@ -31,13 +32,17 @@
#include "strxtra.h"
#include "alloc.h"
#include "idfile.h"
-#include "idarg.h"
#include "token.h"
#include "bitops.h"
#include "misc.h"
#include "filenames.h"
+#include "hash.h"
#include "scanners.h"
+#ifndef offsetof
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
struct summary
{
struct token **sum_tokens;
@@ -55,7 +60,7 @@ struct summary
int sum_level;
};
-#define MAX_LEVELS 5 /* log_8 of the max # of files log_8 32768 == 5 */
+#define MAX_LEVELS 5 /* log_8 of the max # of files: log_8(32768) == 5 */
struct token
{
@@ -65,34 +70,28 @@ struct token
char tok_name[1];
};
-struct token **hash_table; /* Vector of token pointers */
-
char *bitsset __P((char *s1, char const *s2, int n));
char *bitsclr __P((char *s1, char const *s2, int n));
char *bitsand __P((char *s1, char const *s2, int n));
char *bitsxor __P((char *s1, char const *s2, int n));
int bitstst __P((char const *s1, char const *s2, int n));
int bitsany __P((char const *s, int n));
-int round2 __P((int rough));
struct token *make_token __P((char const *name, int));
void scan_1_file __P((char const *(*get_token) (FILE*, int*), FILE *source_FILE));
struct idarg *parse_idargs __P((int argc, char **argv));
struct idarg *parse_idargs_from_FILE __P((FILE *arg_FILE, struct idarg *idarg));
-void init_hash_table __P((int file_name_count));
void scan_files __P((struct idarg *idarg));
void report_statistics __P((void));
-void init_hash __P((long size));
-struct token **hash_lookup __P((char const *key));
-unsigned int string_hash_1 __P((char const *key));
-unsigned int string_hash_2 __P((char const *key));
-void rehash __P((void));
+unsigned long token_hash_1 __P((void const *key));
+unsigned long token_hash_2 __P((void const *key));
+int token_hash_cmp __P((void const *x, void const *y));
void write_idfile __P((char const *id_file, struct idarg *idargs));
void bump_current_hits_signature __P((void));
void init_hits_signature __P((int i));
int bit_to_index __P((int bit));
-int compare_tokens __P((void const *x, void const *y));
+int token_qsort_cmp __P((void const *x, void const *y));
void free_summary_tokens __P((void));
void summarize __P((void));
void assert_hits __P((struct summary *summary));
@@ -105,6 +104,9 @@ int count_vec_size __P((struct summary *summary, unsigned char const *tail_hits)
int count_buf_size __P((struct summary *summary, unsigned char const *tail_hits));
void usage __P((void));
+struct hash_table token_table;
+struct hash_table file_table;
+
/* Miscellaneous statistics */
long input_chars;
long name_tokens;
@@ -118,14 +120,6 @@ long hits_length = 0;
long tokens_length = 0;
long output_length = 0;
-/* Hash table maintenance */
-unsigned long hash_size; /* # of slots */
-unsigned long hash_capacity; /* # of usable slots */
-unsigned long hash_fill; /* # of keys inserted in table */
-unsigned long hash_probes = 0;
-unsigned long hash_lookups = 0;
-unsigned int hash_rehashes = 0;
-
int verbose_flag = 0;
int statistics_flag = 1;
@@ -139,7 +133,7 @@ unsigned char current_hits_signature[MAX_LEVELS];
struct summary *summary_root;
struct summary *summary_leaf;
-char PWD_name[BUFSIZ]; /* The current working directory */
+char PWD_buf[BUFSIZ]; /* The current working directory */
char absolute_idfile_name[BUFSIZ]; /* The absolute name of the database */
char const *id_file_name = IDFILE;
@@ -149,7 +143,7 @@ void
usage (void)
{
fprintf (stderr, "\
-Usage: %s [-v] [-f<idfile>] [(+|-)l[<lang>]] [(+|-)S<scanarg>] [-a<argfile>] [-] [-u] [files...]\n\
+Usage: %s [-v] [-f<idfile>] [(+|-)l[<lang>]] [(+|-)S<scanarg>] [-a<argfile>] [-] [files...]\n\
-v Verbose: print reports of progress\n\
-a<file> Open file for arguments\n\
- Read newline-separated args from stdin\n\
@@ -157,14 +151,18 @@ Usage: %s [-v] [-f<idfile>] [(+|-)l[<lang>]] [(+|-)S<scanarg>] [-a<argfile>] [-]
-S<lang>-<arg> Pass arg to <lang> scanner\n\
-S.<suffix>=<lang> Scan files with .<suffix> as <lang>\n\
-S<lang>? Print usage documentation for <lang>\n\
- -u Update an existing database (unimplemented)\n\
\n\
-Version %s; Made %s %s\n",
- program_name, VERSION, __DATE__, __TIME__);
-
+Version %s",
+ program_name, VERSION);
+#ifdef __DATE__
+ fprintf (stderr, "; Made %s %s", __DATE__, __TIME__);
+#endif
+ fputc ('\n', stderr);
exit (1);
}
+void *sbrk ();
+
int
main (int argc, char **argv)
{
@@ -172,13 +170,6 @@ main (int argc, char **argv)
char const *sbrk0;
program_name = basename ((argc--, *argv++));
- if (kshgetwd (PWD_name) == NULL)
- {
- fprintf (stderr, "%s: cannot get current working directory name.\n", program_name);
- return 1;
- }
- strcat (PWD_name, "/");
-
init_scanners ();
idarg_0 = parse_idargs (argc, argv);
@@ -188,10 +179,11 @@ main (int argc, char **argv)
return 0;
}
- sbrk0 = (char const *)sbrk (0);
- init_hash (scan_count * 64);
+ sbrk0 = (char const *) sbrk (0);
+ hash_init (&token_table, scan_count * 64, token_hash_1, token_hash_2, token_hash_cmp);
- strcpy (absolute_idfile_name, span_file_name (PWD_name, id_file_name));
+ get_PWD (PWD_buf);
+ strcpy (absolute_idfile_name, span_file_name (PWD_buf, id_file_name));
if (access (id_file_name, 06) < 0
&& (errno != ENOENT || access (dirname (id_file_name), 06) < 0))
{
@@ -204,11 +196,11 @@ main (int argc, char **argv)
scan_files (idarg_0);
- if (hash_fill == 0)
+ if (token_table.ht_fill == 0)
return 0;
free_summary_tokens ();
- free (hash_table);
+ free (token_table.ht_vec);
write_idfile (id_file_name, idarg_0);
heap_size = (char const *) sbrk (0) - sbrk0;
@@ -225,7 +217,7 @@ scan_files (struct idarg *idarg)
for ( ; idarg->ida_next; idarg = idarg->ida_next)
{
- char const *(*scanner) (FILE*, int*);
+ char const *(*scanner) __P((FILE*, int*));
FILE *source_FILE;
char *arg = idarg->ida_arg;
char const *lang_name = NULL;
@@ -320,13 +312,13 @@ report_statistics (void)
printf ("Heap=%ld Kb, ", heap_size / 1024);
printf ("Output=%ld (%ld tok, %ld hit)\n", output_length, tokens_length, hits_length);
- printf ("Load=%ld/%ld=%.2f, ", hash_fill, hash_size,
- (double) hash_fill / (double) hash_size);
- printf ("Rehash=%d, ", hash_rehashes);
- printf ("Probes=%ld/%ld=%.2f, ", hash_probes, hash_lookups,
- (double) hash_probes / (double) hash_lookups);
- printf ("Freq=%ld/%ld=%.2f\n", occurrences, hash_fill,
- (double) occurrences / (double) hash_fill);
+ printf ("Load=%ld/%ld=%.2f, ", token_table.ht_fill, token_table.ht_size,
+ (double) token_table.ht_fill / (double) token_table.ht_size);
+ printf ("Rehash=%d, ", token_table.ht_rehashes);
+ printf ("Probes=%ld/%ld=%.2f, ", token_table.ht_probes, token_table.ht_lookups,
+ (double) token_table.ht_probes / (double) token_table.ht_lookups);
+ printf ("Freq=%ld/%ld=%.2f\n", occurrences, token_table.ht_fill,
+ (double) occurrences / (double) token_table.ht_fill);
}
struct idarg *
@@ -341,8 +333,7 @@ parse_idargs (int argc, char **argv)
enum {
AF_CMDLINE = 0x1, /* file args came on command line */
AF_FILE = 0x2, /* file args came from a file (-f<file>) */
- AF_IDFILE = 0x4, /* file args came from an old ID file (-u) */
- AF_QUERY = 0x8
+ AF_USAGE = 0x8
}; /* no file args necessary: usage query */
idarg = idarg_0 = CALLOC (struct idarg, 1);
@@ -367,12 +358,6 @@ parse_idargs (int argc, char **argv)
op = *arg++;
switch (*arg++)
{
- case 'u':
-#if 0
- args_from |= AF_IDFILE;
- old_idarg (id_file_name, &idarg);
- break;
-#endif
case '\0':
args_from |= AF_FILE;
idarg = parse_idargs_from_FILE (stdin, idarg);
@@ -397,12 +382,10 @@ parse_idargs (int argc, char **argv)
if (strchr (&arg[-2], '?'))
{
set_scan_args (op, arg);
- args_from |= AF_QUERY;
+ args_from |= AF_USAGE;
}
/*FALLTHROUGH */
case 'l':
- case 's':
- case 'r':
idarg->ida_arg = &arg[-2];
idarg->ida_index = -1;
idarg = (idarg->ida_next = CALLOC (struct idarg, 1));
@@ -414,7 +397,7 @@ parse_idargs (int argc, char **argv)
}
}
- if (args_from & AF_QUERY)
+ if (args_from & AF_USAGE)
exit (0);
/* File args should only come from one place. Ding the user if
arguments came from multiple places, or if none were supplied at
@@ -423,7 +406,6 @@ parse_idargs (int argc, char **argv)
{
case AF_CMDLINE:
case AF_FILE:
- case AF_IDFILE:
if (file_name_count > 0)
break;
/*FALLTHROUGH */
@@ -468,7 +450,7 @@ parse_idargs_from_FILE (FILE *arg_FILE, struct idarg *idarg)
}
void
-scan_1_file (char const *(*get_token) (FILE*, int*), FILE *source_FILE)
+scan_1_file (get_token_t get_token, FILE *source_FILE)
{
struct stat stat_buf;
struct token **slot;
@@ -490,8 +472,9 @@ scan_1_file (char const *(*get_token) (FILE*, int*), FILE *source_FILE)
{
if (*key == '\0')
continue;
- token = *(slot = hash_lookup (key));
total_tokens++;
+ slot = (struct token **) hash_lookup (&token_table, key - offsetof (struct token, tok_name));
+ token = *slot;
if (token)
{
token->tok_flags |= flags;
@@ -507,8 +490,8 @@ scan_1_file (char const *(*get_token) (FILE*, int*), FILE *source_FILE)
sign_token (token);
distinct_tokens++;
new_tokens++;
- if (hash_fill++ >= hash_capacity)
- rehash ();
+ if (token_table.ht_fill++ >= token_table.ht_capacity)
+ rehash (&token_table);
}
}
if (verbose_flag)
@@ -546,13 +529,13 @@ write_idfile (char const *file_name, struct idarg *idarg)
if (verbose_flag)
printf ("Sorting tokens...\n");
- assert (summary_root->sum_hits_count == hash_fill);
- tokens = REALLOC (summary_root->sum_tokens, struct token *, hash_fill);
- qsort (tokens, hash_fill, sizeof (struct token *), compare_tokens);
+ assert (summary_root->sum_hits_count == token_table.ht_fill);
+ tokens = REALLOC (summary_root->sum_tokens, struct token *, token_table.ht_fill);
+ qsort (tokens, token_table.ht_fill, sizeof (struct token *), token_qsort_cmp);
if (verbose_flag)
printf ("Writing `%s'...\n", file_name);
- lsl = strrchr (relative_file_name (PWD_name, absolute_idfile_name), '/');
+ lsl = strrchr (relative_file_name (PWD_buf, absolute_idfile_name), '/');
if (lsl == NULL)
{
/* The database is in the cwd, don't adjust the names */
@@ -575,23 +558,20 @@ write_idfile (char const *file_name, struct idarg *idarg)
idh.idh_magic[0] = IDH_MAGIC_0;
idh.idh_magic[1] = IDH_MAGIC_1;
idh.idh_version = IDH_VERSION;
- idh.idh_pad_1 = 0;
idh.idh_flags = IDH_COUNTS;
/* write out the list of pathnames */
fseek (id_FILE, sizeof_idhead (), 0);
idh.idh_args_offset = ftell (id_FILE);
- for (i = 0; idarg->ida_next; idarg = idarg->ida_next)
+ for ( ; idarg->ida_next; idarg = idarg->ida_next)
{
if (*idarg->ida_arg != '-' && fixup_names)
- fputs (relative_file_name (absolute_idfile_name, span_file_name (PWD_name, idarg->ida_arg)), id_FILE);
+ fputs (relative_file_name (absolute_idfile_name, span_file_name (PWD_buf, idarg->ida_arg)), id_FILE);
else
fputs (idarg->ida_arg, id_FILE);
- i++;
putc ('\0', id_FILE);
}
- idh.idh_args = i;
- idh.idh_paths = file_name_count;
+ idh.idh_files = file_name_count;
/* write out the list of identifiers */
@@ -599,7 +579,7 @@ write_idfile (char const *file_name, struct idarg *idarg)
putc ('\0', id_FILE);
idh.idh_tokens_offset = ftell (id_FILE);
- for (i = 0; i < hash_fill; i++, tokens++)
+ for (i = 0; i < token_table.ht_fill; i++, tokens++)
{
struct token *token = *tokens;
occurrences += token->tok_count;
@@ -639,7 +619,7 @@ write_idfile (char const *file_name, struct idarg *idarg)
putc ('\0', id_FILE);
}
assert_hits (summary_root);
- idh.idh_tokens = hash_fill;
+ idh.idh_tokens = token_table.ht_fill;
output_length = ftell (id_FILE);
idh.idh_end_offset = output_length - 2;
idh.idh_buf_size = max_buf_size;
@@ -649,119 +629,30 @@ write_idfile (char const *file_name, struct idarg *idarg)
fclose (id_FILE);
}
-void
-init_hash (long size)
-{
- hash_size = round2 (size);
- if (hash_size < 512)
- hash_size = 512;
-#if 0
- if (hash_size > 0x8000)
- hash_size = 0x8000;
-#endif
- if (hash_size > (128 * 1024))
- hash_size /= 2;
- hash_capacity = hash_size - (hash_size >> 4); /* about 94% */
- hash_table = CALLOC (struct token *, hash_size);
- assert (hash_table);
-}
-
-/* Use double hashing with open addressing.
- The table size is always a power of two. */
-struct token **
-hash_lookup (char const *key)
-{
- unsigned int hash1 = string_hash_1 (key) % hash_size;
- unsigned int hash2 = 0;
- struct token **slot = &hash_table[hash1];
-
- hash_lookups++;
- for (;;)
- {
- hash_probes++;
- if (*slot == NULL || strcmp (key, (*slot)->tok_name) == 0)
- return slot;
-
- if (!hash2)
- hash2 = string_hash_2 (key);
- hash1 = (hash1 + hash2) % hash_size;
- slot = &hash_table[hash1];
- }
-}
-
-/* Primary hash function for string keys. */
-unsigned int
-string_hash_1 (char const *key)
+unsigned long
+token_hash_1 (void const *key)
{
- unsigned int sum = 0;
-
- key--;
- while (*++key)
- sum += (*key << (key[1] & 0xf));
-
- return sum;
+ return_STRING_HASH_1 (((struct token const *) key)->tok_name);
}
-/* Secondary hash function for string keys. The result must be
- relatively prime to the table size, which is a power of two,
- so any odd number will do. */
-unsigned int
-string_hash_2 (char const *key)
+unsigned long
+token_hash_2 (void const *key)
{
- unsigned int sum = 0;
-
- key--;
- while (*++key)
- sum += (*key << (key[1] & 0x7));
-
- return sum | 1;
+ return_STRING_HASH_2 (((struct token const *) key)->tok_name);
}
-/* Double the size of the hash table in the event of overflow... */
-void
-rehash (void)
+int
+token_hash_cmp (void const *x, void const *y)
{
- unsigned long old_hash_size = hash_size;
- struct token **old_hash_table = hash_table;
- struct token **htp;
- struct token **slot;
-
- hash_size *= 2;
- if (verbose_flag)
- printf ("\n\tRehashing... (doubling size to %ld)\n", hash_size);
- hash_rehashes++;
- hash_capacity = hash_size - (hash_size >> 4);
- hash_table = CALLOC (struct token *, hash_size);
-
- for (htp = old_hash_table; htp < &old_hash_table[old_hash_size]; htp++)
- {
- if (*htp == NULL)
- continue;
- slot = hash_lookup ((*htp)->tok_name);
-
- if (*slot)
- {
- fprintf (stderr, "%s: Duplicate hash entry!\n", (*slot)->tok_name);
- exit (1);
- }
- *slot = *htp;
- }
- free (old_hash_table);
+ return_STRING_COMPARE (((struct token const *) x)->tok_name,
+ ((struct token const *) y)->tok_name);
}
-/* Round a given number up to the nearest power of 2. */
-int
-round2 (int rough)
+int
+token_qsort_cmp (void const *x, void const *y)
{
- int round;
-
- round = 1;
- while (rough)
- {
- round <<= 1;
- rough >>= 1;
- }
- return round;
+ return_STRING_COMPARE ((*(struct token const *const *) x)->tok_name,
+ (*(struct token const *const *) y)->tok_name);
}
struct token *
@@ -815,13 +706,6 @@ bit_to_index (int bit)
return i;
}
-int
-compare_tokens (void const *x, void const *y)
-{
- return strcmp ((*(struct token const *const *) x)->tok_name,
- (*(struct token const *const *) y)->tok_name);
-}
-
void
free_summary_tokens (void)
{
@@ -863,7 +747,7 @@ summarize (void)
printf (fmt, summary->sum_level, count, init_size);
}
- qsort (tokens, count, sizeof (struct token *), compare_tokens);
+ qsort (tokens, count, sizeof (struct token *), token_qsort_cmp);
summary->sum_hits = hits;
while (count--)
{