diff options
author | Greg McGary <greg@mcgary.org> | 1997-04-18 06:42:59 +0000 |
---|---|---|
committer | Greg McGary <greg@mcgary.org> | 1997-04-18 06:42:59 +0000 |
commit | 40b4b4e4990e67028efb79345fba5fa9e760b522 (patch) | |
tree | 34e9ba556c821f7c7f94b99c0cbcbf0cc321b5af /src/mkid.c | |
parent | 8c13e09279f361e18173f9e237c454af0ab33299 (diff) | |
download | idutils-40b4b4e4990e67028efb79345fba5fa9e760b522.tar.gz idutils-40b4b4e4990e67028efb79345fba5fa9e760b522.tar.bz2 idutils-40b4b4e4990e67028efb79345fba5fa9e760b522.zip |
Initial revision
Diffstat (limited to 'src/mkid.c')
-rw-r--r-- | src/mkid.c | 794 |
1 files changed, 794 insertions, 0 deletions
diff --git a/src/mkid.c b/src/mkid.c new file mode 100644 index 0000000..ebdb2fc --- /dev/null +++ b/src/mkid.c @@ -0,0 +1,794 @@ +/* mkid.c -- build an identifer database + Copyright (C) 1986, 1995, 1996 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdlib.h> +#include <stddef.h> +#include <unistd.h> +#include <limits.h> +#include <assert.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include <errno.h> +#include <getopt.h> + +#include <config.h> +#include "system.h" +#include "pathmax.h" +#include "strxtra.h" +#include "alloc.h" +#include "idfile.h" +#include "token.h" +#include "bitops.h" +#include "misc.h" +#include "filenames.h" +#include "hash.h" +#include "scanners.h" +#include "error.h" + +struct summary +{ + struct token **sum_tokens; + unsigned char const *sum_hits; + struct summary *sum_parent; + union { + struct summary *u_kids[8]; /* when sum_level > 0 */ +#define sum_kids sum_u.u_kids + struct member_file *u_files[8]; /* when sum_level == 0 */ +#define sum_files sum_u.u_files + } sum_u; + unsigned long sum_tokens_size; + unsigned long sum_hits_count; + int sum_free_index; + int sum_level; +}; + +void usage __P((void)); +static void help_me __P((void)); +int main __P((int argc, char **argv)); +void assert_writeable __P((char const *file_name)); +void scan_files __P((struct idhead *idhp)); +void scan_member_file __P((struct member_file const *member)); +void scan_member_file_1 __P((get_token_func_t get_token, void const *args, FILE *source_FILE)); +void report_statistics __P((void)); +void write_id_file __P((struct idhead *idhp)); +unsigned long token_hash_1 __P((void const *key)); +unsigned long token_hash_2 __P((void const *key)); +int token_hash_cmp __P((void const *x, void const *y)); +int token_qsort_cmp __P((void const *x, void const *y)); +void bump_current_hits_signature __P((void)); +void init_hits_signature __P((int i)); +void free_summary_tokens __P((void)); +void summarize __P((void)); +void init_summary __P((void)); +struct summary *make_sibling_summary __P((struct summary *summary)); +int count_vec_size __P((struct summary *summary, unsigned char const *tail_hits)); +int count_buf_size __P((struct summary *summary, unsigned char const *tail_hits)); +void assert_hits __P((struct summary* summary)); +void write_hits __P((FILE *fp, struct summary *summary, unsigned char const *tail_hits)); +void sign_token __P((struct token *token)); +void add_token_to_summary __P((struct summary *summary, struct token *token)); + +struct hash_table token_table; + +/* Miscellaneous statistics */ +size_t input_chars; +size_t name_tokens; +size_t number_tokens; +size_t string_tokens; +size_t literal_tokens; +size_t comment_tokens; +size_t occurrences; +size_t hits_length = 0; +size_t tokens_length = 0; +size_t output_length = 0; + +int verbose_flag = 0; +int statistics_flag = 0; + +int file_name_count = 0; /* # of files in database */ +int levels = 0; /* ceil(log(8)) of file_name_count */ + +unsigned char current_hits_signature[MAX_LEVELS]; +#define INIT_TOKENS_SIZE(level) (1 << ((level) + 13)) +struct summary *summary_root; +struct summary *summary_leaf; + +char const *program_name; + +char *include_languages = 0; +char *exclude_languages = 0; +char *lang_map_file_name = 0; +int show_version = 0; +int show_help = 0; +struct idhead idh; + +void +usage (void) +{ + fprintf (stderr, _("Try `%s --help' for more information.\n"), + program_name); + exit (1); +} + +static struct option const long_options[] = +{ + { "file", required_argument, 0, 'f' }, + { "output", required_argument, 0, 'o' }, + { "include", required_argument, 0, 'i' }, + { "exclude", required_argument, 0, 'r' }, + { "lang-arg", required_argument, 0, 'l' }, + { "lang-map", required_argument, 0, 'm' }, + { "verbose", no_argument, 0, 'v' }, + { "statistics", no_argument, 0, 's' }, + { "help", no_argument, &show_help, 1 }, + { "version", no_argument, &show_version, 1 }, + { 0 } +}; + +static void +help_me (void) +{ + printf (_("\ +Usage: %s [OPTION]... [FILE]...\n"), + program_name); + + printf (_("\ +Build an identifier database.\n\ + -o, --output=OUTFILE file name of ID database output\n\ + -f, --file=OUTFILE synonym for --output\n\ + -i, --include=LANGS include languages in LANGS (default: \"C C++ asm\")\n\ + -x, --exclude=LANGS exclude languages in LANGS\n\ + -l, --lang-arg=LANG:ARG pass ARG as a default for LANG (see below)\n\ + -m, --lang-map=MAPFILE use MAPFILE to map file names onto source language\n\ + -v, --verbose report progress and as files are scanned\n\ + -s, --statistics report statistics at end of run\n\ +\n\ + --help display this help and exit\n\ + --version output version information and exit\n\ +\n\ +FILE may be a file name, or a directory name to recursively search.\n\ +The `--include' and `--exclude' options are mutually-exclusive.\n\ +\n\ +The following arguments apply to the language-specific scanners:\n\ +")); + language_help_me (); + exit (0); +} + +#if !HAVE_DECL_SBRK +extern void *sbrk (); +#endif +char const *heap_initial; +char const *heap_after_walk; +char const *heap_after_scan; + +int +main (int argc, char **argv) +{ + program_name = argv[0]; + idh.idh_file_name = ID_FILE_NAME; + for (;;) + { + int optc = getopt_long (argc, argv, "o:f:i:x:l:m:uvs", + long_options, (int *) 0); + if (optc < 0) + break; + switch (optc) + { + case 0: + break; + + case 'o': + case 'f': + idh.idh_file_name = optarg; + break; + + case 'i': + include_languages = optarg; + break; + + case 'x': + exclude_languages = optarg; + break; + + case 'l': + language_save_arg (optarg); + break; + + case 'm': + lang_map_file_name = optarg; + break; + + case 'v': + verbose_flag = 1; + break; + + case 's': + statistics_flag = 1; + break; + + default: + usage (); + } + } + + if (show_version) + { + printf ("%s - %s\n", program_name, PACKAGE_VERSION); + exit (0); + } + + if (show_help) + help_me (); + + argc -= optind; + argv += optind; + language_getopt (); + + assert_writeable (idh.idh_file_name); + + if (argc == 0) + { + argc++; + *(char const **)--argv = "."; + } + heap_initial = (char const *) sbrk (0); + init_idh_obstacks (&idh); + init_idh_tables (&idh); + parse_language_map (lang_map_file_name); + + { + struct file_link *cwd_link = get_current_dir_link (); + while (argc--) + walk_flink (parse_file_name (*argv++, cwd_link), 0); + mark_member_file_links (&idh); + heap_after_walk = (char const *) sbrk (0); + scan_files (&idh); + heap_after_scan = sbrk (0); + free_summary_tokens (); + free (token_table.ht_vec); + chdir_to_link (cwd_link); + write_id_file (&idh); + } + if (statistics_flag) + report_statistics (); + exit (0); +} + +void +assert_writeable (char const *file_name) +{ + if (access (file_name, 06) < 0) + { + if (errno == ENOENT) + { + char const *dir_name = dirname (file_name); + if (access (dir_name, 06) < 0) + error (1, errno, _("can't create `%s' in `%s'"), + basename (file_name), dir_name); + } + else + error (1, errno, _("can't modify `%s'"), file_name); + } +} + +void +scan_files (struct idhead *idhp) +{ + struct member_file **members_0 + = (struct member_file **) hash_dump (&idhp->idh_member_file_table, + 0, member_file_qsort_compare); + struct member_file **end = &members_0[idhp->idh_member_file_table.ht_fill]; + struct member_file **members; + + hash_init (&token_table, idhp->idh_member_file_table.ht_fill * 64, + token_hash_1, token_hash_2, token_hash_cmp); + init_hits_signature (0); + init_summary (); + obstack_init (&tokens_obstack); + + for (members = members_0; members < end; members++) + scan_member_file (*members); + free (members_0); +} + +void +scan_member_file (struct member_file const *member) +{ + struct lang_args const *lang_args = member->mf_lang_args; + struct language const *lang = lang_args->la_language; + get_token_func_t get_token = lang->lg_get_token; + struct file_link *flink = member->mf_link; + struct stat st; + FILE *source_FILE; + size_t bytes; + + chdir_to_link (flink->fl_parent); + source_FILE = open_source_FILE (flink->fl_name); + if (source_FILE) + { + char buf[PATH_MAX]; + if (verbose_flag) + { + printf ("%d: %s: %s", member->mf_index, lang->lg_name, + absolute_path (buf, flink)); + fflush (stdout); + } + if (fstat (fileno (source_FILE), &st) < 0) + error (0, errno, _("can't stat `%s'"), absolute_path (buf, flink)); + else + { + bytes = st.st_size; + input_chars += bytes; + } + scan_member_file_1 (get_token, lang_args->la_args_digested, source_FILE); + if (verbose_flag) + putchar ('\n'); + close_source_FILE (source_FILE); + } + if (current_hits_signature[0] & 0x80) + summarize (); +#if 0 + if (member->mf_index < file_name_count) +#endif + bump_current_hits_signature (); +} + +void +scan_member_file_1 (get_token_func_t get_token, void const *args, FILE *source_FILE) +{ + struct stat st; + struct token **slot; + struct token *token; + int flags; + int bytes = 0; + int total_tokens = 0; + int new_tokens = 0; + int distinct_tokens = 0; + + while ((token = (*get_token) (source_FILE, args, &flags)) != NULL) + { + if (*token->tok_name == '\0') { + obstack_free (&tokens_obstack, token); + continue; + } + total_tokens++; + slot = (struct token **) hash_find_slot (&token_table, token); + if (HASH_VACANT (*slot)) + { + token->tok_count = 1; + memset (token->tok_hits, 0, sizeof (token->tok_hits)); + token->tok_flags = flags; + sign_token (token); + distinct_tokens++; + new_tokens++; + hash_insert_at (&token_table, token, slot); + } + else + { + obstack_free (&tokens_obstack, token); + token = *slot; + token->tok_flags |= flags; + if (token->tok_count < USHRT_MAX) + token->tok_count++; + if (!(token->tok_hits[0] & current_hits_signature[0])) + { + sign_token (token); + distinct_tokens++; + } + } + } + if (verbose_flag) + { + printf (_(" new = %d/%d"), new_tokens, distinct_tokens); + if (distinct_tokens != 0) + printf (" = %.0f%%", 100.0 * (double) new_tokens / (double) distinct_tokens); + } +} + +void +report_statistics (void) +{ + printf (_("Name=%ld, "), name_tokens); + printf (_("Number=%ld, "), number_tokens); + printf (_("String=%ld, "), string_tokens); + printf (_("Literal=%ld, "), literal_tokens); + printf (_("Comment=%ld\n"), comment_tokens); + + printf (_("Files=%d, "), idh.idh_files); + printf (_("Tokens=%ld, "), occurrences); + printf (_("Bytes=%ld Kb, "), input_chars / 1024); + printf (_("Heap=%ld+%ld Kb, "), (heap_after_scan - heap_after_walk) / 1024, + (heap_after_walk - heap_initial) / 1024); + printf (_("Output=%ld (%ld tok, %ld hit)\n"), output_length, tokens_length, hits_length); + + hash_print_stats (&token_table, stdout); + printf (_(", Freq=%ld/%ld=%.2f\n"), occurrences, token_table.ht_fill, + (double) occurrences / (double) token_table.ht_fill); +} + +/* As the database is written, may need to adjust the file names. If + we are generating the ID file in a remote directory, then adjust + the file names to be relative to the location of the ID database. + + (This would be a common useage if you want to make a database for a + directory which you have no write access to, so you cannot create + the ID file.) */ +void +write_id_file (struct idhead *idhp) +{ + struct token **tokens; + int i; + int buf_size; + int vec_size; + int tok_size; + int max_buf_size = 0; + int max_vec_size = 0; + + if (verbose_flag) + printf (_("Sorting tokens...\n")); + assert (summary_root->sum_hits_count == token_table.ht_fill); + tokens = REALLOC (summary_root->sum_tokens, struct token *, token_table.ht_fill); + qsort (tokens, token_table.ht_fill, sizeof (struct token *), token_qsort_cmp); + + if (verbose_flag) + printf (_("Writing `%s'...\n"), idhp->idh_file_name); + idhp->idh_FILE = fopen (idhp->idh_file_name, "w+b"); + if (idhp->idh_FILE == NULL) + error (1, errno, _("can't create `%s'"), idhp->idh_file_name); + + idhp->idh_magic[0] = IDH_MAGIC_0; + idhp->idh_magic[1] = IDH_MAGIC_1; + idhp->idh_version = IDH_VERSION; + idhp->idh_flags = IDH_COUNTS; + + /* write out the list of pathnames */ + + fseek (idhp->idh_FILE, sizeof_idhead (), 0); + idhp->idh_flinks_offset = ftell (idhp->idh_FILE); + serialize_file_links (idhp); + + /* write out the list of identifiers */ + + putc ('\0', idhp->idh_FILE); + putc ('\0', idhp->idh_FILE); + idhp->idh_tokens_offset = ftell (idhp->idh_FILE); + + for (i = 0; i < token_table.ht_fill; i++, tokens++) + { + struct token *token = *tokens; + occurrences += token->tok_count; + if (token->tok_flags & TOK_NUMBER) + number_tokens++; + if (token->tok_flags & TOK_NAME) + name_tokens++; + if (token->tok_flags & TOK_STRING) + string_tokens++; + if (token->tok_flags & TOK_LITERAL) + literal_tokens++; + if (token->tok_flags & TOK_COMMENT) + comment_tokens++; + + fputs (token->tok_name, idhp->idh_FILE); + putc ('\0', idhp->idh_FILE); + if (token->tok_count > 0xff) + token->tok_flags |= TOK_SHORT_COUNT; + putc (token->tok_flags, idhp->idh_FILE); + putc (token->tok_count & 0xff, idhp->idh_FILE); + if (token->tok_flags & TOK_SHORT_COUNT) + putc (token->tok_count >> 8, idhp->idh_FILE); + + vec_size = count_vec_size (summary_root, token->tok_hits + levels); + buf_size = count_buf_size (summary_root, token->tok_hits + levels); + hits_length += buf_size; + tok_size = strlen (token->tok_name) + 1; + tokens_length += tok_size; + buf_size += tok_size + sizeof (token->tok_flags) + sizeof (token->tok_count) + 2; + if (buf_size > max_buf_size) + max_buf_size = buf_size; + if (vec_size > max_vec_size) + max_vec_size = vec_size; + + write_hits (idhp->idh_FILE, summary_root, token->tok_hits + levels); + putc ('\0', idhp->idh_FILE); + putc ('\0', idhp->idh_FILE); + } + assert_hits (summary_root); + idhp->idh_tokens = token_table.ht_fill; + output_length = ftell (idhp->idh_FILE); + idhp->idh_end_offset = output_length - 2; + idhp->idh_buf_size = max_buf_size; + idhp->idh_vec_size = max_vec_size; + + write_idhead (&idh); + fclose (idhp->idh_FILE); +} + +unsigned long +token_hash_1 (void const *key) +{ + return_STRING_HASH_1 (((struct token const *) key)->tok_name); +} + +unsigned long +token_hash_2 (void const *key) +{ + return_STRING_HASH_2 (((struct token const *) key)->tok_name); +} + +int +token_hash_cmp (void const *x, void const *y) +{ + return_STRING_COMPARE (((struct token const *) x)->tok_name, + ((struct token const *) y)->tok_name); +} + +int +token_qsort_cmp (void const *x, void const *y) +{ + return_STRING_COMPARE ((*(struct token const *const *) x)->tok_name, + (*(struct token const *const *) y)->tok_name); +} + + +/****************************************************************************/ + +void +bump_current_hits_signature (void) +{ + unsigned char *hits = current_hits_signature; + while (*hits & 0x80) + *hits++ = 1; + *hits <<= 1; +} + +void +init_hits_signature (int i) +{ + unsigned char *hits = current_hits_signature; + unsigned char const *end = ¤t_hits_signature[MAX_LEVELS]; + while (hits < end) + { + *hits = 1 << (i & 7); + i >>= 3; + hits++; + } +} + +void +free_summary_tokens (void) +{ + struct summary *summary = summary_leaf; + while (summary != summary_root) + { + free (summary->sum_tokens); + summary = summary->sum_parent; + } +} + +void +summarize (void) +{ + unsigned char const *hits_sig = current_hits_signature; + struct summary *summary = summary_leaf; + + do + { + unsigned long count = summary->sum_hits_count; + unsigned char *hits = MALLOC (unsigned char, count + 1); + unsigned int level = summary->sum_level; + struct token **tokens = summary->sum_tokens; + unsigned long init_size = INIT_TOKENS_SIZE (summary->sum_level); + + if (verbose_flag) + printf (_("level %d: %ld/%ld = %.0f%%\n"), + summary->sum_level, count, init_size, + 100.0 * (double) count / (double) init_size); + + qsort (tokens, count, sizeof (struct token *), token_qsort_cmp); + summary->sum_hits = hits; + while (count--) + { + unsigned char *hit = &(*tokens++)->tok_hits[level]; + *hits++ = *hit; + *hit = 0; + } + *hits++ = 0; + if (summary->sum_parent) + { + free (summary->sum_tokens); + summary->sum_tokens = 0; + } + summary = summary->sum_parent; + } + while (*++hits_sig & 0x80); + summary_leaf = make_sibling_summary (summary_leaf); +} + +void +init_summary (void) +{ + unsigned long size = INIT_TOKENS_SIZE (0); + summary_root = summary_leaf = CALLOC (struct summary, 1); + summary_root->sum_tokens_size = size; + summary_root->sum_tokens = MALLOC (struct token *, size); +} + +struct summary * +make_sibling_summary (struct summary *summary) +{ + struct summary *parent = summary->sum_parent; + unsigned long size; + + if (parent == NULL) + { + levels++; + summary_root = summary->sum_parent = parent = CALLOC (struct summary, 1); + parent->sum_level = levels; + parent->sum_kids[0] = summary; + parent->sum_hits_count = summary->sum_hits_count; + parent->sum_free_index = 1; + size = INIT_TOKENS_SIZE (levels); + if (summary->sum_tokens_size >= size) + { + parent->sum_tokens_size = summary->sum_tokens_size; + parent->sum_tokens = summary->sum_tokens; + } + else + { + parent->sum_tokens_size = size; + parent->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size); + } + summary->sum_tokens = 0; + } + if (parent->sum_free_index == 8) + parent = make_sibling_summary (parent); + summary = CALLOC (struct summary, 1); + summary->sum_level = parent->sum_level - 1; + parent->sum_kids[parent->sum_free_index++] = summary; + summary->sum_parent = parent; + size = INIT_TOKENS_SIZE (summary->sum_level); + summary->sum_tokens_size = size; + summary->sum_tokens = MALLOC (struct token *, size); + return summary; +} + +int +count_vec_size (struct summary *summary, unsigned char const *tail_hits) +{ + struct summary **kids; + unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits); + + kids = summary->sum_kids; + if (*kids == NULL) + { + static char bits_per_nybble[] = { 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4 }; + return bits_per_nybble[hits & 0xf] + bits_per_nybble[hits >> 4]; + } + else + { + int bit; + int count = 0; + --tail_hits; + for (bit = 1; bit & 0xff; bit <<= 1, ++kids) + if (bit & hits) + count += count_vec_size (*kids, tail_hits); + return count; + } +} + +int +count_buf_size (struct summary *summary, unsigned char const *tail_hits) +{ + struct summary **kids; + unsigned int hits = (summary->sum_hits ? *summary->sum_hits : *tail_hits); + + kids = summary->sum_kids; + if (*kids == NULL) + return 1; + else + { + int bit; + int count = 1; + --tail_hits; + for (bit = 1; bit & 0xff; bit <<= 1, ++kids) + if (bit & hits) + count += count_buf_size (*kids, tail_hits); + return count; + } +} + +void +assert_hits (struct summary* summary) +{ + struct summary **kids = summary->sum_kids; + struct summary **end = &kids[8]; + + assert (summary->sum_hits == NULL || *summary->sum_hits == 0); + + if (end[-1] == 0) + while (*--end == 0) + ; + while (kids < end) + assert_hits (*kids++); +} + +void +write_hits (FILE *fp, struct summary *summary, unsigned char const *tail_hits) +{ + struct summary **kids; + unsigned int hits = (summary->sum_hits ? *summary->sum_hits++ : *tail_hits); + + assert (hits); + putc (hits, fp); + + kids = summary->sum_kids; + if (*kids) + { + int bit; + --tail_hits; + for (bit = 1; (bit & 0xff) && *kids; bit <<= 1, ++kids) + if (bit & hits) + write_hits (fp, *kids, tail_hits); + } +} + +void +sign_token (struct token *token) +{ + unsigned char *tok_hits = token->tok_hits; + unsigned char *hits_sig = current_hits_signature; + unsigned char *end = ¤t_hits_signature[MAX_LEVELS]; + struct summary *summary = summary_leaf; + + while (summary) + { + if (*tok_hits == 0) + add_token_to_summary (summary, token); + if (*tok_hits & *hits_sig) + break; + *tok_hits |= *hits_sig; + summary = summary->sum_parent; + tok_hits++; + hits_sig++; + } + while (hits_sig < end) + { + if (*tok_hits & *hits_sig) + break; + *tok_hits |= *hits_sig; + tok_hits++; + hits_sig++; + } +} + +void +add_token_to_summary (struct summary *summary, struct token *token) +{ + unsigned long size = summary->sum_tokens_size; + + if (summary->sum_hits_count >= size) + { + size *= 2; + summary->sum_tokens = REALLOC (summary->sum_tokens, struct token *, size); + summary->sum_tokens_size = size; + } + summary->sum_tokens[summary->sum_hits_count++] = token; +} |