diff options
Diffstat (limited to 'scanners.c')
-rw-r--r-- | scanners.c | 1209 |
1 files changed, 1209 insertions, 0 deletions
diff --git a/scanners.c b/scanners.c new file mode 100644 index 0000000..9020a1c --- /dev/null +++ b/scanners.c @@ -0,0 +1,1209 @@ +/* scanners.c -- file & directory name manipulations + Copyright (C) 1986, 1995 Greg McGary + VHIL portions Copyright (C) 1988 Tom Horsley + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include "config.h" +#include <stdio.h> +#include <string.h> +#include <ctype.h> +#include "strxtra.h" +#include "token.h" +#include "alloc.h" +#include "scanners.h" + +extern char const *program_name; + +static char const *get_token_VHIL (FILE *input_FILE, int *flags); +static char const *get_token_c (FILE *input_FILE, int *flags); +static void set_args_c (char const *lang_name, int op, char const *arg); +static void set_ctype_c (char const *chars, int type); +static void clear_ctype_c (char const *chars, int type); +static void usage_c (char const *lang_name); + +static char const *get_token_asm (FILE *input_FILE, int *flags); +static void set_ctype_asm (char const *chars, int type); +static void clear_ctype_asm (char const *chars, int type); +static void usage_asm (char const *lang_name); +static void set_args_asm (char const *lang_name, int op, char const *arg); + +static char const *get_token_text (FILE *input_FILE, int *flags); +static void set_ctype_text (char const *chars, int type); +static void clear_ctype_text (char const *chars, int type); +static void usage_text (char const *lang_name); +static void set_args_text (char const *lang_name, int op, char const *arg); + +/****************************************************************************/ + +struct language +{ + char const *lang_name; + char const *(*lang_get_token) (FILE *input_FILE, int *flags); + void (*lang_set_args) (char const *lang_name, int op, char const *arg); + char const *lang_filter; + struct language *lang_next; +}; + +struct suffix +{ + char const *suff_suffix; + char const *suff_lang_name; + struct language *suff_language; + struct suffix *suff_next; +}; + +static struct suffix *get_suffix_entry (char const *suffix); +static struct language *get_lang_entry (char const *lang_name); +static void usage_scan (void); + +struct language languages[] = +{ + /* must be sorted for bsearch(3) */ + { "C", get_token_c, set_args_c, NULL }, + { "TeX", get_token_text, set_args_text, NULL }, + { "VHIL", get_token_VHIL, set_args_c, NULL }, + { "asm", get_token_asm, set_args_asm, NULL }, +/*{ "elisp", get_token_elisp, set_args_elisp, NULL },*/ + { "gzip", NULL, NULL, "zcat %s" }, + { "roff", get_token_text, set_args_text, "sed '/^\\.so/d' < %s | deroff" }, + { "text", get_token_text, set_args_text, NULL }, +}; + +/* + This is a rather incomplete list of default associations + between suffixes and languages. You may add more to the + default list, or you may define them dynamically with the + `-S<suff>=<lang>' argument to mkid(1) and idx(1). e.g. to + associate a `.ada' suffix with the Ada language, use + `-S.ada=ada' +*/ +struct suffix suffixes[] = +{ + { "", "text" }, + { ".1", "roff" }, + { ".2", "roff" }, + { ".3", "roff" }, + { ".4", "roff" }, + { ".5", "roff" }, + { ".6", "roff" }, + { ".7", "roff" }, + { ".8", "roff" }, + { ".C", "C" }, + { ".H", "C" }, + { ".Z", "gzip" }, + { ".c", "C" }, + { ".cc", "C" }, + { ".cpp", "C" }, + { ".cxx", "C" }, + { ".doc", "text" }, +/*{ ".el", "elisp" },*/ + { ".gz", "gzip" }, + { ".h", "C" }, + { ".hh", "C" }, + { ".hpp", "C" }, + { ".hxx", "C" }, + { ".l", "C" }, + { ".lex", "C" }, + { ".ltx", "TeX" }, + { ".p", "pas" }, + { ".pas", "pas" }, + { ".s", "asm" }, + { ".S", "asm" }, + { ".tex", "TeX" }, + { ".x", "VHIL" }, + { ".y", "C" }, + { ".yacc", "C" }, + { ".z", "gzip" }, +}; + +void +init_scanners (void) +{ + struct language *lang; + struct language *lang_N = &languages[(sizeof(languages)/sizeof(languages[0])) - 1]; + struct suffix *suff; + struct suffix *suff_N = &suffixes[(sizeof(suffixes)/sizeof(suffixes[0])) - 1]; + + for (lang = languages; lang <= lang_N; ++lang) + lang->lang_next = lang + 1; + lang_N->lang_next = NULL; + + for (suff = suffixes; suff <= suff_N; ++suff) { + lang = get_lang_entry (suff->suff_lang_name); + if (lang) + suff->suff_language = lang; + suff->suff_next = suff + 1; + } + suff_N->suff_next = NULL; +} + +/* Return a suffix table entry for the given suffix. */ +static struct suffix * +get_suffix_entry (char const *suffix) +{ + struct suffix *stp; + + if (suffix == NULL) + suffix = ""; + + for (stp = suffixes; stp; stp = stp->suff_next) + if (strequ (stp->suff_suffix, suffix)) + return stp; + return stp; +} + +static struct language * +get_lang_entry (char const *lang_name) +{ + struct language *ltp; + + if (lang_name == NULL) + lang_name = ""; + + for (ltp = languages; ltp->lang_next; ltp = ltp->lang_next) + if (ltp->lang_name == lang_name || strequ (ltp->lang_name, lang_name)) + return ltp; + return ltp; +} + +char const * +get_lang_name (char const *suffix) +{ + struct suffix *stp; + + stp = get_suffix_entry (suffix); + if (stp->suff_next == NULL) + return NULL; + return stp->suff_language->lang_name; +} + +char const * +get_filter (char const *suffix) +{ + struct suffix *stp; + + stp = get_suffix_entry (suffix); + if (stp->suff_next == NULL) + return NULL; + return stp->suff_language->lang_filter; +} + +char const *(* +get_scanner (char const *lang) + ) (FILE *input_FILE, int *flags) +{ + struct language *ltp; + + ltp = get_lang_entry (lang); + if (ltp->lang_next == NULL) + return NULL; + return ltp->lang_get_token; +} + +void +set_scan_args (int op, char *arg) +{ + struct language *ltp, *ltp2; + struct suffix *stp; + char *lhs; + char *lhs2; + int count = 0; + + lhs = arg; + while (isalnum (*arg) || *arg == '.') + arg++; + + if (strequ (lhs, "?=?")) + { + for (stp = suffixes; stp->suff_next; stp = stp->suff_next) + { + printf ("%s%s=%s", (count++ > 0) ? ", " : "", stp->suff_suffix, stp->suff_language->lang_name); + if (stp->suff_language->lang_filter) + printf (" (%s)", stp->suff_language->lang_filter); + } + if (count) + putchar ('\n'); + return; + } + + if (strnequ (lhs, "?=", 2)) + { + lhs += 2; + ltp = get_lang_entry (lhs); + if (ltp->lang_next == NULL) + { + printf ("No scanner for language `%s'\n", lhs); + return; + } + for (stp = suffixes; stp->suff_next; stp = stp->suff_next) + if (stp->suff_language == ltp) + { + printf ("%s%s=%s", (count++ > 0) ? ", " : "", stp->suff_suffix, ltp->lang_name); + if (stp->suff_language->lang_filter) + printf (" (%s)", stp->suff_language->lang_filter); + } + if (count) + putchar ('\n'); + return; + } + + if (strequ (arg, "=?")) + { + lhs[strlen (lhs) - 2] = '\0'; + stp = get_suffix_entry (lhs); + if (stp->suff_next == NULL) + { + printf ("No scanner assigned to suffix `%s'\n", lhs); + return; + } + printf ("%s=%s", stp->suff_suffix, stp->suff_language->lang_name); + if (stp->suff_language->lang_filter) + printf (" (%s)", stp->suff_language->lang_filter); + printf ("\n"); + return; + } + + if (*arg == '=') + { + *arg++ = '\0'; + + ltp = get_lang_entry (arg); + if (ltp->lang_next == NULL) + { + fprintf (stderr, "%s: Language undefined: %s\n", program_name, arg); + return; + } + stp = get_suffix_entry (lhs); + if (stp->suff_next == NULL) + { + stp->suff_suffix = lhs; + stp->suff_language = ltp; + stp->suff_next = CALLOC (struct suffix, 1); + } + else if (!strequ (arg, stp->suff_language->lang_name)) + { + fprintf (stderr, "%s: Note: `%s=%s' overrides `%s=%s'\n", program_name, lhs, arg, lhs, stp->suff_language->lang_name); + stp->suff_language = ltp; + } + return; + } + else if (*arg == '/') + { + *arg++ = '\0'; + ltp = get_lang_entry (lhs); + if (ltp->lang_next == NULL) + { + ltp->lang_name = lhs; + ltp->lang_get_token = get_token_text; + ltp->lang_set_args = set_args_text; + ltp->lang_filter = NULL; + ltp->lang_next = CALLOC (struct language, 1); + } + lhs2 = arg; + arg = strchr (arg, '/'); + if (arg == NULL) + ltp2 = ltp; + else + { + *arg++ = '\0'; + ltp2 = get_lang_entry (lhs2); + if (ltp2->lang_next == NULL) + { + fprintf (stderr, "%s: language %s not defined.\n", program_name, lhs2); + ltp2 = ltp; + } + } + ltp->lang_get_token = ltp2->lang_get_token; + ltp->lang_set_args = ltp2->lang_set_args; + if (ltp->lang_filter && (!strequ (arg, ltp->lang_filter))) + fprintf (stderr, "%s: Note: `%s/%s' overrides `%s/%s'\n", program_name, lhs, arg, lhs, ltp->lang_filter); + ltp->lang_filter = arg; + return; + } + + if (op == '+') + { + switch (op = *arg++) + { + case '+': + case '-': + case '?': + break; + default: + usage_scan (); + } + for (ltp = languages; ltp->lang_next; ltp = ltp->lang_next) + (*ltp->lang_set_args) (NULL, op, arg); + return; + } + + if (*arg == '-' || *arg == '+' || *arg == '?') + { + op = *arg; + *arg++ = '\0'; + + ltp = get_lang_entry (lhs); + if (ltp->lang_next == NULL) + { + fprintf (stderr, "%s: Language undefined: %s\n", program_name, lhs); + return; + } + (*ltp->lang_set_args) (lhs, op, arg); + return; + } + + usage_scan (); +} + +static void +usage_scan (void) +{ + fprintf (stderr, "Usage: %s [-S<suffix>=<lang>] [+S(+|-)<arg>] [-S<lang>(+|-)<arg>] [-S<lang>/<lang>/<filter>]\n", program_name); + exit (1); +} + +/*************** C & C++ ****************************************************/ + +#define I1 0x0001 /* 1st char of an identifier [a-zA-Z_] */ +#define DG 0x0002 /* decimal digit [0-9] */ +#define NM 0x0004 /* extra chars in a hex or long number [a-fA-FxXlL] */ +#define C1 0x0008 /* C comment introduction char: / */ +#define C2 0x0010 /* C comment termination char: * */ +#define Q1 0x0020 /* single quote: ' */ +#define Q2 0x0040 /* double quote: " */ +#define ES 0x0080 /* escape char: \ */ +#define NL 0x0100 /* newline: \n */ +#define EF 0x0200 /* EOF */ +#define SK 0x0400 /* Make these chars valid for names within strings */ +#define VH 0x0800 /* VHIL comment introduction char: # */ +#define WS 0x1000 /* White space characters */ + +/* + character class membership macros: +*/ +#define ISDIGIT(c) ((rct)[c] & (DG)) /* digit */ +#define ISNUMBER(c) ((rct)[c] & (DG|NM)) /* legal in a number */ +#define ISEOF(c) ((rct)[c] & (EF)) /* EOF */ +#define ISID1ST(c) ((rct)[c] & (I1)) /* 1st char of an identifier */ +#define ISIDREST(c) ((rct)[c] & (I1|DG)) /* rest of an identifier */ +#define ISSTRKEEP(c) ((rct)[c] & (I1|DG|SK)) /* keep contents of string */ +#define ISSPACE(c) ((rct)[c] & (WS)) /* white space character */ +/* + The `BORING' classes should be skipped over + until something interesting comes along... +*/ +#define ISBORING(c) (!((rct)[c] & (EF|NL|I1|DG|Q1|Q2|C1|VH))) /* fluff */ +#define ISCBORING(c) (!((rct)[c] & (EF|C2))) /* comment fluff */ +#define ISVBORING(c) (!((rct)[c] & (EF|NL))) /* vhil comment fluff */ +#define ISQ1BORING(c) (!((rct)[c] & (EF|NL|Q1|ES))) /* char const fluff */ +#define ISQ2BORING(c) (!((rct)[c] & (EF|NL|Q2|ES))) /* quoted str fluff */ + +static short ctype_c[257] = +{ + EF, +/* 0 1 2 3 4 5 6 7 */ +/* ----- ----- ----- ----- ----- ----- ----- ----- */ +/*000*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*010*/ 0, 0, NL, 0, 0, 0, 0, 0, +/*020*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*030*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*040*/ 0, 0, Q2, 0, 0, 0, 0, Q1, +/*050*/ 0, 0, C2, 0, 0, 0, 0, C1, +/*060*/ DG, DG, DG, DG, DG, DG, DG, DG, +/*070*/ DG, DG, 0, 0, 0, 0, 0, 0, +/*100*/ 0, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1, +/*110*/ I1, I1, I1, I1, I1|NM, I1, I1, I1, +/*120*/ I1, I1, I1, I1, I1, I1, I1, I1, +/*130*/ I1|NM, I1, I1, 0, ES, 0, 0, I1, +/*140*/ 0, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1, +/*150*/ I1, I1, I1, I1, I1|NM, I1, I1, I1, +/*160*/ I1, I1, I1, I1, I1, I1, I1, I1, +/*170*/ I1|NM, I1, I1, 0, 0, 0, 0, 0, +}; + +static int eat_underscore = 1; +static int scan_VHIL = 0; + +static char const * +get_token_VHIL (FILE *input_FILE, int *flags) +{ + if (!scan_VHIL) + set_args_c ("vhil", '+', "v"); + return get_token_c (input_FILE, flags); +} + +/* + Grab the next identifier the C source + file opened with the handle `input_FILE'. + This state machine is built for speed, not elegance. +*/ +static char const * +get_token_c (FILE *input_FILE, int *flags) +{ + static char input_buffer[BUFSIZ]; + static int new_line = 1; + short *rct = &ctype_c[1]; + int c; + char *id = input_buffer; + +top: + c = getc (input_FILE); + if (new_line) + { + new_line = 0; + if (c == '.') + { + /* Auto-recognize vhil code when you see a '.' in column 1. + also ignore lines that start with a '.' */ + if (!scan_VHIL) + set_args_c ("vhil", '+', "v"); + while (ISVBORING (c)) + c = getc (input_FILE); + new_line = 1; + goto top; + } + if (c != '#') + goto next; + c = getc (input_FILE); + if (scan_VHIL && ISSPACE (c)) + { + while (ISVBORING (c)) + c = getc (input_FILE); + new_line = 1; + goto top; + } + while (ISBORING (c)) + c = getc (input_FILE); + if (!ISID1ST (c)) + goto next; + id = input_buffer; + *id++ = c; + while (ISIDREST (c = getc (input_FILE))) + *id++ = c; + *id = '\0'; + if (strequ (input_buffer, "include")) + { + while (c == ' ' || c == '\t') + c = getc (input_FILE); + if (c == '\n') + { + new_line = 1; + goto top; + } + id = input_buffer; + if (c == '"') + { + c = getc (input_FILE); + while (c != '\n' && c != EOF && c != '"') + { + *id++ = c; + c = getc (input_FILE); + } + *flags = TOK_STRING; + } + else if (c == '<') + { + c = getc (input_FILE); + while (c != '\n' && c != EOF && c != '>') + { + *id++ = c; + c = getc (input_FILE); + } + *flags = TOK_STRING; + } + else if (ISID1ST (c)) + { + *id++ = c; + while (ISIDREST (c = getc (input_FILE))) + *id++ = c; + *flags = TOK_NAME; + } + else + { + while (c != '\n' && c != EOF) + c = getc (input_FILE); + new_line = 1; + goto top; + } + while (c != '\n' && c != EOF) + c = getc (input_FILE); + new_line = 1; + *id = '\0'; + return input_buffer; + } + if (strnequ (input_buffer, "if", 2) + || strequ (input_buffer, "define") + || strequ (input_buffer, "elif") /* ansi C */ + || (scan_VHIL && strequ (input_buffer, "elsif")) + || strequ (input_buffer, "undef")) + goto next; + while ((c != '\n') && (c != EOF)) + c = getc (input_FILE); + new_line = 1; + goto top; + } + +next: + while (ISBORING (c)) + c = getc (input_FILE); + + switch (c) + { + case '"': + id = input_buffer; + *id++ = c = getc (input_FILE); + for (;;) + { + while (ISQ2BORING (c)) + *id++ = c = getc (input_FILE); + if (c == '\\') + { + *id++ = c = getc (input_FILE); + continue; + } + else if (c != '"') + goto next; + break; + } + *--id = '\0'; + id = input_buffer; + while (ISSTRKEEP (*id)) + id++; + if (*id || id == input_buffer) + { + c = getc (input_FILE); + goto next; + } + *flags = TOK_STRING; + if (eat_underscore && input_buffer[0] == '_' && input_buffer[1]) + return &input_buffer[1]; + else + return input_buffer; + + case '\'': + c = getc (input_FILE); + for (;;) + { + while (ISQ1BORING (c)) + c = getc (input_FILE); + if (c == '\\') + { + c = getc (input_FILE); + continue; + } + else if (c == '\'') + c = getc (input_FILE); + goto next; + } + + case '/': + c = getc (input_FILE); + if (c == '/') + { /* Cope with C++ comment */ + while (ISVBORING (c)) + c = getc (input_FILE); + new_line = 1; + goto top; + } + else if (c != '*') + goto next; + c = getc (input_FILE); + for (;;) + { + while (ISCBORING (c)) + c = getc (input_FILE); + c = getc (input_FILE); + if (c == '/') + { + c = getc (input_FILE); + goto next; + } + else if (ISEOF (c)) + { + new_line = 1; + return NULL; + } + } + + case '\n': + new_line = 1; + goto top; + + case '#': + if (!scan_VHIL) + { + /* Auto-recognize vhil when find a # in the middle of a line. */ + set_args_c ("vhil", '+', "v"); + } + c = getc (input_FILE); + while (ISVBORING (c)) + c = getc (input_FILE); + new_line = 1; + goto top; + default: + if (ISEOF (c)) + { + new_line = 1; + return NULL; + } + id = input_buffer; + *id++ = c; + if (ISID1ST (c)) + { + *flags = TOK_NAME; + while (ISIDREST (c = getc (input_FILE))) + *id++ = c; + } + else if (ISDIGIT (c)) + { + *flags = TOK_NUMBER; + while (ISNUMBER (c = getc (input_FILE))) + *id++ = c; + } + else + fprintf (stderr, "junk: `\\%3o'", c); + ungetc (c, input_FILE); + *id = '\0'; + *flags |= TOK_LITERAL; + return input_buffer; + } +} + +static void +set_ctype_c (char const *chars, int type) +{ + short *rct = &ctype_c[1]; + + while (*chars) + rct[*chars++] |= type; +} + +static void +clear_ctype_c (char const *chars, int type) +{ + short *rct = &ctype_c[1]; + + while (*chars) + rct[*chars++] &= ~type; +} + +static void +usage_c (char const *lang_name) +{ + fprintf (stderr, "Usage: %s does not accept %s scanner arguments\n", program_name, lang_name); + exit (1); +} + +static char document_c[] = "\ +The C scanner arguments take the form -Sc<arg>, where <arg>\n\ +is one of the following: (<cc> denotes one or more characters)\n\ + (+|-)u . . . . (Do|Don't) strip a leading `_' from ids in strings.\n\ + (+|-)s<cc> . . Allow <cc> in string ids, and (keep|ignore) those ids.\n\ + -v . . . . . . Skip vhil comments."; + +static void +set_args_c (char const *lang_name, int op, char const *arg) +{ + if (op == '?') + { + puts (document_c); + return; + } + switch (*arg++) + { + case 'u': + eat_underscore = (op == '+'); + break; + case 's': + if (op == '+') + set_ctype_c (arg, SK); + else + clear_ctype_c (arg, SK); + break; + case 'v': + set_ctype_c ("$", I1); + set_ctype_c ("#", VH); + set_ctype_c (" \t", WS); + scan_VHIL = 1; + break; + default: + if (lang_name) + usage_c (lang_name); + break; + } +} + +#undef I1 +#undef DG +#undef NM +#undef C1 +#undef C2 +#undef Q1 +#undef Q2 +#undef ES +#undef NL +#undef EF +#undef SK +#undef VH +#undef WS +#undef ISDIGIT +#undef ISNUMBER +#undef ISEOF +#undef ISID1ST +#undef ISIDREST +#undef ISSTRKEEP +#undef ISSPACE +#undef ISBORING +#undef ISCBORING +#undef ISVBORING +#undef ISQ1BORING +#undef ISQ2BORING + +/*************** Assembly ***************************************************/ + +#define I1 0x01 /* 1st char of an identifier [a-zA-Z_] */ +#define NM 0x02 /* digit [0-9a-fA-FxX] */ +#define NL 0x04 /* newline: \n */ +#define CM 0x08 /* assembler comment char: usually # or | */ +#define IG 0x10 /* ignore `identifiers' with these chars in them */ +#define C1 0x20 /* C comment introduction char: / */ +#define C2 0x40 /* C comment termination char: * */ +#define EF 0x80 /* EOF */ + +/* Assembly Language character classes */ +#define ISID1ST(c) ((rct)[c] & (I1)) +#define ISIDREST(c) ((rct)[c] & (I1|NM)) +#define ISNUMBER(c) ((rct)[c] & (NM)) +#define ISEOF(c) ((rct)[c] & (EF)) +#define ISCOMMENT(c) ((rct)[c] & (CM)) +#define ISBORING(c) (!((rct)[c] & (EF|NL|I1|NM|CM|C1))) +#define ISCBORING(c) (!((rct)[c] & (EF|NL))) +#define ISCCBORING(c) (!((rct)[c] & (EF|C2))) +#define ISIGNORE(c) ((rct)[c] & (IG)) + +static char ctype_asm[257] = +{ + EF, +/* 0 1 2 3 4 5 6 7 */ +/* ----- ----- ----- ----- ----- ----- ----- ----- */ +/*000*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*010*/ 0, 0, NL, 0, 0, 0, 0, 0, +/*020*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*030*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*040*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*050*/ 0, 0, C2, 0, 0, 0, 0, C1, +/*060*/ NM, NM, NM, NM, NM, NM, NM, NM, +/*070*/ NM, NM, 0, 0, 0, 0, 0, 0, +/*100*/ 0, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1, +/*110*/ I1, I1, I1, I1, I1|NM, I1, I1, I1, +/*120*/ I1, I1, I1, I1, I1, I1, I1, I1, +/*130*/ I1|NM, I1, I1, 0, 0, 0, 0, I1, +/*140*/ 0, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1, +/*150*/ I1, I1, I1, I1, I1|NM, I1, I1, I1, +/*160*/ I1, I1, I1, I1, I1, I1, I1, I1, +/*170*/ I1|NM, I1, I1, 0, 0, 0, 0, 0, + +}; + +static int cpp_on_asm = 1; + +/* + Grab the next identifier the assembly language + source file opened with the handle `input_FILE'. + This state machine is built for speed, not elegance. +*/ +static char const * +get_token_asm (FILE *input_FILE, int *flags) +{ + static char input_buffer[BUFSIZ]; + char *rct = &ctype_asm[1]; + int c; + char *id = input_buffer; + static int new_line = 1; + +top: + c = getc (input_FILE); + if (cpp_on_asm > 0 && new_line) + { + new_line = 0; + if (c != '#') + goto next; + while (ISBORING (c)) + c = getc (input_FILE); + if (!ISID1ST (c)) + goto next; + id = input_buffer; + *id++ = c; + while (ISIDREST (c = getc (input_FILE))) + *id++ = c; + *id = '\0'; + if (strequ (input_buffer, "include")) + { + while (c != '"' && c != '<') + c = getc (input_FILE); + id = input_buffer; + *id++ = c = getc (input_FILE); + while ((c = getc (input_FILE)) != '"' && c != '>') + *id++ = c; + *id = '\0'; + *flags = TOK_STRING; + return input_buffer; + } + if (strnequ (input_buffer, "if", 2) + || strequ (input_buffer, "define") + || strequ (input_buffer, "undef")) + goto next; + while (c != '\n') + c = getc (input_FILE); + new_line = 1; + goto top; + } + +next: + while (ISBORING (c)) + c = getc (input_FILE); + + if (ISCOMMENT (c)) + { + while (ISCBORING (c)) + c = getc (input_FILE); + new_line = 1; + } + + if (ISEOF (c)) + { + new_line = 1; + return NULL; + } + + if (c == '\n') + { + new_line = 1; + goto top; + } + + if (c == '/') + { + if ((c = getc (input_FILE)) != '*') + goto next; + c = getc (input_FILE); + for (;;) + { + while (ISCCBORING (c)) + c = getc (input_FILE); + c = getc (input_FILE); + if (c == '/') + { + c = getc (input_FILE); + break; + } + else if (ISEOF (c)) + { + new_line = 1; + return NULL; + } + } + goto next; + } + + id = input_buffer; + if (eat_underscore && c == '_' && !ISID1ST (c = getc (input_FILE))) + { + ungetc (c, input_FILE); + return "_"; + } + *id++ = c; + if (ISID1ST (c)) + { + *flags = TOK_NAME; + while (ISIDREST (c = getc (input_FILE))) + *id++ = c; + } + else if (ISNUMBER (c)) + { + *flags = TOK_NUMBER; + while (ISNUMBER (c = getc (input_FILE))) + *id++ = c; + } + else + { + if (isprint (c)) + fprintf (stderr, "junk: `%c'", c); + else + fprintf (stderr, "junk: `\\%03o'", c); + goto next; + } + + *id = '\0'; + for (id = input_buffer; *id; id++) + if (ISIGNORE (*id)) + goto next; + ungetc (c, input_FILE); + *flags |= TOK_LITERAL; + return input_buffer; +} + +static void +set_ctype_asm (char const *chars, int type) +{ + char *rct = &ctype_asm[1]; + + while (*chars) + rct[*chars++] |= type; +} + +static void +clear_ctype_asm (char const *chars, int type) +{ + char *rct = &ctype_asm[1]; + + while (*chars) + rct[*chars++] &= ~type; +} + +static void +usage_asm (char const *lang_name) +{ + fprintf (stderr, "Usage: %s -S%s([-c<cc>] [-u] [(+|-)a<cc>] [(+|-)p] [(+|-)C])\n", program_name, lang_name); + exit (1); +} + +static char document_asm[] = "\ +The Assembler scanner arguments take the form -Sasm<arg>, where\n\ +<arg> is one of the following: (<cc> denotes one or more characters)\n\ + -c<cc> . . . . <cc> introduce(s) a comment until end-of-line.\n\ + (+|-)u . . . . (Do|Don't) strip a leading `_' from ids.\n\ + (+|-)a<cc> . . Allow <cc> in ids, and (keep|ignore) those ids.\n\ + (+|-)p . . . . (Do|Don't) handle C-preprocessor directives.\n\ + (+|-)C . . . . (Do|Don't) handle C-style comments. (/* */)"; + +static void +set_args_asm (char const *lang_name, int op, char const *arg) +{ + if (op == '?') + { + puts (document_asm); + return; + } + switch (*arg++) + { + case 'a': + set_ctype_asm (arg, I1 | ((op == '-') ? IG : 0)); + break; + case 'c': + set_ctype_asm (arg, CM); + break; + case 'u': + eat_underscore = (op == '+'); + break; + case 'p': + cpp_on_asm = (op == '+'); + break; + case 'C': + if (op == '+') + { + set_ctype_asm ("/", C1); + set_ctype_asm ("*", C2); + } + else + { + clear_ctype_asm ("/", C1); + clear_ctype_asm ("*", C2); + } + break; + default: + if (lang_name) + usage_asm (lang_name); + break; + } +} + +#undef I1 +#undef NM +#undef NL +#undef CM +#undef IG +#undef C1 +#undef C2 +#undef EF +#undef ISID1ST +#undef ISIDREST +#undef ISNUMBER +#undef ISEOF +#undef ISCOMMENT +#undef ISBORING +#undef ISCBORING +#undef ISCCBORING +#undef ISIGNORE + +/*************** Text *******************************************************/ + +#define I1 0x01 /* 1st char of an identifier [a-zA-Z_] */ +#define NM 0x02 /* digit [0-9a-fA-FxX] */ +#define SQ 0x04 /* squeeze these out (.,',-) */ +#define EF 0x80 /* EOF */ + +/* Text character classes */ +#define ISID1ST(c) ((rct)[c] & (I1)) +#define ISIDREST(c) ((rct)[c] & (I1|NM|SQ)) +#define ISNUMBER(c) ((rct)[c] & (NM)) +#define ISEOF(c) ((rct)[c] & (EF)) +#define ISBORING(c) (!((rct)[c] & (I1|NM|EF))) +#define ISIDSQUEEZE(c) ((rct)[c] & (SQ)) + +static char ctype_text[257] = +{ + EF, +/* 0 1 2 3 4 5 6 7 */ +/* ----- ----- ----- ----- ----- ----- ----- ----- */ +/*000*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*010*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*020*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*030*/ 0, 0, 0, 0, 0, 0, 0, 0, +/*040*/ 0, 0, 0, 0, 0, 0, 0, SQ, +/*050*/ 0, 0, 0, 0, 0, SQ, SQ, 0, +/*060*/ NM, NM, NM, NM, NM, NM, NM, NM, +/*070*/ NM, NM, 0, 0, 0, 0, 0, 0, +/*100*/ 0, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1, +/*110*/ I1, I1, I1, I1, I1|NM, I1, I1, I1, +/*120*/ I1, I1, I1, I1, I1, I1, I1, I1, +/*130*/ I1|NM, I1, I1, 0, 0, 0, 0, I1, +/*140*/ 0, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1|NM, I1, +/*150*/ I1, I1, I1, I1, I1|NM, I1, I1, I1, +/*160*/ I1, I1, I1, I1, I1, I1, I1, I1, +/*170*/ I1|NM, I1, I1, 0, 0, 0, 0, 0, +}; + +/* + Grab the next identifier the text source file opened with the + handle `input_FILE'. This state machine is built for speed, not + elegance. +*/ +static char const * +get_token_text (FILE *input_FILE, int *flags) +{ + static char input_buffer[BUFSIZ]; + char *rct = &ctype_text[1]; + int c; + char *id = input_buffer; + +top: + c = getc (input_FILE); + while (ISBORING (c)) + c = getc (input_FILE); + if (ISEOF (c)) + return NULL; + id = input_buffer; + *id++ = c; + if (ISID1ST (c)) + { + *flags = TOK_NAME; + while (ISIDREST (c = getc (input_FILE))) + if (!ISIDSQUEEZE (c)) + *id++ = c; + } + else if (ISNUMBER (c)) + { + *flags = TOK_NUMBER; + while (ISNUMBER (c = getc (input_FILE))) + *id++ = c; + } + else + { + if (isprint (c)) + fprintf (stderr, "junk: `%c'", c); + else + fprintf (stderr, "junk: `\\%03o'", c); + goto top; + } + + *id = '\0'; + ungetc (c, input_FILE); + *flags |= TOK_LITERAL; + return input_buffer; +} + +static void +set_ctype_text (char const *chars, int type) +{ + char *rct = &ctype_text[1]; + + while (*chars) + rct[*chars++] |= type; +} + +static void +clear_ctype_text (char const *chars, int type) +{ + char *rct = &ctype_text[1]; + + while (*chars) + rct[*chars++] &= ~type; +} + +static void +usage_text (char const *lang_name) +{ + fprintf (stderr, "Usage: %s -S%s([(+|-)a<cc>] [(+|-)s<cc>]\n", program_name, lang_name); + exit (1); +} + +static char document_text[] = "\ +The Text scanner arguments take the form -Stext<arg>, where\n\ +<arg> is one of the following: (<cc> denotes one or more characters)\n\ + (+|-)a<cc> . . Include (or exculde) <cc> in ids.\n\ + (+|-)s<cc> . . Squeeze (or don't squeeze) <cc> out of ids."; + +static void +set_args_text (char const *lang_name, int op, char const *arg) +{ + if (op == '?') + { + puts (document_text); + return; + } + switch (*arg++) + { + case 'a': + if (op == '+') + set_ctype_text (arg, I1); + else + clear_ctype_text (arg, I1); + break; + case 's': + if (op == '+') + set_ctype_text (arg, SQ); + else + clear_ctype_text (arg, SQ); + break; + default: + if (lang_name) + usage_text (lang_name); + break; + } +} + +#undef I1 +#undef NM +#undef SQ +#undef EF +#undef ISID1ST +#undef ISIDREST +#undef ISNUMBER +#undef ISEOF +#undef ISBORING +#undef ISIDSQUEEZE |