diff options
author | Kaz Kylheku <kaz@kylheku.com> | 2013-10-05 10:01:24 -0700 |
---|---|---|
committer | Kaz Kylheku <kaz@kylheku.com> | 2013-10-05 10:01:24 -0700 |
commit | 33c2ad9765e7dc34b9c645b304cfd51524056d9e (patch) | |
tree | d79c8bf99d16404788b65ade7f322dba9d380343 /hc.c | |
download | hc-33c2ad9765e7dc34b9c645b304cfd51524056d9e.tar.gz hc-33c2ad9765e7dc34b9c645b304cfd51524056d9e.tar.bz2 hc-33c2ad9765e7dc34b9c645b304cfd51524056d9e.zip |
HTML cleaner utility.
Diffstat (limited to 'hc.c')
-rw-r--r-- | hc.c | 265 |
1 files changed, 265 insertions, 0 deletions
@@ -0,0 +1,265 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "hc.h" + + +static int allowed_el_spec[] = { + tok_el_a, + tok_el_abbr, + tok_el_acronym, + tok_el_address, + /* tok_el_applet, */ + /* tok_el_area, */ + tok_el_b, + /* tok_el_base, */ + tok_el_basefont, + tok_el_bdo, + tok_el_big, + tok_el_blockquote, + /* tok_el_body, */ + tok_el_br, + /* tok_el_button, */ + tok_el_caption, + tok_el_center, + tok_el_cite, + tok_el_code, + tok_el_col, + tok_el_colgroup, + tok_el_dd, + tok_el_del, + tok_el_dfn, + tok_el_dir, + tok_el_div, + tok_el_dl, + tok_el_dt, + tok_el_em, + /* tok_el_fieldset, */ + tok_el_font, + tok_el_form, + /* tok_el_frame, */ + /* tok_el_frameset, */ + tok_el_h1, + tok_el_h2, + tok_el_h3, + tok_el_h4, + tok_el_h5, + tok_el_h6, + /* tok_el_head, */ + tok_el_hr, + /* tok_el_html, */ + tok_el_i, + /* tok_el_iframe, */ + tok_el_img, + /* tok_el_input, */ + tok_el_ins, + tok_el_kbd, + /* tok_el_label, */ + /* tok_el_legend, */ + tok_el_li, + /* tok_el_link, */ + /* tok_el_map, */ + /* tok_el_menu, */ + /* tok_el_meta, */ + /* tok_el_noframes, */ + /* tok_el_noscript, */ + /* tok_el_object, */ + tok_el_ol, + tok_el_optgroup, + /* tok_el_option, */ + tok_el_p, + /* tok_el_param, */ + tok_el_pre, + tok_el_q, + tok_el_samp, + /* tok_el_script,*/ + /* tok_el_select, */ + tok_el_small, + tok_el_span, + tok_el_strike, + tok_el_strong, + tok_el_style, + tok_el_sub, + tok_el_sup, + tok_el_table, + tok_el_tbody, + tok_el_td, + /* tok_el_textarea, */ + tok_el_tfoot, + tok_el_th, + tok_el_thead, + /* tok_el_title, */ + tok_el_tr, + tok_el_tt, + tok_el_u, + tok_el_ul, + /* tok_el_var, */ + tok_eof, +}; + +static int allowed_el[tok_max]; + +static const token_t blank; +static token_t pushback; + +static void bail() +{ + fprintf(stderr, "bad html\n"); + exit(EXIT_FAILURE); +} + +static token_t mktok(toktype_t type, char *text) +{ + token_t tok = { 0, 0, 0, 0 }; + tok.type = type; + tok.lexeme = strdup(text); + return tok; +} + +static void deltok(token_t tok) +{ + free(tok.lexeme); +} + +static int null(token_t tok) +{ + return tok.type == tok_eof; +} + +static token_t gettok(void) +{ + if (null(pushback)) { + int type = yylex(); + return mktok(type, yytext); + } else { + token_t tok = pushback; + pushback = blank; + return tok; + } +} + +static void ungettok(token_t tok) +{ + deltok(pushback); + pushback = tok; +} + +static token_t printtok(token_t tok) +{ + if (!null(tok)) + fputs(tok.lexeme, stdout); + return tok; +} + +static token_t match(int type) +{ + token_t tok = gettok(); + if (tok.type != type) + bail(); + return tok; +} + +static token_t optmatch(int type) +{ + token_t tok = gettok(); + if (tok.type != type) { + ungettok(tok); + return blank; + } + return tok; +} + +static token_t lookfor(int type) +{ + token_t tok; + for (;;) { + tok = gettok(); + if (tok.type == type || null(tok)) + break; + deltok(tok); + } + return tok; +} + +static token_t printuntil(int type) +{ + token_t tok; + for (;;) { + tok = gettok(); + if (tok.type == type || null(tok)) + break; + deltok(printtok(tok)); + } + printtok(tok); + return tok; +} + +static void parse_element(token_t in) +{ + token_t end = optmatch('/'); + token_t name = gettok(); + + switch (name.type) { + case '/': + if (!null(end)) + bail(); + printtok(in); + printtok(name); + deltok(printtok(lookfor('>'))); + goto out; + return; + case '>': + bail(); + default: + break; + } + + if (allowed_el[name.type]) { + printtok(in); + printtok(end); + printtok(name); + deltok(printuntil('>')); + } else { + deltok(lookfor('>')); + } + +out: + deltok(end); + deltok(name); +} + +static void parse(void) +{ + for (;;) { + token_t tok = gettok(); + + switch (tok.type) { + case '<': + parse_element(tok); + break; + case tok_eof: + deltok(tok); + return; + default: + printtok(tok); + break; + } + + deltok(tok); + } +} + +static void init(void) +{ + int i; + + for (i = 0; allowed_el_spec[i] != tok_eof; i++) + allowed_el[allowed_el_spec[i]] = 1; +} + +int main(void) +{ + init(); + parse(); + return 0; +} |