summary refs log tree commit diff stats
path: root/hc.c
diff options
context:
space:
mode:
author Kaz Kylheku <kaz@kylheku.com> 2013-10-05 10:01:24 -0700
committer Kaz Kylheku <kaz@kylheku.com> 2013-10-05 10:01:24 -0700
commit 33c2ad9765e7dc34b9c645b304cfd51524056d9e (patch)
tree d79c8bf99d16404788b65ade7f322dba9d380343 /hc.c
download hc-33c2ad9765e7dc34b9c645b304cfd51524056d9e.tar.gz
hc-33c2ad9765e7dc34b9c645b304cfd51524056d9e.tar.bz2
hc-33c2ad9765e7dc34b9c645b304cfd51524056d9e.zip
HTML cleaner utility.
Diffstat (limited to 'hc.c')
-rw-r--r-- hc.c 265
1 files changed, 265 insertions, 0 deletions
diff --git a/hc.c b/hc.c
new file mode 100644
index 0000000..4c14e04
--- /dev/null
+++ b/hc.c
@@ -0,0 +1,265 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hc.h"
+
+
+/*
+ * Element token types whose tags are passed through to the output.
+ *
+ * init() walks this list and sets the matching entry of allowed_el for
+ * each value; parse_element() then echoes a tag only when its name token
+ * has its allowed_el entry set. The list is terminated by tok_eof, which
+ * init() uses as the end marker, so tok_eof itself can never be listed
+ * as an allowed element.
+ *
+ * Entries that are commented out are not in the table, so tags with
+ * those names are discarded by parse_element().
+ *
+ * The table is only ever read, hence const.
+ */
+static const int allowed_el_spec[] = {
+    tok_el_a,
+    tok_el_abbr,
+    tok_el_acronym,
+    tok_el_address,
+    /* tok_el_applet, */
+    /* tok_el_area, */
+    tok_el_b,
+    /* tok_el_base, */
+    tok_el_basefont,
+    tok_el_bdo,
+    tok_el_big,
+    tok_el_blockquote,
+    /* tok_el_body, */
+    tok_el_br,
+    /* tok_el_button, */
+    tok_el_caption,
+    tok_el_center,
+    tok_el_cite,
+    tok_el_code,
+    tok_el_col,
+    tok_el_colgroup,
+    tok_el_dd,
+    tok_el_del,
+    tok_el_dfn,
+    tok_el_dir,
+    tok_el_div,
+    tok_el_dl,
+    tok_el_dt,
+    tok_el_em,
+    /* tok_el_fieldset, */
+    tok_el_font,
+    tok_el_form,
+    /* tok_el_frame, */
+    /* tok_el_frameset, */
+    tok_el_h1,
+    tok_el_h2,
+    tok_el_h3,
+    tok_el_h4,
+    tok_el_h5,
+    tok_el_h6,
+    /* tok_el_head, */
+    tok_el_hr,
+    /* tok_el_html, */
+    tok_el_i,
+    /* tok_el_iframe, */
+    tok_el_img,
+    /* tok_el_input, */
+    tok_el_ins,
+    tok_el_kbd,
+    /* tok_el_label, */
+    /* tok_el_legend, */
+    tok_el_li,
+    /* tok_el_link, */
+    /* tok_el_map, */
+    /* tok_el_menu, */
+    /* tok_el_meta, */
+    /* tok_el_noframes, */
+    /* tok_el_noscript, */
+    /* tok_el_object, */
+    tok_el_ol,
+    tok_el_optgroup,
+    /* tok_el_option, */
+    tok_el_p,
+    /* tok_el_param, */
+    tok_el_pre,
+    tok_el_q,
+    tok_el_samp,
+    /* tok_el_script,*/
+    /* tok_el_select, */
+    tok_el_small,
+    tok_el_span,
+    tok_el_strike,
+    tok_el_strong,
+    tok_el_style,
+    tok_el_sub,
+    tok_el_sup,
+    tok_el_table,
+    tok_el_tbody,
+    tok_el_td,
+    /* tok_el_textarea, */
+    tok_el_tfoot,
+    tok_el_th,
+    tok_el_thead,
+    /* tok_el_title, */
+    tok_el_tr,
+    tok_el_tt,
+    tok_el_u,
+    tok_el_ul,
+    /* tok_el_var, */
+    tok_eof, /* end marker, must stay last */
+};
+
+static int allowed_el[tok_max]; /* Lookup table indexed by token type: init()
+                                 * sets an entry to 1 for every type listed in
+                                 * allowed_el_spec; parse_element() tests it. */
+/* All-zero token: it has static storage and no initializer, so every
+ * member is zero. It is assigned to pushback to mark the buffer empty and
+ * returned by optmatch() when the optional token is absent.
+ * NOTE(review): gettok() treats pushback as empty via null(), which
+ * compares the type with tok_eof; that presumably relies on tok_eof
+ * being 0 -- confirm against hc.h. */
+static const token_t blank;
+static token_t pushback; /* One-token pushback buffer: filled by ungettok(), drained by gettok(). */
+
+/*
+ * Report malformed input and terminate the program.
+ *
+ * Writes "bad html" to stderr and calls exit(EXIT_FAILURE), so it never
+ * returns to the caller. Declared with (void) so that the definition is
+ * a proper prototype rather than an old-style unprototyped declarator.
+ */
+static void bail(void)
+{
+    fprintf(stderr, "bad html\n");
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Build a token of the given type that owns a private copy of text.
+ *
+ * type: token type stored in the result.
+ * text: NUL-terminated lexeme. It is duplicated, so the caller keeps
+ *       ownership of the original buffer.
+ *
+ * Returns the new token with all other members zero. Its lexeme must be
+ * released with deltok().
+ *
+ * If the copy cannot be allocated, the failure is reported on stderr and
+ * the program exits with EXIT_FAILURE. Without this check a NULL lexeme
+ * would later be handed to fputs() by printtok().
+ */
+static token_t mktok(toktype_t type, char *text)
+{
+    token_t tok = { 0 }; /* zero every member, however many there are */
+    char *copy = strdup(text);
+
+    if (copy == NULL) {
+        fprintf(stderr, "out of memory\n");
+        exit(EXIT_FAILURE);
+    }
+
+    tok.type = type;
+    tok.lexeme = copy;
+    return tok;
+}
+
+/*
+ * Release the lexeme storage owned by tok.
+ *
+ * A token whose lexeme is a null pointer (such as blank) is accepted,
+ * because free() ignores null pointers. The token is passed by value, so
+ * the caller's copy still holds the now-dangling pointer afterwards.
+ */
+static void deltok(token_t tok)
+{
+    char *text = tok.lexeme;
+
+    free(text);
+}
+
+/*
+ * Tell whether tok is the end-of-input token.
+ *
+ * Returns 1 when the token's type is tok_eof and 0 otherwise. The same
+ * test is used by gettok() to decide whether pushback is empty.
+ */
+static int null(token_t tok)
+{
+    if (tok.type == tok_eof)
+        return 1;
+    return 0;
+}
+
+/*
+ * Fetch the next token.
+ *
+ * If a token is waiting in pushback, it is returned and pushback is
+ * reset to blank. Otherwise the scanner is run via yylex() and a new
+ * token is built from the returned type and the current yytext.
+ *
+ * The caller owns the returned token and must release it with deltok().
+ */
+static token_t gettok(void)
+{
+    token_t held;
+    int kind;
+
+    if (!null(pushback)) {
+        held = pushback;
+        pushback = blank;
+        return held;
+    }
+
+    /* yylex() must run before yytext is read, so keep them as two steps. */
+    kind = yylex();
+    return mktok(kind, yytext);
+}
+
+/*
+ * Push tok back so that the next gettok() returns it.
+ *
+ * Only one token can be held. Whatever pushback held before is released
+ * with deltok() and replaced. Ownership of tok moves to the pushback
+ * buffer.
+ */
+static void ungettok(token_t tok)
+{
+    token_t stale = pushback;
+
+    pushback = tok;
+    deltok(stale);
+}
+
+/*
+ * Write the lexeme of tok to stdout, unless tok is the end-of-input
+ * token, in which case nothing is written.
+ *
+ * Returns tok unchanged so that calls can be nested, for example
+ * deltok(printtok(tok)).
+ */
+static token_t printtok(token_t tok)
+{
+    if (null(tok))
+        return tok;
+
+    fputs(tok.lexeme, stdout);
+    return tok;
+}
+
+/*
+ * Read the next token and require it to be of the given type.
+ *
+ * Returns the token when the type matches; the caller must deltok() it.
+ * On a mismatch bail() is called, which terminates the program.
+ */
+static token_t match(int type)
+{
+    token_t got = gettok();
+
+    if (got.type == type)
+        return got;
+
+    bail();
+    return got; /* only reached if bail() were to return */
+}
+
+/*
+ * Read the next token if, and only if, it is of the given type.
+ *
+ * Returns the token when the type matches; the caller must deltok() it.
+ * Otherwise the token is pushed back with ungettok() and blank is
+ * returned.
+ */
+static token_t optmatch(int type)
+{
+    token_t got = gettok();
+
+    if (got.type == type)
+        return got;
+
+    ungettok(got);
+    return blank;
+}
+
+/*
+ * Skip tokens until one of the given type, or end of input, is read.
+ *
+ * Every skipped token is released with deltok() without being printed.
+ * Returns the token that stopped the search (the match or the
+ * end-of-input token); the caller must deltok() it.
+ */
+static token_t lookfor(int type)
+{
+    token_t tok = gettok();
+
+    while (tok.type != type && !null(tok)) {
+        deltok(tok);
+        tok = gettok();
+    }
+
+    return tok;
+}
+
+/*
+ * Echo tokens to stdout up to and including the first token of the
+ * given type, or up to end of input.
+ *
+ * Each token before the terminator is printed and released. The
+ * terminating token is printed too (printtok() writes nothing for the
+ * end-of-input token) and is returned; the caller must deltok() it.
+ */
+static token_t printuntil(int type)
+{
+    token_t tok;
+
+    for (tok = gettok(); tok.type != type && !null(tok); tok = gettok())
+        deltok(printtok(tok));
+
+    printtok(tok);
+    return tok;
+}
+
+/*
+ * Handle one tag, starting just after its opening '<'.
+ *
+ * in: the '<' token that opened the tag. It is only printed here; the
+ *     caller keeps ownership and releases it.
+ *
+ * An optional '/' is read first, then the token taken as the tag name:
+ *   - name is '>' (that is "<>" or "</>"): bail().
+ *   - name is '/': bail() if a '/' was already consumed; otherwise the
+ *     '<', the '/' and the closing '>' are printed and everything in
+ *     between is dropped.
+ *   - name's type is flagged in allowed_el: the '<', the optional '/',
+ *     the name and every following token through '>' are echoed.
+ *   - anything else: all tokens through '>' are discarded.
+ *
+ * NOTE(review): for an allowed element every token up to '>' is echoed
+ * as-is by printuntil(), so attributes are not filtered here.
+ */
+static void parse_element(token_t in)
+{
+    token_t end = optmatch('/');
+    token_t name = gettok();
+    int idx = (int) name.type;
+
+    switch (name.type) {
+    case '/':
+        if (!null(end))
+            bail();
+        printtok(in);
+        printtok(name);
+        deltok(printtok(lookfor('>')));
+        goto out;
+    case '>':
+        bail();
+        break; /* not reached: bail() exits */
+    default:
+        break;
+    }
+
+    /*
+     * The token type comes from the scanner, so check the range before
+     * using it as an index; an out-of-range type counts as not allowed.
+     */
+    if (idx >= 0 && idx < tok_max && allowed_el[idx]) {
+        printtok(in);
+        printtok(end);
+        printtok(name);
+        deltok(printuntil('>'));
+    } else {
+        deltok(lookfor('>'));
+    }
+
+out:
+    deltok(end);
+    deltok(name);
+}
+
+/*
+ * Main filter loop: read tokens until end of input.
+ *
+ * A '<' token starts a tag and is handed to parse_element(). Any other
+ * token except end-of-input is echoed unchanged. Every token read here
+ * is released here, including the final end-of-input token.
+ */
+static void parse(void)
+{
+    for (;;) {
+        token_t tok = gettok();
+
+        if (tok.type == tok_eof) {
+            deltok(tok);
+            return;
+        }
+
+        if (tok.type == '<')
+            parse_element(tok);
+        else
+            printtok(tok);
+
+        deltok(tok);
+    }
+}
+
+/*
+ * Populate allowed_el from allowed_el_spec.
+ *
+ * Walks the list up to its tok_eof end marker and sets the allowed_el
+ * entry for each listed token type to 1.
+ */
+static void init(void)
+{
+    const int *spec;
+
+    for (spec = allowed_el_spec; *spec != tok_eof; spec++)
+        allowed_el[*spec] = 1;
+}
+
+/*
+ * Program entry point: set up the allowed-element table, then filter
+ * the scanner's input to stdout.
+ *
+ * Returns 0 on success. If the output could not be written in full (a
+ * failed flush, or an error recorded on stdout by an earlier write), a
+ * message is printed to stderr and EXIT_FAILURE is returned, so that
+ * truncated output is not reported as success.
+ */
+int main(void)
+{
+    init();
+    parse();
+
+    if (fflush(stdout) != 0 || ferror(stdout)) {
+        fprintf(stderr, "write error\n");
+        return EXIT_FAILURE;
+    }
+
+    return 0;
+}