diff options
author | Arnold D. Robbins <arnold@skeeve.com> | 2010-12-07 22:39:13 +0200 |
---|---|---|
committer | Arnold D. Robbins <arnold@skeeve.com> | 2010-12-07 22:39:13 +0200 |
commit | 071f0732801ed668956462649b9ee10cb6794599 (patch) | |
tree | 445faca91b94bd49493207a8cf45ca8dff3e59ed /test/dtdgport.awk | |
parent | 2ea264adbbefc2337b766f06d1ceb38cd798915b (diff) | |
download | egawk-071f0732801ed668956462649b9ee10cb6794599.tar.gz egawk-071f0732801ed668956462649b9ee10cb6794599.tar.bz2 egawk-071f0732801ed668956462649b9ee10cb6794599.zip |
Add a test file.
Diffstat (limited to 'test/dtdgport.awk')
-rw-r--r-- | test/dtdgport.awk | 325 |
1 files changed, 325 insertions, 0 deletions
# dtdgport.awk
# Reads the XML document named by the first command-line argument
# (ARGV[1]) and prints a DTD for this document to standard output.
# http://saxon.sourceforge.net/dtdgen.html
# JK 2004-10-09
# JK 2006-03-05
# JK 2007-08-05 Converted for portability from the original dtd_generator.awk

# The program makes an internal list of all the elements
# and attributes that appear in your document, noting how
# they are nested. Elements that never have child elements
# are declared as containing character data (#PCDATA).
#
# Global tables built while parsing:
#   name[depth]          -- name of the most recent start tag seen at that depth
#   child[parent, kid]   -- how often <kid> was seen directly inside <parent>
#   elem[tag]            -- how often <tag> was seen
#   attr[tag, attribute] -- how often <tag> carried that attribute

BEGIN {
    # The whole document is consumed here, event by event.  A parse error
    # ends the loop as well; its text is left in XMLERROR and is not reported.
    while (getXMLEVENT(ARGV[1])) {
        # Remember each element.
        if (XMLEVENT == "STARTELEM") {
            # Remember the parent names of each child node.
            name[XMLDEPTH] = XMLSTARTELEM
            if (XMLDEPTH > 1)
                child[name[XMLDEPTH-1], XMLSTARTELEM] ++
            # Count how often the element occurs.
            elem[XMLSTARTELEM] ++
            # Remember all the attributes with the element.
            for (a in XMLATTR)
                attr[XMLSTARTELEM,a] ++
        }
    }
}

END { print_elem(1, name[1]) }   # name[1] is the root

##
# print_elem( depth, element ):
#     Print the <!ELEMENT> and <!ATTLIST> declarations for one element,
#     then recurse into its child elements.  Each element is printed only
#     once, no matter how often or where it occurs.
# Parameters:
#     depth   -- nesting level, 1 for the root; selects the indentation
#     element -- element name to declare
# Reads the global tables child, attr and elem; records finished
# elements in the global table already_printed.
# All remaining parameters are local variables.
##
function print_elem(depth, element,    c, atn, chl, n, i, myChildren, indent, a) {
    if (already_printed[element]++)
        return
    # Two blanks of indentation per nesting level below the root.
    indent = sprintf("%*s", 2*depth-2, "")
    # Collect the names of all elements seen directly inside this one.
    myChildren = ""
    for (c in child) {
        split(c, chl, SUBSEP)
        if (element == chl[1]) {
            if (myChildren == "")
                myChildren = chl[2]
            else
                myChildren = myChildren " | " chl[2]
        }
    }
    # If an element has no child nodes, declare it as such.
    if (myChildren == "")
        print indent "<!ELEMENT", element , "( #PCDATA ) >"
    else
        print indent "<!ELEMENT", element , "(", myChildren, ")* >"
    # After the element name itself, list its attributes.
    for (a in attr) {
        split(a, atn, SUBSEP)
        # Treat only those attributes that belong to the current element.
        if (element == atn[1]) {
            # If an attribute occurred each time with its element, notice this.
            if (attr[element, atn[2]] == elem[element])
                print indent "<!ATTLIST", element, atn[2], "CDATA #REQUIRED>"
            else
                print indent "<!ATTLIST", element, atn[2], "CDATA #IMPLIED>"
        }
    }
    # Now go through the child nodes of this element and print them.
    # Turning the " | " separators into blanks lets the default field
    # splitting of split() recover the individual names.
    gsub(/[\|]/, " ", myChildren)
    n = split(myChildren, chl)
    for (i = 1; i <= n; i++) {
        print_elem(depth+1, chl[i])
        # Rebuild chl after the recursive call.  chl is declared local, so
        # this looks redundant; presumably kept for awk implementations
        # whose local arrays do not survive recursion -- left untouched.
        split(myChildren, chl)
    }
}

##
# getXMLEVENT( file ): # read next xml-data into XMLEVENT,XMLNAME,XMLATTR
#                      # referenced entities are not resolved
# Parameters:
#     file         -- path to xml file
# External variables:
#     XMLEVENT     -- type of item read: "STARTELEM"(tag), "ENDELEM"(end tag),
#                     "COMMENT"(comment), "CHARDATA"(data), "CDA"(CDATA section),
#                     "DEC"(other <! declaration), "PROCINST"(processing
#                     instruction), "STARTDOCT"/"ENDDOCT"(DOCTYPE begin/end)
#     XMLNAME      -- value of item, e.g. tagname if type is "STARTELEM" or "ENDELEM"
#     XMLSTARTELEM -- tag name, only set if XMLEVENT=="STARTELEM"
#     XMLENDELEM   -- same value as XMLNAME for every event that is closed by a
#                     terminator string (end tags, comments, CDATA, ...)
#     XMLATTR      -- Map of attributes, only set if XMLEVENT=="STARTELEM"
#     XMLPATH      -- Path to current tag, e.g. /TopLevelTag/SubTag1/SubTag2
#     XMLDEPTH     -- number of currently open elements
#     XMLROW       -- current line number in input file
#     XMLERROR     -- error text, set on parse error
#     $0, NF       -- on "STARTELEM" the attribute names become the fields of $0;
#                     if the input ends inside character data, that data is put
#                     into $0 and XMLNAME is left empty
# Returns:
#     1            on successful read: XMLEVENT, XMLNAME, XMLATTR are set accordingly
#     ""           at end of file or parse error, XMLERROR is set on error
# Private Data:
#     _XMLIO       -- buffer, XMLROW, XMLPATH, XMLDEPTH and DTD flag for open files
# All parameters after `file' are local variables.  `mode' is the state of
# the scanner; it always starts out empty because it is not kept between calls.
##
function getXMLEVENT( file            ,end,p,q,tag,att,accu,mline,mode,S0,ex,dtd) {
    XMLEVENT=XMLNAME=XMLERROR=XMLSTARTELEM=XMLENDELEM = ""
    split("", XMLATTR)                  # portable way to empty an array
    # Restore the per-file state saved by the previous call.
    S0 = _XMLIO[file,"S0"]
    XMLROW = _XMLIO[file,"line"];
    XMLPATH = _XMLIO[file,"path"];
    XMLDEPTH = _XMLIO[file,"depth"]+0;
    dtd = _XMLIO[file,"dtd"];
    while (!XMLEVENT) {
        # Refill the buffer with the next input line when it has run dry.
        if (S0 == "") {
            if (1 != (getline S0 < file))
                break;
            XMLROW ++;
            S0 = S0 RS;
        }
        if (mode == "") {
            # Start of a new item: decide what it is from its first characters.
            mline = XMLROW
            accu = ""
            p = substr(S0,1,1)
            if (p != "<" && !(dtd && p=="]"))
                mode = "CHARDATA"
            else if (p == "]") {
                # End of the internal DTD subset.
                S0 = substr(S0,2)
                mode = "ENDDOCT"
                end = ">"
                dtd = 0
            } else if ( substr(S0,1,4) == "<!--" ) {
                S0 = substr(S0,5)
                mode = "COMMENT"
                end = "-->"
            } else if ( substr(S0,1,9) == "<!DOCTYPE" ) {
                S0 = substr(S0,10)
                mode = "STARTDOCT"
                end = ">"
            } else if (substr(S0,1,9) == "<![CDATA[" ) {
                S0 = substr(S0,10)
                mode = "CDA"
                end = "]]>"
            } else if ( substr(S0,1,2) == "<!" ) {
                S0 = substr(S0,3)
                mode = "DEC"
                end = ">"
            } else if (substr(S0,1,2) == "<?") {
                S0 = substr(S0,3)
                mode = "PROCINST"
                end = "?>"
            } else if ( substr(S0,1,2)=="</" ) {
                S0 = substr(S0,3)
                mode = "ENDELEM"
                end = ">";
                tag = S0
                sub(/[ \n\r\t>].*$/,"",tag)
                S0 = substr(S0,length(tag)+1)
                # Pop the last path component; it must match the closing tag.
                ex = XMLPATH
                sub(/\/[^\/]*$/,"",XMLPATH)
                ex = substr(ex, length(XMLPATH)+2)
                if (tag != ex) {
                    XMLERROR = "unexpected close tag <" ex ">..</" tag ">"
                    break
                }
            } else {
                S0 = substr(S0,2)
                mode = "STARTELEM"
                tag = S0
                sub(/[ \n\r\t\/>].*$/,"",tag)
                S0 = substr(S0, length(tag)+1)
                if (tag !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) {   # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XMLERROR = "invalid tag name '" tag "'"
                    break
                }
                XMLPATH = XMLPATH "/" tag;
            }
        } else if (mode == "CHARDATA") {        # terminated by "<" or EOF
            p = index(S0, "<")
            # Inside a DTD subset a "]" ends the data as well.
            if (dtd && (q=index(S0,"]")) && (!p || q<p) )
                p = q
            if (p) {
                XMLEVENT = "CHARDATA"
                XMLNAME = accu unescapeXML(substr(S0, 1, p-1))
                S0 = substr(S0, p)
                mode = ""
            } else {
                accu = accu unescapeXML(S0)
                S0 = ""
            }
        } else if ( mode == "STARTELEM" ) {
            sub(/^[ \n\r\t]*/,"",S0)
            if (S0 == "")
                continue
            if (substr(S0, 1, 2) == "/>" ) {
                S0 = substr(S0, 3)
                mode = ""
                XMLEVENT = "STARTELEM"
                XMLNAME = XMLSTARTELEM = tag
                XMLDEPTH ++
                # Push a synthetic end tag back so that an empty element
                # still produces a matching ENDELEM event.
                S0 = "</" tag ">" S0
            } else if (substr(S0, 1, 1) == ">" ) {
                S0 = substr(S0, 2)
                mode = ""
                XMLEVENT = "STARTELEM"
                XMLNAME = XMLSTARTELEM = tag
                XMLDEPTH ++
            } else {
                att = S0
                sub(/[= \n\r\t\/>].*$/,"",att)
                S0 = substr(S0, length(att) + 1)
                mode = "ATTR"
                if (att !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) {   # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
                    XMLERROR = "invalid attribute name '" att "'"
                    break
                }
            }
        } else if (mode == "ATTR") {
            sub(/^[ \n\r\t]*/, "", S0)
            if (S0 == "")
                continue
            if (substr(S0,1,1) == "=" ) {
                S0 = substr(S0,2)
                mode = "EQ"
            } else {
                # Attribute without a value: it stands for itself.
                XMLATTR[att] = att
                mode = "STARTELEM"
            }
        } else if (mode == "EQ") {
            sub(/^[ \n\r\t]*/,"",S0)
            if (S0 == "")
                continue
            end = substr(S0,1,1)
            if (end == "\"" || end == "'") {
                S0 = substr(S0,2)
                accu = ""
                mode = "VALUE"
            } else {
                # Unquoted value: runs up to white space, "/" or ">".
                accu = S0
                sub(/[ \n\r\t\/>].*$/,"", accu)
                S0 = substr(S0, length(accu)+1)
                XMLATTR[att] = unescapeXML(accu)
                mode = "STARTELEM"
            }
        } else if (mode == "VALUE") {           # terminated by end
            if (p = index(S0, end)) {
                XMLATTR[att] = accu unescapeXML(substr(S0,1,p-1))
                S0 = substr(S0, p+length(end))
                mode = "STARTELEM"
            } else {
                accu = accu unescapeXML(S0)
                S0 = ""
            }
        } else if (mode == "STARTDOCT") {       # terminated by "[" or ">"
            if ((q = index(S0, "[")) && (!(p = index(S0,end)) || q<p )) {
                XMLEVENT = mode
                XMLNAME = accu substr(S0, 1, q-1)
                S0 = substr(S0, q+1)
                mode = ""
                dtd = 1
            } else if (p = index(S0,end)) {
                XMLEVENT = mode
                XMLNAME = accu substr(S0, 1, p-1)
                # No internal subset: fake its closing "]" so that the
                # next call reports ENDDOCT.
                S0 = "]" substr(S0, p)
                mode = ""
                dtd = 1
            } else {
                accu = accu S0
                S0 = ""
            }
        } else if (p = index(S0,end)) {         # terminated by end
            XMLEVENT = mode
            XMLNAME = XMLENDELEM = ( mode=="ENDELEM" ? tag : accu substr(S0,1,p-1))
            if (mode=="ENDELEM") XMLDEPTH --
            S0 = substr(S0, p+length(end))
            mode = ""
        } else {
            accu = accu S0
            S0 = ""
        }
    }
    # Save the per-file state for the next call.
    _XMLIO[file, "S0"] = S0;
    _XMLIO[file, "line"] = XMLROW;
    _XMLIO[file, "path"] = XMLPATH;
    _XMLIO[file, "depth"] = XMLDEPTH;
    _XMLIO[file, "dtd"] = dtd;
    # Input ended while character data was being collected.
    if (mode == "CHARDATA") {
        mode = ""
        if (accu != "")
            XMLEVENT = "CHARDATA"
        XMLNAME = ""
        $0 = accu
    }
    if (XMLEVENT) {
        if (XMLEVENT == "STARTELEM") {
            # Copy attributes into $0.
            NF = 0
            for (ex in XMLATTR) {
                NF ++
                $NF = ex
            }
        }
        return 1
    }
    # End of input or parse error: release the file and its saved state.
    closeXMLEVENT(file)
    if (XMLERROR)
        XMLERROR = file ":" XMLROW": " XMLERROR
    else if (mode)
        XMLERROR = file ":" mline ": " "unterminated " mode
    else if (XMLPATH)
        XMLERROR = file ":" XMLROW": " "unclosed tag(s) " XMLPATH
    return ""
} # func. getXMLEVENT

# unescape data and attribute values, used by getXMLEVENT
# Replaces the five predefined XML entities by the characters they stand
# for and returns the result.  "&amp;" has to come last, otherwise
# "&amp;lt;" would wrongly end up as "<" instead of "&lt;".
function unescapeXML(text) {
    gsub( "&apos;", "'",   text )
    gsub( "&quot;", "\"",  text )
    gsub( "&gt;",   ">",   text )
    gsub( "&lt;",   "<",   text )
    # A bare "&" in a gsub replacement means "the matched text";
    # "\\&" yields a literal ampersand.
    gsub( "&amp;",  "\\&", text )
    return text
}

# close xml file
# Closes `file' and forgets every piece of state kept for it in _XMLIO.
function closeXMLEVENT(file) {
    close(file);
    delete _XMLIO[file,"S0"]
    delete _XMLIO[file,"line"]
    delete _XMLIO[file,"path"];
    delete _XMLIO[file,"depth"];
    delete _XMLIO[file,"dtd"]
    delete _XMLIO[file,"open"]
    delete _XMLIO[file,"IND"]
}