aboutsummaryrefslogtreecommitdiffstats
path: root/test/dtdgport.awk
diff options
context:
space:
mode:
authorArnold D. Robbins <arnold@skeeve.com>2010-12-07 22:39:13 +0200
committerArnold D. Robbins <arnold@skeeve.com>2010-12-07 22:39:13 +0200
commit071f0732801ed668956462649b9ee10cb6794599 (patch)
tree445faca91b94bd49493207a8cf45ca8dff3e59ed /test/dtdgport.awk
parent2ea264adbbefc2337b766f06d1ceb38cd798915b (diff)
downloadegawk-071f0732801ed668956462649b9ee10cb6794599.tar.gz
egawk-071f0732801ed668956462649b9ee10cb6794599.tar.bz2
egawk-071f0732801ed668956462649b9ee10cb6794599.zip
Add a test file.
Diffstat (limited to 'test/dtdgport.awk')
-rw-r--r--test/dtdgport.awk325
1 files changed, 325 insertions, 0 deletions
diff --git a/test/dtdgport.awk b/test/dtdgport.awk
new file mode 100644
index 00000000..42b9545b
--- /dev/null
+++ b/test/dtdgport.awk
@@ -0,0 +1,325 @@
+# dtdgport.awk
+# Reads the XML document named by the first command-line argument and
+# prints a DTD for this document to standard output.
+# http://saxon.sourceforge.net/dtdgen.html
+# JK 2004-10-09
+# JK 2006-03-05
+# JK 2007-08-05 Converted for portability from the original dtd_generator.awk
+
+# The program makes an internal list of all the elements
+# and attributes that appear in your document, noting how
+# they are nested, and noting which elements contain
+# character data.
+
+# Pass 1 (BEGIN): parse the file named by ARGV[1] with getXMLEVENT and
+# record, for every start tag, its parent element, how often the element
+# occurs and which attributes it carries. Results are kept in the global
+# tables name, child, elem and attr.
+BEGIN {
+  while (getXMLEVENT(ARGV[1])) {
+    # Remember each element.
+    if ( XMLEVENT == "STARTELEM" ) {
+      # Remember the parent names of each child node.
+      name[XMLDEPTH] = XMLSTARTELEM
+      if (XMLDEPTH>1)
+        child[name[XMLDEPTH-1], XMLSTARTELEM] ++
+      # Count how often the element occurs.
+      elem[XMLSTARTELEM] ++
+      # Remember all the attributes with the element.
+      for (a in XMLATTR)
+        attr[XMLSTARTELEM,a] ++
+    }
+  }
+  # getXMLEVENT returns "" both at end of file and on a parse error;
+  # XMLERROR is what tells the two apart. Report it on standard error
+  # instead of silently emitting a DTD for a partially read document.
+  if (XMLERROR != "") {
+    print XMLERROR | "cat 1>&2"
+    close("cat 1>&2")
+  }
+}
+
+# Pass 2 (END): print the declarations, starting at the outermost element.
+END { print_elem(1, name[1]) } # name[1] is the root
+
+# Print one element (including sub-elements) but only once.
+#   depth   -- nesting level of the element, 1 for the root; sets the indentation
+#   element -- name of the element to declare
+# Every further name in the parameter list is a local variable. The function
+# reads the global tables child, attr and elem and records the elements it
+# has handled in the global table already_printed.
+function print_elem(depth, element, c, atn, chl, n, i, myChildren, indent, a) {
+  if (already_printed[element]++)
+    return
+  # indent is a local: the function is recursive and must not overwrite
+  # a global of the same name.
+  indent=sprintf("%*s", 2*depth-2, "")
+  # Collect the names of all elements seen directly below this one.
+  myChildren=""
+  for (c in child) {
+    split(c, chl, SUBSEP)
+    if (element == chl[1]) {
+      if (myChildren=="")
+        myChildren = chl[2]
+      else
+        myChildren = myChildren " | " chl[2]
+    }
+  }
+  # If an element has no child nodes, declare it as such.
+  if (myChildren=="")
+    print indent "<!ELEMENT", element , "( #PCDATA ) >"
+  else
+    print indent "<!ELEMENT", element , "(", myChildren, ")* >"
+  # After the element name itself, list its attributes.
+  for (a in attr) {
+    split(a, atn, SUBSEP)
+    # Treat only those attributes that belong to the current element.
+    if (element == atn[1]) {
+      # If an attribute occurred each time with its element, notice this.
+      if (attr[element, atn[2]] == elem[element])
+        print indent "<!ATTLIST", element, atn[2], "CDATA #REQUIRED>"
+      else
+        print indent "<!ATTLIST", element, atn[2], "CDATA #IMPLIED>"
+    }
+  }
+  # Now go through the child nodes of this element and print them.
+  gsub(/[\|]/, " ", myChildren)
+  n=split(myChildren, chl)
+  for(i=1; i<=n; i++) {
+    print_elem(depth+1, chl[i])
+    # Rebuild the child list after the recursive call, as the original
+    # does (presumably a safeguard for awks that mishandle local arrays
+    # across recursion).
+    split(myChildren, chl)
+  }
+}
+
+##
+# getXMLEVENT( file ): # read next xml-data into XMLEVENT,XMLNAME,XMLATTR
+# # only the five predefined entities are decoded (see unescapeXML)
+# Parameters:
+# file -- path to xml file; all further names in the parameter list are locals
+# External variables:
+# XMLEVENT -- type of item read: "STARTELEM"(tag), "ENDELEM"(end tag), "COMMENT"(comment), "CHARDATA"(data),
+# "STARTDOCT"/"ENDDOCT"(<!DOCTYPE ...>), "CDA"(<![CDATA[ ]]>), "DEC"(other <! >), "PROCINST"(<? ?>)
+# XMLNAME -- value of item, e.g. tagname if type is "STARTELEM" or "ENDELEM"; XMLSTARTELEM holds the same name on "STARTELEM"
+# XMLATTR -- Map of attributes, only set if XMLEVENT=="STARTELEM" (the attribute names are also copied into $0)
+# XMLPATH -- Path to current tag, e.g. /TopLevelTag/SubTag1/SubTag2; XMLDEPTH is the nesting depth (1 at the outermost tag)
+# XMLROW -- number of lines read so far from the input file
+# XMLERROR -- error text, set on parse error, prefixed with file name and line number
+# Returns:
+# 1 on successful read: XMLEVENT, XMLNAME, XMLATTR are set accordingly
+# "" at end of file or parse error, XMLERROR is set on error; the file is closed and its saved state dropped
+# Private Data:
+# _XMLIO -- per-file state kept between calls: unparsed buffer "S0", "line", "path", "depth", "dtd"
+##
+
+function getXMLEVENT( file ,end,p,q,tag,att,accu,mline,mode,S0,ex,dtd) {
+ XMLEVENT=XMLNAME=XMLERROR=XMLSTARTELEM=XMLENDELEM = "" # reset everything reported by the previous call
+ split("", XMLATTR) # empties the array
+ S0 = _XMLIO[file,"S0"] # unparsed remainder of the current line, saved by the previous call
+ XMLROW = _XMLIO[file,"line"];
+ XMLPATH = _XMLIO[file,"path"];
+ XMLDEPTH=_XMLIO[file,"depth"]+0; # +0 turns the unset value of a first call into 0
+ dtd = _XMLIO[file,"dtd"]; # true between a STARTDOCT item and the "]" that ends it
+ while (!XMLEVENT) { # loop until one complete item has been recognized
+ if (S0 == "") { # buffer used up: fetch the next line
+ if (1 != (getline S0 < file)) # end of file or read error
+ break;
+ XMLROW ++;
+ S0 = S0 RS; # put back the record separator that getline removed
+ }
+ if (mode == "") { # not inside an item: classify it by its first characters
+ mline = XMLROW # line on which the item starts, used in the "unterminated" message
+ accu=""
+ p = substr(S0,1,1)
+ if (p != "<" && !(dtd && p=="]")) # "]" counts as markup only while dtd is set
+ mode="CHARDATA"
+ else if (p == "]") { # end of the DOCTYPE, real or synthesized by the STARTDOCT branch
+ S0 = substr(S0,2)
+ mode="ENDDOCT"
+ end=">"
+ dtd=0
+ } else if ( substr(S0,1,4) == "<!--" ) {
+ S0=substr(S0,5)
+ mode="COMMENT"
+ end="-->"
+ } else if ( substr(S0,1,9) == "<!DOCTYPE" ) {
+ S0 = substr(S0,10)
+ mode = "STARTDOCT"
+ end = ">"
+ } else if (substr(S0,1,9) == "<![CDATA[" ) {
+ S0 = substr(S0,10)
+ mode = "CDA"
+ end = "]]>"
+ } else if ( substr(S0,1,2) == "<!" ) { # any other <!...> declaration
+ S0 = substr(S0,3)
+ mode = "DEC"
+ end = ">"
+ } else if (substr(S0,1,2) == "<?") {
+ S0 = substr(S0,3)
+ mode = "PROCINST"
+ end = "?>"
+ } else if ( substr(S0,1,2)=="</" ) {
+ S0 = substr(S0,3)
+ mode = "ENDELEM"
+ end = ">";
+ tag = S0
+ sub(/[ \n\r\t>].*$/,"",tag) # keep only the tag name
+ S0 = substr(S0,length(tag)+1)
+ ex = XMLPATH
+ sub(/\/[^\/]*$/,"",XMLPATH) # drop the last path component
+ ex = substr(ex, length(XMLPATH)+2) # ex = the component just dropped, i.e. the innermost open tag
+ if (tag != ex) { # the close tag must name the innermost open element
+ XMLERROR = "unexpected close tag <" ex ">..</" tag ">"
+ break
+ }
+ } else { # plain "<": a start tag
+ S0 = substr(S0,2)
+ mode = "STARTELEM"
+ tag = S0
+ sub(/[ \n\r\t\/>].*$/,"",tag) # keep only the tag name
+ S0 = substr(S0, length(tag)+1)
+ if (tag !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
+ XMLERROR = "invalid tag name '" tag "'"
+ break
+ }
+ XMLPATH = XMLPATH "/" tag;
+ }
+ } else if (mode == "CHARDATA") { # terminated by "<" or EOF
+ p = index(S0, "<")
+ if (dtd && (q=index(S0,"]")) && (!p || q<p) ) # while dtd is set, an earlier "]" ends the text too
+ p = q
+ if (p) {
+ XMLEVENT = "CHARDATA"
+ XMLNAME = accu unescapeXML(substr(S0, 1, p-1))
+ S0 = substr(S0, p)
+ mode = ""
+ } else { # no terminator on this line: keep the text and read on
+ accu = accu unescapeXML(S0)
+ S0 = ""
+ }
+ } else if ( mode == "STARTELEM" ) { # inside a start tag, after the name or after an attribute
+ sub(/^[ \n\r\t]*/,"",S0)
+ if (S0 == "") # the tag continues on the next line
+ continue
+ if (substr(S0, 1, 2) == "/>" ) { # empty-element tag
+ S0 = substr(S0, 3)
+ mode = ""
+ XMLEVENT = "STARTELEM"
+ XMLNAME = XMLSTARTELEM = tag
+ XMLDEPTH ++
+ S0 = "</" tag ">" S0 # queue a synthetic close tag so that a later call reports the ENDELEM
+ } else if (substr(S0, 1, 1) == ">" ) {
+ S0 = substr(S0, 2)
+ mode = ""
+ XMLEVENT = "STARTELEM"
+ XMLNAME = XMLSTARTELEM = tag
+ XMLDEPTH ++
+ } else { # anything else starts an attribute name
+ att = S0
+ sub(/[= \n\r\t\/>].*$/,"",att)
+ S0 = substr(S0, length(att) + 1)
+ mode = "ATTR"
+ if (att !~ /^[A-Za-z:_][0-9A-Za-z:_.-]*$/ ) { # /^[[:alpha:]:_][[:alnum:]:_.-]*$/
+ XMLERROR = "invalid attribute name '" att "'"
+ break
+ }
+ }
+ } else if (mode == "ATTR") { # attribute name read, "=" expected
+ sub(/^[ \n\r\t]*/, "", S0)
+ if (S0 == "")
+ continue
+ if (substr(S0,1,1) == "=" ) {
+ S0 = substr(S0,2)
+ mode = "EQ"
+ } else {
+ XMLATTR[att] = att # attribute without "=": its own name is stored as the value
+ mode = "STARTELEM"
+ }
+ } else if (mode == "EQ") { # "=" read, value expected
+ sub(/^[ \n\r\t]*/,"",S0)
+ if (S0 == "")
+ continue
+ end = substr(S0,1,1) # an opening quote is also the terminator of the value
+ if (end == "\"" || end == "'") {
+ S0 = substr(S0,2)
+ accu = ""
+ mode = "VALUE"
+ } else { # unquoted value: runs up to whitespace, "/" or ">"
+ accu = S0
+ sub(/[ \n\r\t\/>].*$/,"", accu)
+ S0 = substr(S0, length(accu)+1)
+ XMLATTR[att] = unescapeXML(accu)
+ mode = "STARTELEM"
+ }
+ } else if (mode == "VALUE") { # terminated by end
+ if (p = index(S0, end)) {
+ XMLATTR[att] = accu unescapeXML(substr(S0,1,p-1))
+ S0 = substr(S0, p+length(end))
+ mode = "STARTELEM"
+ } else { # the value continues on the next line
+ accu = accu unescapeXML(S0)
+ S0=""
+ }
+ } else if (mode == "STARTDOCT") { # terminated by "[" or ">"
+ if ((q = index(S0, "[")) && (!(p = index(S0,end)) || q<p )) { # "[" comes first
+ XMLEVENT = mode
+ XMLNAME = accu substr(S0, 1, q-1)
+ S0 = substr(S0, q+1)
+ mode = ""
+ dtd = 1
+ } else if (p = index(S0,end)) { # ">" comes first, no "[" before it
+ XMLEVENT = mode
+ XMLNAME = accu substr(S0, 1, p-1)
+ S0 = "]" substr(S0, p) # insert a "]" so that the ">" is consumed as an ENDDOCT item
+ mode = ""
+ dtd = 1
+ } else {
+ accu = accu S0
+ S0 = ""
+ }
+ } else if (p = index(S0,end)) { # terminated by end
+ XMLEVENT = mode # reached for ENDDOCT, COMMENT, CDA, DEC, PROCINST and ENDELEM
+ XMLNAME = XMLENDELEM = ( mode=="ENDELEM" ? tag : accu substr(S0,1,p-1)) # XMLENDELEM is assigned for every one of these modes
+ if (mode=="ENDELEM") XMLDEPTH --
+ S0 = substr(S0, p+length(end))
+ mode = ""
+ } else { # terminator not on this line: keep the text and read on
+ accu = accu S0
+ S0 = ""
+ }
+ }
+ _XMLIO[file, "S0"] = S0; # save the parser state for the next call
+ _XMLIO[file, "line"] = XMLROW;
+ _XMLIO[file, "path"] = XMLPATH;
+ _XMLIO[file, "depth"] = XMLDEPTH;
+ _XMLIO[file, "dtd"] = dtd;
+ if (mode == "CHARDATA") { # the loop ended while text was still being collected
+ mode = ""
+ if (accu != "") # without braces this governs only the next line
+ XMLEVENT = "CHARDATA"
+ XMLNAME = "" # always executed
+ $0 = accu # always executed: here the collected text goes into $0, not into XMLNAME
+ }
+ if (XMLEVENT) {
+ if (XMLEVENT == "STARTELEM") {
+ # Copy attributes into $0.
+ NF=0 # start from an empty record
+ for (ex in XMLATTR) { # one field per attribute name
+ NF ++
+ $NF = ex
+ }
+ }
+ return 1
+ }
+ close(file); # no item: end of file or error, so release the file and forget its state
+ delete _XMLIO[file, "S0"];
+ delete _XMLIO[file, "line"];
+ delete _XMLIO[file, "path"];
+ delete _XMLIO[file, "depth"];
+ delete _XMLIO[file, "dtd"];
+ if (XMLERROR)
+ XMLERROR = file ":" XMLROW": " XMLERROR
+ else if (mode) XMLERROR=file ":" mline ": " "unterminated " mode # input ended inside an item
+ else if (XMLPATH) XMLERROR=file ":" XMLROW": " "unclosed tag(s) " XMLPATH # input ended with tags still open
+} # func. getXMLEVENT
+
+# Decode the five predefined XML entities in text and return the result.
+# getXMLEVENT applies this to character data and to attribute values.
+function unescapeXML(text) {
+  gsub(/&apos;/, "'", text)
+  gsub(/&quot;/, "\"", text)
+  gsub(/&gt;/, ">", text)
+  gsub(/&lt;/, "<", text)
+  # "&amp;" comes last so that an escaped ampersand such as "&amp;lt;"
+  # ends up as the literal text "&lt;" and is not decoded a second time.
+  # In a gsub replacement a bare "&" stands for the matched text, hence "\\&".
+  gsub(/&amp;/, "\\&", text)
+  return text
+}
+
+# Close an XML input file and discard everything remembered about it.
+#   file -- the path that was used as the key into _XMLIO
+function closeXMLEVENT(file) {
+  close(file)
+  # State that getXMLEVENT saves between calls.
+  delete _XMLIO[file, "S0"]
+  delete _XMLIO[file, "line"]
+  delete _XMLIO[file, "path"]
+  delete _XMLIO[file, "depth"]
+  delete _XMLIO[file, "dtd"]
+  # These two keys are not written anywhere in this file; deleting an
+  # element that does not exist has no effect.
+  delete _XMLIO[file, "open"]
+  delete _XMLIO[file, "IND"]
+}