Adds DocBook output to HTML Tidy. The new -dbsgml and -dbxml options
install a user doctype, route printing through PrintSgml() instead of
PPrintTree(), and (for -dbxml) force the XML declaration.

Review notes: the line layout of this patch was rebuilt, so context
whitespace may need "patch -l". String literals tagged NOTE(review)
are assumed from context and should be confirmed.

--- /cise/tmp/ppadala/tidy/include/html.h	Sat Jun  1 02:50:02 2002
+++ include/html.h	Mon Jul  1 18:27:41 2002
@@ -653,6 +653,9 @@
                    Lexer *lexer, Node *node);
 void PPrintXMLTree(Out *fout, uint mode, uint indent,
                    Lexer *lexer, Node *node);
+void PrintSgml(Out *fout, uint mode, uint indent,
+                   Lexer *lexer, Node *node);
+Bool SetSgmlDocType(Lexer *lexer, Node *root);   /* defined in lexer.c */
 void PFlushLine(Out *out, uint indent);
 void PCondFlushLine(Out *out, uint indent);
 void PrintBody(Out *fout, Lexer *lexer, Node *root); /* Feature request #434940 - fix by Dave Raggett/Ignacio Vazquez-Abrams 21 Jun 01 */
@@ -908,6 +911,8 @@
 extern Bool XmlOut;
 extern Bool xHTML;
 extern Bool HtmlOut;        /* Yes means set explicitly. */
+extern Bool DbSgml;
+extern Bool DbXml;
 extern Bool XmlPi;          /* add <?xml?> */
 extern Bool XmlPIs;         /* assume PIs end with ?> as per XML */
 extern Bool XmlSpace;
--- /cise/tmp/ppadala/tidy/src/config.c	Sat Jun  1 02:50:02 2002
+++ src/config.c	Mon Jul  1 18:16:33 2002
@@ -81,6 +81,8 @@
 Bool XmlOut = no;           /* create output as XML */
 Bool xHTML = no;            /* output extensible HTML */
 Bool HtmlOut = no;          /* output plain-old HTML, even for XHTML input. Yes means set explicitly. */
+Bool DbSgml = no;           /* output docbook SGML */
+Bool DbXml = no;            /* output docbook XML */
 Bool XmlPi = no;            /* add <?xml?> for XML docs */
 Bool RawOut = no;           /* avoid mapping values > 127 to entities: not used for anything yet */
 Bool UpperCaseTags = no;    /* output tags in upper not lower case */
--- /cise/tmp/ppadala/tidy/src/lexer.c	Sat Jun  1 02:50:02 2002
+++ src/lexer.c	Mon Jul  1 17:17:04 2002
@@ -1674,6 +1674,43 @@
     return doctype;
 }
 
+/*
+  Gives the document a doctype whose text is the user supplied
+  doctype string, creating the doctype node when the input has none.
+  Always returns no, including after the doctype has been written.
+*/
+Bool SetSgmlDocType(Lexer *lexer, Node *root)
+{
+    char *fpi, *sysid;
+    Node *doctype;
+
+    /* without a user doctype string there is no identifier to write */
+    if (doctype_mode != doctype_user || !doctype_str)
+        return no;
+
+    fpi = doctype_str;
+    sysid = "";
+
+    doctype = FindDocType(root);
+    if (!doctype)   /* The html file doesn't contain doctype */
+        if ( !(doctype = NewXhtmlDocTypeNode( root )) )
+            return no;
+
+    lexer->txtstart = lexer->txtend = lexer->lexsize;
+
+    /* add public identifier */
+    AddStringLiteral(lexer, fpi);
+    /* add system identifier */
+    AddStringLiteral(lexer, sysid);
+
+    lexer->txtend = lexer->lexsize;
+
+    doctype->start = lexer->txtstart;
+    doctype->end = lexer->txtend;
+
+    return no;
+}
+
 Bool SetXHTMLDocType(Lexer *lexer, Node *root)
 {
     char *fpi, *sysid, *dtdsub, *name_space = XHTML_NAMESPACE;
--- /cise/tmp/ppadala/tidy/src/localize.c	Sat Jun  1 02:50:02 2002
+++ src/localize.c	Mon Jul  1 18:22:27 2002
@@ -1054,6 +1054,8 @@
     tidy_out(out, "  -asxml           to convert HTML to well formed XHTML\n");
     tidy_out(out, "  -asxhtml         to convert HTML to well formed XHTML\n");
     tidy_out(out, "  -ashtml          to force XHTML to well formed HTML\n");
+    tidy_out(out, "  -dbsgml          to convert HTML to Docbook SGML\n");
+    tidy_out(out, "  -dbxml           to convert HTML to Docbook XML\n");
     tidy_out(out, "  -slides          to burst into slides on H2 elements\n");
 
 /* TRT */
--- /cise/tmp/ppadala/tidy/src/parser.c	Sat Jun  1 02:50:02 2002
+++ src/parser.c	Sun Jun 30 17:01:59 2002
@@ -501,7 +501,7 @@
     TrimTrailingSpace(lexer, element, text);
 }
 
-static Bool DescendantOf(Node *element, Dict *tag)
+Bool DescendantOf(Node *element, Dict *tag)
 {
     Node *parent;
 
--- /cise/tmp/ppadala/tidy/src/pprint.c	Tue May  7 02:50:01 2002
+++ src/pprint.c	Mon Jul  1 18:28:58 2002
@@ -36,6 +36,13 @@
 static void PPrintPhp(Out *fout, uint indent,
                    Lexer *lexer, Node *node);
 
+/* Tag types to distinguish printing */
+typedef enum {
+    SgmlTagStart,
+    SgmlTagEnd
+}SgmlTagType;
+
+extern Bool DescendantOf(Node *element, Dict *tag);
 
 #define NORMAL        0
 #define PREFORMATTED  1
@@ -1766,6 +1773,593 @@
              content != null;
              content = content->next)
             PPrintTree(fout, null, 0, lexer, content);
+    }
+}
+
+/*
+  Reports a node (ASP, JSTE or PHP in PrintSgml) that has no DocBook
+  form by writing a fixed message to stderr. fout is not used.
+*/
+void PrintSgmlDefault(Out *fout)
+{
+    char *str = "SGML cannot contain these elements";
+
+    /* constant format, so the message is never parsed as one */
+    fprintf(stderr, "%s", str);
+}
+
+/* Writes the start tag of the document element. */
+void PrintSgmlBodyStart(Out *fout, uint indent)
+{
+    /* NOTE(review): tag name assumed from the "article" doctype - confirm */
+    char *str = "<article>";
+
+    PPrintString(fout, indent, str);
+}
+
+#define DIGIT(c) ((c) - '0')
+#define TOTAL_H 6
+
+/* seen_h[n] is yes while the section opened by <h(n+1)> is still open */
+static Bool seen_h[TOTAL_H] = {no, no, no, no, no, no};
+
+/*
+  Closes every open section with index >= from, deepest first, and
+  clears its entry in seen_h.
+*/
+static void CloseSgmlSections(Out *fout, uint indent, int from)
+{
+    char str[16];
+    int i;
+
+    for (i = TOTAL_H - 1; i >= from; --i)
+    {
+        if (seen_h[i] == yes)
+        {
+            sprintf(str, "</sect%d>", i + 1);
+            PPrintString(fout, indent, str);
+            seen_h[i] = no;
+        }
+    }
+}
+
+/* Closes the sections that are still open, then the document element. */
+void PrintSgmlBodyEnd(Out *fout, uint indent)
+{
+    CloseSgmlSections(fout, indent, 0);
+
+    /* NOTE(review): tag name assumed, mirrors PrintSgmlBodyStart - confirm */
+    PPrintString(fout, indent, "</article>");
+}
+
+/*
+  Returns a newly allocated id string for a heading, taken from its
+  first child: the text of a text node, or the name (failing that the
+  href) attribute of an <a>. A leading '#' is removed and every space
+  becomes '_'. Any other first child, or none, gives an empty string.
+  The caller frees the result with MemFree.
+*/
+char *GetContent(Lexer *lexer, Node *node)
+{
+    Node *content = node->content;
+    char *src = null;
+    char *str, *temp;
+    int size = 0;
+    Bool flag;
+
+    if (content != null && content->type == TextNode)
+    {
+        src = lexer->lexbuf + content->start;
+        size = content->end - content->start;
+    }
+    else if (content != null && content->tag == tag_a)
+    {
+        AttVal *name = GetAttrByName(content, "name");
+
+        if (name == null)
+            name = GetAttrByName(content, "href");
+
+        /* No href or name, let's take empty id */
+        if (name != null && name->value != null)
+        {
+            src = name->value;
+            size = wstrlen(name->value);
+        }
+    }
+
+    str = MemAlloc(size + 1);
+    if (size > 0)
+        wstrncpy(str, src, size);
+    str[size] = '\0';
+
+    /* shifting each char one place left drops the leading '#' */
+    flag = (str[0] == '#') ? yes : no;
+    for (temp = str; *temp; ++temp)
+    {
+        if (flag)
+            *temp = *(temp + 1);
+        if (*temp == ' ')
+            *temp = '_';
+    }
+    return str;
+}
+
+/*
+  Opens the section for heading node, using the digit of its name as
+  the section level and GetContent(node) as the section id.
+*/
+void PrintSectTag( Out *fout, uint indent, Lexer *lexer, Node *node)
+{
+    char sectnum = node->element[1];
+    char *id = GetContent(lexer, node);
+    /* sized from the id: a long heading must not overrun the buffer */
+    char *str = MemAlloc(wstrlen(id) + 32);
+
+    /* NOTE(review): format assumed from the arguments - confirm */
+    sprintf(str, "<sect%c id=\"%s\"><title>", sectnum, id);
+    PPrintString(fout, indent, str);
+    MemFree(str);
+    MemFree(id);
+}
+
+/* Returns yes when the parent's name is 'h' followed by one digit. */
+Bool ImmediateDescendantOfHTags(Node *element)
+{
+    Node *parent = element->parent;
+
+    /* no parent, or a parent without a name, is not a heading */
+    if (parent == null || parent->element == null)
+        return no;
+
+    if (strlen(parent->element) == 2 &&
+        parent->element[0] == 'h' &&
+        IsDigit(parent->element[1]))
+        return yes;
+    return no;
+}
+
+/* How PrintSgmlLink and PrintSgmlLinkEnd render an <a> element */
+typedef enum {
+    SgmlLinkNone,    /* nothing is written */
+    SgmlLinkPara,    /* <para id="..."> for a named anchor */
+    SgmlLinkInner,   /* <link linkend="..."> for an href starting with '#' */
+    SgmlLinkOuter    /* <ulink url="..."> for any other href */
+}SgmlLinkKind;
+
+/*
+  Classifies an <a> element and stores the text for its attribute in
+  *value. An anchor directly inside a heading, or one without a usable
+  name or href value, is SgmlLinkNone. name takes priority over href.
+*/
+static SgmlLinkKind GetSgmlLinkKind(Node *node, char **value)
+{
+    AttVal *addr;
+
+    *value = null;
+    if (ImmediateDescendantOfHTags(node))
+        return SgmlLinkNone;
+
+    addr = GetAttrByName(node, "name");
+    if (addr != null)
+    {
+        *value = addr->value;
+        return (addr->value != null) ? SgmlLinkPara : SgmlLinkNone;
+    }
+
+    addr = GetAttrByName(node, "href");
+    if (addr == null || addr->value == null)
+        return SgmlLinkNone;
+
+    if (addr->value[0] == '#')
+    {
+        *value = addr->value + 1;   /* linkend omits the '#' */
+        return SgmlLinkInner;
+    }
+    *value = addr->value;
+    return SgmlLinkOuter;
+}
+
+/* yes when a link outside any <p> directly follows a text node */
+static Bool SgmlLinkNeedsPara(Node *node)
+{
+    if (DescendantOf(node, tag_p))
+        return no;
+    return (node->prev && node->prev->type == TextNode) ? yes : no;
+}
+
+/* Prints head, value and tail as one string; no fixed size buffer */
+static void PrintSgmlAttrTag(Out *fout, uint indent,
+                             char *head, char *value, char *tail)
+{
+    char *str;
+
+    str = MemAlloc(wstrlen(head) + wstrlen(value) + wstrlen(tail) + 1);
+    sprintf(str, "%s%s%s", head, value, tail);
+    PPrintString(fout, indent, str);
+    MemFree(str);
+}
+
+/* Writes the DocBook start tag(s) chosen by GetSgmlLinkKind for an <a>. */
+void PrintSgmlLink(Out *fout, uint indent, Node *node)
+{
+    char *value;
+    SgmlLinkKind kind = GetSgmlLinkKind(node, &value);
+
+    if (kind == SgmlLinkPara)
+        PrintSgmlAttrTag(fout, indent, "<para id=\"", value, "\">");
+    else if (kind != SgmlLinkNone)
+    {
+        if (SgmlLinkNeedsPara(node))
+            PPrintString(fout, indent, "<para>");
+
+        if (kind == SgmlLinkInner)
+            PrintSgmlAttrTag(fout, indent, "<link linkend=\"", value, "\">");
+        else
+            PrintSgmlAttrTag(fout, indent, "<ulink url=\"", value, "\">");
+    }
+}
+
+/* Writes the end tag(s) matching what PrintSgmlLink wrote for node. */
+void PrintSgmlLinkEnd(Out *fout, uint indent, Node *node)
+{
+    char *value;
+    SgmlLinkKind kind = GetSgmlLinkKind(node, &value);
+
+    if (kind == SgmlLinkPara)
+        PPrintString(fout, indent, "</para>");
+    else if (kind != SgmlLinkNone)
+    {
+        if (kind == SgmlLinkInner)
+            PPrintString(fout, indent, "</link>");
+        else
+            PPrintString(fout, indent, "</ulink>");
+
+        if (SgmlLinkNeedsPara(node))
+            PPrintString(fout, indent, "</para>");
+    }
+}
+
+/*
+  Prints str, a start tag literal such as "<para>", as a start or an
+  end tag; for SgmlTagEnd a '/' is written after its first character.
+*/
+void PrintSgmlTagString(Out *fout, uint mode, uint indent,
+                   SgmlTagType sgmltag_type, char *str)
+{
+    PPrintChar(str[0], mode | CDATA);
+    if (sgmltag_type == SgmlTagEnd)
+        PPrintChar('/', mode);
+    PPrintString(fout, indent, str + 1);
+}
+
+/* Opens the DocBook list for <ul>, <ol> or <dl>. */
+void PrintSgmlList(Lexer *lexer, Out *fout,
+                   uint mode, uint indent,
+                   Node *node)
+{
+    if (node->tag == tag_ul)
+        PPrintString(fout, indent, "<itemizedlist>");
+    else if (node->tag == tag_ol)
+        PPrintString(fout, indent, "<orderedlist>");
+    else if (node->tag == tag_dl)
+        PPrintString(fout, indent, "<variablelist>");
+}
+
+/* Closes the DocBook list for <ul>, <ol> or <dl>. */
+void PrintSgmlListEnd(Lexer *lexer, Out *fout,
+                   uint mode, uint indent,
+                   Node *node)
+{
+    if (node->tag == tag_ul)
+        PPrintString(fout, indent, "</itemizedlist>");
+    else if (node->tag == tag_ol)
+        PPrintString(fout, indent, "</orderedlist>");
+    else if (node->tag == tag_dl)
+        PPrintString(fout, indent, "</variablelist>");
+}
+
+/* Opens the list item for <li> (with a <para>) or <dd>. */
+void PrintSgmlListItem(Out *fout, uint indent, Node *node)
+{
+    if (node->tag == tag_li)
+        PPrintString(fout, indent, "<listitem><para>");
+    else if (node->tag == tag_dd)
+        PPrintString(fout, indent, "<listitem>");
+}
+
+/* Closes the list item for <li> or <dd>; <dd> also ends its entry. */
+void PrintSgmlListItemEnd(Out *fout, uint indent, Node *node)
+{
+    if (node->tag == tag_li)
+        PPrintString(fout, indent, "</para></listitem>");
+    else if (node->tag == tag_dd)
+        PPrintString(fout, indent, "</listitem></varlistentry>");
+}
+
+/*
+  Writes the DocBook tag standing in for the start or end tag of an
+  HTML element. An element with no case below produces no output.
+*/
+void PrintSgmlTag( Out *fout, uint mode, uint indent, Lexer *lexer, Node *node,
+                   SgmlTagType sgmltag_type)
+{
+    if (node->tag == tag_html)
+    {
+        if (sgmltag_type == SgmlTagStart)
+            PrintSgmlBodyStart(fout, indent);
+        else if (sgmltag_type == SgmlTagEnd)
+            PrintSgmlBodyEnd(fout, indent);
+    }
+    else if (node->tag == tag_head)
+        PrintSgmlTagString(fout, mode, indent, sgmltag_type, "<articleinfo>");
+    else if (node->tag == tag_title)
+        PrintSgmlTagString(fout, mode, indent, sgmltag_type, "<title>");
+    /* Maybe this could test node->tag->model & CM_LIST instead */
+    else if (node->tag == tag_ul || node->tag == tag_ol ||
+             node->tag == tag_dl)
+    {
+        if (sgmltag_type == SgmlTagStart)
+            PrintSgmlList(lexer, fout, mode, indent, node);
+        else if (sgmltag_type == SgmlTagEnd)
+            PrintSgmlListEnd(lexer, fout, mode, indent, node);
+    }
+    else if (node->tag == tag_dt)
+    {
+        if (sgmltag_type == SgmlTagStart)
+            PPrintString(fout, indent, "<varlistentry><term>");
+        else if (sgmltag_type == SgmlTagEnd)
+            PPrintString(fout, indent, "</term>");
+    }
+    else if (node->tag == tag_li || node->tag == tag_dd)
+    {
+        if (sgmltag_type == SgmlTagStart)
+            PrintSgmlListItem(fout, indent, node);
+        else if (sgmltag_type == SgmlTagEnd)
+            PrintSgmlListItemEnd(fout, indent, node);
+    }
+    else if (node->tag == tag_p)
+        PrintSgmlTagString(fout, mode, indent, sgmltag_type, "<para>");
+    else if (node->tag == tag_blockquote)
+    {
+        if (sgmltag_type == SgmlTagStart)
+            PPrintString(fout, indent, "<blockquote><para>");
+        else if (sgmltag_type == SgmlTagEnd)
+            PPrintString(fout, indent, "</para></blockquote>");
+    }
+    else if (node->tag == tag_pre)
+        PrintSgmlTagString(fout, mode, indent, sgmltag_type,
+                           "<programlisting>");
+    else if (node->tag == tag_a)
+    {
+        if (sgmltag_type == SgmlTagStart)
+            PrintSgmlLink(fout, indent, node);
+        else if (sgmltag_type == SgmlTagEnd)
+            PrintSgmlLinkEnd(fout, indent, node);
+    }
+    else if (node->tag == tag_em)
+    {
+        if (sgmltag_type == SgmlTagStart)
+        {
+            if (DescendantOf(node, tag_p))
+                PPrintString(fout, indent, "<emphasis>");
+            else
+                PPrintString(fout, indent, "<para><emphasis>");
+        }
+        else if (sgmltag_type == SgmlTagEnd)
+        {
+            if (DescendantOf(node, tag_p))
+                PPrintString(fout, indent, "</emphasis>");
+            else    /* close in reverse order of opening */
+                PPrintString(fout, indent, "</emphasis></para>");
+        }
+    }
+    else if (wstrcasecmp(node->element, "code") == 0 &&
+             !(node->parent->tag == tag_dd ||
+               node->parent->tag == tag_li))
+        PrintSgmlTagString(fout, mode, indent,
+                           sgmltag_type, "<literal>");
+    else if (strlen(node->element) == 2 &&
+             node->element[0] == 'h' &&
+             IsDigit(node->element[1]))
+    {
+        int sectnum = DIGIT(node->element[1]) - 1;
+
+        /* seen_h only has entries for h1..h6 */
+        if (sectnum < 0 || sectnum >= TOTAL_H)
+            return;
+
+        if (sgmltag_type == SgmlTagStart)
+        {
+            /* a heading ends the open sections at its level and below */
+            CloseSgmlSections(fout, indent, sectnum);
+            seen_h[sectnum] = yes;
+            PrintSectTag(fout, indent, lexer, node);
+        }
+        else
+        {
+            /* NOTE(review): literal assumed, closes the heading title - confirm */
+            PPrintString(fout, indent, "</title>");
+        }
+    }
+}
+
+/*
+  Writes node and, recursively, its content. Element tags go through
+  PrintSgmlTag; text, comments, doctype, CDATA, sections, PIs and the
+  XML declaration use the existing PPrint helpers. ASP, JSTE and PHP
+  nodes are only reported via PrintSgmlDefault.
+*/
+void PrintSgml( Out *fout, uint mode, uint indent,
+                   Lexer *lexer, Node *node)
+{
+    Node *content;
+
+    if (node == null)
+        return;
+
+    if (node->type == TextNode)
+    {
+        /* && wstrcasecmp(node->parent->element, "code") != 0
+           may be needed in this test later to convert properly */
+        if (DescendantOf(node, tag_dd) && !DescendantOf(node, tag_a) &&
+            !DescendantOf(node, tag_p))
+        {
+            /* NOTE(review): wrapper literals assumed to be <para> - confirm */
+            PPrintString(fout, indent, "<para>");
+            PPrintText(fout, mode, indent, lexer, node->start, node->end);
+            PPrintString(fout, indent, "</para>");
+        }
+        else
+            PPrintText(fout, mode, indent, lexer, node->start, node->end);
+    }
+    else if (node->type == CDATATag && EscapeCdata)
+        PPrintText(fout, mode, indent, lexer, node->start, node->end);
+    else if (node->type == CommentTag)
+        PPrintComment(fout, indent, lexer, node);
+    else if (node->type == RootNode)
+    {
+        for (content = node->content;
+                content != null;
+                content = content->next)
+           PrintSgml(fout, mode, indent, lexer, content);
+    }
+    else if (node->type == DocTypeTag)
+        PPrintDocType(fout, indent, lexer, node);
+    else if (node->type == CDATATag)
+        PPrintCDATA(fout, indent, lexer, node);
+    else if (node->type == SectionTag)
+        PPrintSection(fout, indent, lexer, node);
+    else if (node->type == AspTag ||
+             node->type == JsteTag ||
+             node->type == PhpTag )
+        PrintSgmlDefault(fout);
+    else if (node->type == ProcInsTag)
+        PPrintPI(fout, indent, lexer, node);
+    else if (node->type == XmlDecl)   /* && DbXml may be needed here */
+        PPrintXmlDecl(fout, indent, lexer, node);
+    else if (node->tag->model & CM_EMPTY ||
+            (node->type == StartEndTag && !xHTML))
+    {
+        if (!(node->tag->model & CM_INLINE))
+            PCondFlushLine(fout, indent);
+
+        if (MakeClean && node->tag == tag_wbr)
+            PPrintString(fout, indent, " ");
+        else
+            PrintSgmlTag(fout, mode, indent, lexer, node, SgmlTagStart);
+    }
+    else
+    {
+        if (node->type == StartEndTag)
+            node->type = StartTag;
+
+        if (node->tag && node->tag->parser == ParsePre)
+        {
+            PCondFlushLine(fout, indent);
+
+            indent = 0;
+            PCondFlushLine(fout, indent);
+
+            PrintSgmlTag(fout, mode, indent, lexer, node, SgmlTagStart);
+            PFlushLine(fout, indent);
+
+            for (content = node->content;
+                    content != null;
+                    content = content->next)
+                PrintSgml(fout, (mode | PREFORMATTED | NOWRAP),
+                          indent, lexer, content);
+
+            PCondFlushLine(fout, indent);
+            PrintSgmlTag(fout, mode, indent, lexer, node, SgmlTagEnd);
+            PFlushLine(fout, indent);
+
+            if (IndentContent == no && node->next != null)
+                PFlushLine(fout, indent);
+        }
+        else if (node->tag->model & CM_INLINE)
+        {
+            PrintSgmlTag(fout, mode, indent, lexer, node, SgmlTagStart);
+
+            if (ShouldIndent(node))
+            {
+                PCondFlushLine(fout, indent);
+                indent += spaces;
+
+                for (content = node->content;
+                        content != null;
+                        content = content->next)
+                    PrintSgml(fout, mode, indent, lexer, content);
+
+                PCondFlushLine(fout, indent);
+                indent -= spaces;
+                PCondFlushLine(fout, indent);
+            }
+            else
+            {
+                for (content = node->content;
+                        content != null;
+                        content = content->next)
+                    PrintSgml(fout, mode, indent, lexer, content);
+            }
+
+            PrintSgmlTag(fout, mode, indent, lexer, node, SgmlTagEnd);
+        }
+        else
+        {
+            PCondFlushLine(fout, indent);
+            if (SmartIndent && node->prev != null)
+                PFlushLine(fout, indent);
+
+            PrintSgmlTag(fout, mode, indent, lexer, node, SgmlTagStart);
+            if (ShouldIndent(node))
+                PCondFlushLine(fout, indent);
+            else if (node->tag->model & CM_HTML ||
+                     node->tag == tag_noframes ||
+                     (node->tag->model & CM_HEAD && !(node->tag == tag_title)))
+                PFlushLine(fout, indent);
+
+            if (ShouldIndent(node))
+            {
+                PCondFlushLine(fout, indent);
+                indent += spaces;
+
+                for (content = node->content;
+                        content != null;
+                        content = content->next)
+                    PrintSgml(fout, mode, indent, lexer, content);
+
+                PCondFlushLine(fout, indent);
+                indent -= spaces;
+                PCondFlushLine(fout, indent);
+            }
+            else
+            {
+                Node *last = null;
+
+                for (content = node->content;
+                        content != null;
+                        content = content->next)
+                {
+                    /* kludge for naked text before block level tag */
+                    if (last && !IndentContent && last->type == TextNode &&
+                        content->tag && !(content->tag->model & CM_INLINE))
+                        PFlushLine(fout, indent);
+
+                    PrintSgml(fout, mode,
+                              (ShouldIndent(node) ? indent+spaces : indent),
+                              lexer, content);
+                    last = content;
+                }
+            }
+
+            PrintSgmlTag(fout, mode, indent, lexer, node, SgmlTagEnd);
+            PFlushLine(fout, indent);
+            if (IndentContent == no &&
+                node->next != null &&
+                HideEndTags == no &&
+                (node->tag->model & (CM_BLOCK|CM_LIST|CM_DEFLIST|CM_TABLE)))
+                PFlushLine(fout, indent);
+        }
     }
 }
 
--- /cise/tmp/ppadala/tidy/src/tidy.c	Sat Jun  1 02:50:03 2002
+++ src/tidy.c	Mon Jul  1 19:09:05 2002
@@ -1853,6 +1853,10 @@
                 IndentContent = yes;
                 SmartIndent = yes;
             }
+            else if (wstrcasecmp(arg, "dbsgml") == 0)
+                DbSgml = yes;
+            else if(wstrcasecmp(arg, "dbxml") == 0)
+                DbXml = yes;
             else if (wstrcasecmp(arg, "omit") == 0)
                 HideEndTags = yes;
             else if (wstrcasecmp(arg, "upper") == 0)
@@ -2180,6 +2184,22 @@
         else
         {
             lexer->warnings = 0;
+
+            if (DbSgml) {
+                char *str = "article PUBLIC \"-//OASIS//DTD DocBook V3.1//EN\"";
+
+                EncloseBodyText = yes; /* We want those <p>s */
+                doctype_mode = doctype_user;
+                doctype_str = MemAlloc(wstrlen(str) + 1); /* + 1 for the NUL wstrcpy writes */
+                wstrcpy(doctype_str, str);
+            }
+            else if(DbXml) {
+                char *str = "article PUBLIC \"-//OASIS//DTD DocBk XML V3.1.4 //EN\"";
+                EncloseBodyText = yes; /* We want those <p>s */
+                doctype_mode = doctype_user;
+                doctype_str = MemAlloc(wstrlen(str) + 1); /* + 1 for the NUL wstrcpy writes */
+                wstrcpy(doctype_str, str);
+            }
 
             document = ParseDocument(lexer);
 
@@ -2226,6 +2246,10 @@
             {
                 if (xHTML)
                     SetXHTMLDocType(lexer, document);
+                else if(DbSgml)
+                    SetSgmlDocType(lexer, document);
+                else if(DbXml)
+                    SetSgmlDocType(lexer, document);
                 else
                     FixDocType(lexer, document);
 
@@ -2247,7 +2271,7 @@
             }
 
             /* ensure presence of initial <?XML version="1.0"?> */
-            if (XmlOut && XmlPi)
+            if ((XmlOut && XmlPi) || DbXml)
                 FixXmlDecl(lexer, document);
 
             /*
@@ -2381,9 +2405,12 @@
             /* Feature request #434940 - fix by Dave Raggett/Ignacio Vazquez-Abrams 21 Jun 01 */
             else if (BodyOnly)
                 PrintBody(&out, lexer, document);
-            else
-                PPrintTree(&out, null, 0, lexer, document);
-
+            else {
+                if(DbSgml || DbXml)
+                    PrintSgml(&out, null, 0, lexer, document);
+                else
+                    PPrintTree(&out, null, 0, lexer, document);
+            }
             PFlushLine(&out, 0);
         }
 