/* $Id$ Part of SWI-Prolog Author: Jan Wielemaker E-mail: wielemak@science.uva.nl WWW: http://www.swi-prolog.org Copyright (C): 1985-2006, University of Amsterdam This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */ #define DTD_IMPLEMENTATION 1 #include <stdio.h> #include <wchar.h> #include "dtd.h" #include "model.h" #include "util.h" #include "catalog.h" #include "parser.h" #include <stdlib.h> #include <assert.h> #include <stdarg.h> #include <ctype.h> #include <string.h> #include "utf8.h" #include <errno.h> #include <wctype.h> #include "xml_unicode.h" #define DEBUG(g) ((void)0) #define ZERO_TERM_LEN (-1) /* terminated by nul */ #ifdef __WINDOWS__ #define inline __inline #define swprintf _snwprintf #endif /******************************* * LOCAL TYPES * *******************************/ typedef struct locbuf { dtd_srcloc start; /* p->startloc */ dtd_srcloc here; /* p->location */ } locbuf; /******************************* * PROTOYPES * *******************************/ static const ichar * itake_name(dtd *dtd, const ichar *in, dtd_symbol **id); static const ichar * itake_entity_name(dtd *dtd, const ichar *in, dtd_symbol **id); static const ichar * itake_namegroup(dtd *dtd, const ichar *decl, dtd_symbol **names, int *n); static const ichar * iskip_layout(dtd *dtd, const ichar 
*in); static dtd_parser * clone_dtd_parser(dtd_parser *p); static void free_model(dtd_model *m); static int process_entity_declaration(dtd_parser *p, const ichar *decl); static void free_notations(dtd_notation *n); static void free_shortrefs(dtd_shortref *sr); static int process_cdata(dtd_parser *p, int last); static int process_entity(dtd_parser *p, const ichar *name); static int emit_cdata(dtd_parser *p, int last); static dtd_space_mode istr_to_space_mode(const ichar *val); static void update_space_mode(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts); static dtd_model * make_model(dtd *dtd, const ichar *decl, const ichar **end); static void for_elements_in_model(dtd_model *m, void (*f)(dtd_element *e, void *closure), void *closure); void putchar_dtd_parser(dtd_parser *p, int chr); void free_dtd_parser(dtd_parser *p); static const ichar * isee_character_entity(dtd *dtd, const ichar *in, int *chr); static int add_default_attributes(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts); static int prepare_cdata(dtd_parser *p); /******************************* * MACROS * *******************************/ #define WITH_CLASS(p, c, g) \ { sgml_event_class _oc = p->event_class; \ p->event_class = c; \ g; \ p->event_class = _oc; \ } #define WITH_PARSER(p, g) \ { dtd_parser *_old = p; \ current_parser = p; \ g; \ current_parser = _old; \ } /******************************* * STATISTICS * *******************************/ #ifdef O_STATISTICS int edefs_created = 0; int edefs_freed = 0; int edefs_implicit = 0; int edefs_atts = 0; int edefs_decl = 0; int dtd_created = 0; int dtd_freed = 0; void sgml_statistics(void) { fprintf(stderr, "EDEFS: created %d; freed %d\n", edefs_created, edefs_freed); fprintf(stderr, "EDEFS: implicit %d; atts %d; decl %d\n", edefs_implicit, edefs_atts, edefs_decl); fprintf(stderr, "DTDs: created: %d; freed: %d\n", dtd_created, dtd_freed); } #define STAT(g) g #else #define STAT(g) ((void)0) #endif 
/******************************* * SRC LOCATION * *******************************/ static void /* TBD: also handle startloc */ push_location(dtd_parser *p, locbuf *save) { save->here = p->location; save->start = p->startloc; p->location.parent = &save->here; p->startloc.parent = &save->start; } static void pop_location(dtd_parser *p, locbuf *saved) { p->location = saved->here; p->startloc = saved->start; } static inline void _sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc) { d->type = loc->type; d->name.file = loc->name.file; d->line = loc->line; d->linepos = loc->linepos; d->charpos = loc->charpos; /* but not the parent! */ } void sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc) { _sgml_cplocation(d, loc); } #define sgml_cplocation(d,s) _sgml_cplocation(d, s) static void inc_location(dtd_srcloc *l, int chr) { if ( chr == '\n' ) { l->linepos = 0; l->line++; } l->linepos++; l->charpos++; } static void dec_location(dtd_srcloc *l, int chr) { if ( chr == '\n' ) { l->linepos = 2; /* not good! 
*/ l->line--; } l->linepos--; l->charpos--; } /******************************* * CLASSIFICATION PRIMITIVES * *******************************/ static inline int HasClass(dtd *dtd, wint_t chr, int mask) { if ( chr <= 0xff ) return (dtd->charclass->class[(chr)] & (mask)); else { switch(mask) { case CH_NAME: return ( xml_basechar(chr) || xml_digit(chr) || xml_ideographic(chr) || xml_combining_char(chr) || xml_extender(chr) ); case CH_NMSTART: return ( xml_basechar(chr) || xml_ideographic(chr) ); case CH_WHITE: return FALSE; /* only ' ' and '\t' */ case CH_BLANK: return iswspace(chr); case CH_DIGIT: return xml_digit(chr); case CH_RS: case CH_RE: return FALSE; default: assert(0); return FALSE; } } } static const ichar * isee_func(dtd *dtd, const ichar *in, charfunc func) { if ( dtd->charfunc->func[func] == *in ) return ++in; return NULL; } /******************************* * SYMBOLS * *******************************/ static dtd_symbol_table * new_symbol_table(void) { dtd_symbol_table *t = sgml_calloc(1, sizeof(*t)); t->size = SYMBOLHASHSIZE; t->entries = sgml_calloc(t->size, sizeof(dtd_symbol*)); return t; } static void free_symbol_table(dtd_symbol_table *t) { int i; for(i=0; i<t->size; i++) { dtd_symbol *s, *next; for(s=t->entries[i]; s; s=next) { next = s->next; sgml_free((ichar*)s->name); sgml_free(s); } } sgml_free(t->entries); sgml_free(t); } dtd_symbol * dtd_find_symbol(dtd *dtd, const ichar *name) { dtd_symbol_table *t = dtd->symbols; if ( dtd->case_sensitive ) { int k = istrhash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istreq(s->name, name) ) return s; } } else { int k = istrcasehash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istrcaseeq(s->name, name) ) return s; } } return NULL; } static dtd_symbol * dtd_find_entity_symbol(dtd *dtd, const ichar *name) { dtd_symbol_table *t = dtd->symbols; if ( dtd->ent_case_sensitive ) { int k = istrhash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = 
s->next) { if ( istreq(s->name, name) ) return s; } } else { int k = istrcasehash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istrcaseeq(s->name, name) ) return s; } } return NULL; } dtd_symbol * dtd_add_symbol(dtd *dtd, const ichar *name) { dtd_symbol_table *t = dtd->symbols; int k = istrhash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istreq(s->name, name) ) return s; } s = sgml_calloc(1, sizeof(*s)); s->name = istrdup(name); s->next = t->entries[k]; t->entries[k] = s; return s; } /******************************* * ENTITIES * *******************************/ static void free_entity_list(dtd_entity *e) { dtd_entity *next; for( ; e; e=next) { next = e->next; if ( e->value ) sgml_free(e->value); if ( e->extid ) sgml_free(e->extid); if ( e->exturl ) sgml_free(e->exturl); if ( e->baseurl ) sgml_free(e->baseurl); sgml_free(e); } } static dtd_entity * find_pentity(dtd *dtd, dtd_symbol *id) { dtd_entity *e; for(e = dtd->pentities; e; e=e->next) { if ( e->name == id ) return e; } return NULL; } /* returned path must be freed when done */ static ichar * entity_file(dtd *dtd, dtd_entity *e) { switch(e->type) { case ET_SYSTEM: case ET_PUBLIC: { const ichar *f; f = find_in_catalogue(e->catalog_location, e->name->name, e->extid, e->exturl, dtd->dialect != DL_SGML); if ( f ) /* owned by catalog */ { ichar *file; if ( is_absolute_path(f) || !e->baseurl ) file = istrdup(f); else file = localpath(e->baseurl, f); return file; } } default: return NULL; } } static const ichar * entity_value(dtd_parser *p, dtd_entity *e, int *len) { ichar *file; if ( !e->value && (file=entity_file(p->dtd, e)) ) { int normalise = (e->content == EC_SGML || e->content == EC_CDATA); size_t l; e->value = load_sgml_file_to_charp(file, normalise, &l); e->length = (long)l; sgml_free(file); } if ( len ) *len = e->length; return e->value; } static int expand_pentities(dtd_parser *p, const ichar *in, int ilen, ichar *out, int len) { dtd *dtd = p->dtd; 
int pero = dtd->charfunc->func[CF_PERO]; /* % */ int ero = dtd->charfunc->func[CF_ERO]; /* & */ const ichar *s; const ichar *end; if ( ilen == ZERO_TERM_LEN ) { end = in + wcslen(in); } else { end = &in[ilen]; } while(in < end) { if ( *in == pero ) { dtd_symbol *id; if ( (s = itake_entity_name(dtd, in+1, &id)) ) { dtd_entity *e = find_pentity(dtd, id); const ichar *eval; int l; in = s; if ( (s=isee_func(dtd, s, CF_ERC)) ) /* ; is not obligatory? */ in = s; if ( !e ) return gripe(ERC_EXISTENCE, L"parameter entity", id->name); if ( !(eval = entity_value(p, e, NULL)) ) return FALSE; if ( !expand_pentities(p, eval, ZERO_TERM_LEN, out, len) ) return FALSE; l = (int)istrlen(out); /* could be better */ out += l; len -= l; continue; } } if ( --len <= 0 ) { gripe(ERC_REPRESENTATION, L"Declaration too long"); return FALSE; } if ( *in == ero && in[1] == '#' ) /* &# */ { int chr; if ( (s=isee_character_entity(dtd, in, &chr)) ) { if ( chr == 0 ) { gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", in); } else { *out++ = chr; in = s; continue; } } } *out++ = *in++; } *out = '\0'; return TRUE; } static int char_entity_value(const ichar *decl) { if ( *decl == '#' ) { const ichar *s = decl+1; ichar *end; long v; /* do octal too? 
*/ if ( s[0] == 'x' || s[0] == 'X' ) v = wcstoul(s+1, &end, 16); else v = wcstoul(s, &end, 10); if ( *end == '\0' ) { return (int)v; } else if ( istreq(s, L"RS") ) { return '\n'; } else if ( istreq(s, L"RE") ) { return '\r'; } else if ( istreq(s, L"TAB") ) { return '\t'; } else if ( istreq(s, L"SPACE") ) { return ' '; } } return -1; } static const ichar * isee_character_entity(dtd *dtd, const ichar *in, int *chr) { const ichar *s; if ( (s=isee_func(dtd, in, CF_ERO)) && *s == '#' ) { ichar e[32]; ichar *o = e; int v; *o++ = *s++; while(o < e+sizeof(e)/sizeof(ichar)-1 && HasClass(dtd, *s, CH_NAME)) *o++ = *s++; if ( isee_func(dtd, s, CF_ERC)) /* skip ; */ s++; *o = '\0'; if ( (v=char_entity_value(e)) >= 0 ) { *chr = v; return s; } } return NULL; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Expand entities in a string. Used to expand CDATA attribute values. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int expand_entities(dtd_parser *p, const ichar *in, int len, ocharbuf *out) { const ichar *s; const ichar *end = &in[len]; dtd *dtd = p->dtd; int ero = dtd->charfunc->func[CF_ERO]; /* & */ while(in < end) { if ( *in == ero ) { const ichar *estart = in; /* for recovery */ int chr; if ( (s=isee_character_entity(dtd, in, &chr)) ) { if ( chr == 0 ) gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", in); add_ocharbuf(out, chr); in = s; continue; } if ( HasClass(dtd, in[1], CH_NMSTART) ) { dtd_symbol *id; dtd_entity *e; const ichar *eval; if ( !(in = itake_name(dtd, in+1, &id)) ) { in = estart; goto recover; } if ( isee_func(dtd, in, CF_ERC) || *in == '\n' ) in++; if ( !(e = id->entity) && !(e=dtd->default_entity) ) { gripe(ERC_EXISTENCE, L"entity", id->name); in = estart; goto recover; } if ( !(eval = entity_value(p, e, NULL)) ) { gripe(ERC_NO_VALUE, e->name->name); in = estart; goto recover; } if ( e->content == EC_SGML ) { if ( !expand_entities(p, eval, (int)istrlen(eval), out) ) return FALSE; } else 
{ const ichar *s; for(s=eval; *s; s++) add_ocharbuf(out, *s); } continue; } if ( dtd->dialect != DL_SGML ) gripe(ERC_SYNTAX_ERROR, L"Illegal entity", estart); } recover: if ( *in == CR && in[1] == LF ) in++; if ( HasClass(dtd, *in, CH_BLANK) ) { add_ocharbuf(out, ' '); in++; } else { add_ocharbuf(out, *in++); } } terminate_ocharbuf(out); return TRUE; } /******************************* * ELEMENTS * *******************************/ static dtd_element * find_element(dtd *dtd, dtd_symbol *id) { dtd_element *e; if ( id->element ) return id->element; /* must check */ e = sgml_calloc(1, sizeof(*e)); e->space_mode = SP_INHERIT; e->undefined = TRUE; e->name = id; id->element = e; e->next = dtd->elements; dtd->elements = e; return e; } static dtd_edef * new_element_definition(dtd *dtd) { dtd_edef *def = sgml_calloc(1, sizeof(*def)); STAT(edefs_created++); return def; } static dtd_element * def_element(dtd *dtd, dtd_symbol *id) { dtd_element *e = find_element(dtd, id); if ( !e->structure ) { e->structure = new_element_definition(dtd); e->structure->references = 1; e->structure->type = C_EMPTY; } return e; } static void free_name_list(dtd_name_list *nl) { dtd_name_list *next; for( ; nl; nl=next) { next = nl->next; sgml_free(nl); } } #define REFS_VIRGIN (-42) static void free_attribute(dtd_attr *a) { if ( a->references == REFS_VIRGIN || --a->references == 0 ) { switch(a->type) { case AT_NAMEOF: case AT_NOTATION: free_name_list(a->typeex.nameof); default: ; } switch(a->def) { case AT_DEFAULT: case AT_FIXED: { if ( a->islist ) sgml_free(a->att_def.list); else if ( a->type == AT_CDATA && a->att_def.cdata ) sgml_free(a->att_def.cdata); } default: ; } sgml_free(a); } } static void free_attribute_list(dtd_attr_list *l) { dtd_attr_list *next; for(; l; l=next) { next = l->next; free_attribute(l->attribute); sgml_free(l); } } static void free_element_list(dtd_element_list *l) { dtd_element_list *next; for( ; l; l=next) { next = l->next; sgml_free(l); } } static void 
free_element_definition(dtd_edef *def) { if ( --def->references == 0 ) { STAT(edefs_freed++); if ( def->content ) free_model(def->content); free_element_list(def->included); free_element_list(def->excluded); free_state_engine(def->initial_state); sgml_free(def); } } static void free_elements(dtd_element *e) { dtd_element *next; for( ; e; e=next) { next = e->next; if ( e->structure ) free_element_definition(e->structure); free_attribute_list(e->attributes); sgml_free(e); } } /******************************* * ATTRIBUTES * *******************************/ static dtd_attr * find_attribute(dtd_element *e, dtd_symbol *name) { dtd_attr_list *a; for(a=e->attributes; a; a=a->next) { if ( a->attribute->name == name ) return a->attribute; } return NULL; } /******************************* * PARSE PRIMITIVES * *******************************/ static const ichar * iskip_layout(dtd *dtd, const ichar *in) { ichar cmt = dtd->charfunc->func[CF_CMT]; /* also skips comment */ for( ; *in; in++ ) { if ( HasClass(dtd, *in, CH_BLANK) ) continue; if ( in[0] == cmt && in[1] == cmt ) { in += 2; for( ; *in; in++ ) { if ( in[0] == cmt && in[1] == cmt ) break; } in++; continue; } return in; } return in; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - See whether we are looking at identifier "id". "id" must be lowercase! This is only used for reserved words, and parsed case-insentive in both XML and SGML modes. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * isee_identifier(dtd *dtd, const ichar *in, char *id) { in = iskip_layout(dtd, in); /* match */ while (*id && (wint_t)*id == towlower(*in) ) id++, in++; if ( *id == 0 && !HasClass(dtd, *in, CH_NAME) ) return iskip_layout(dtd, in); return NULL; } static const ichar * itake_name(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NMSTART) ) return NULL; if ( dtd->case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"NAME too long"); return NULL; } *o++ = '\0'; *id = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, in); } static const ichar * itake_entity_name(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NMSTART) ) return NULL; if ( dtd->ent_case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"Entity NAME too long"); return NULL; } *o++ = '\0'; *id = dtd_add_symbol(dtd, buf); return in; } static const ichar * itake_nmtoken(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NAME) ) return NULL; if ( dtd->case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"NMTOKEN too long"); return NULL; } *o = '\0'; *id = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, in); } static const 
ichar * itake_nutoken(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_DIGIT) ) return NULL; if ( dtd->case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"NUTOKEN too long"); return NULL; } *o = '\0'; if ( o - buf > 8 ) gripe(ERC_LIMIT, L"nutoken length"); *id = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, in); } static const ichar * itake_number(dtd *dtd, const ichar *in, dtd_attr *at) { in = iskip_layout(dtd, in); switch(dtd->number_mode) { case NU_TOKEN: { ichar buf[MAXNMLEN]; ichar *o = buf; while( HasClass(dtd, *in, CH_DIGIT) ) *o++ = *in++; if ( o == buf ) return NULL; /* empty */ *o = '\0'; at->att_def.name = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, (const ichar *)in); } case NU_INTEGER: { ichar *end; at->att_def.number = wcstol(in, &end, 10); if ( end > in && errno != ERANGE ) return iskip_layout(dtd, end); } } return NULL; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Get a quoted value. After successful return, *start points to the start of the string in the input and *len to the length. The data is *not* nul terminated. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * itake_string(dtd *dtd, const ichar *in, ichar **start, int *len) { in = iskip_layout(dtd, in); if ( isee_func(dtd, in, CF_LIT) || isee_func(dtd, in, CF_LITA) ) { ichar q = *in++; *start = (ichar *)in; while( *in && *in != q ) in++; if ( *in ) { *len = (int)(in - (*start)); return iskip_layout(dtd, ++in); } } return NULL; } static const ichar * itake_dubbed_string(dtd *dtd, const ichar *in, ichar **out) { ichar *start; int len; const ichar *end; if ( (end=itake_string(dtd, in, &start, &len)) ) *out = istrndup(start, len); return end; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - itake_url() is used to get the argument of a SYSTEM or 2nd argument of a PUBLIC reference. Once upon a time it tried to tag the argument as file:<path>, but this job cannot be before lookup in the catalogue. It is now the same as itake_dubbed_string(), so we simply call this one. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * itake_url(dtd *dtd, const ichar *in, ichar **out) { return itake_dubbed_string(dtd, in, out); } static const ichar * itake_nmtoken_chars(dtd *dtd, const ichar *in, ichar *out, int len) { in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NAME) ) return NULL; while( HasClass(dtd, *in, CH_NAME) ) { if ( --len <= 0 ) gripe(ERC_REPRESENTATION, L"Name token too long"); *out++ = (dtd->case_sensitive ? *in++ : (ichar)towlower(*in++)); } *out++ = '\0'; return iskip_layout(dtd, in); } /* There used to be a function itake_nonblank_chars(dtd, in, out, len) -> new end which - skipped layout, - copied characters from in[] to out[] until layout or \0 was found, - added a terminating \0 to out[], - skipped any following layout, and - returned the new position. That function was only called by get_attribute_value(), which used it to parse an unquoted attribute value. 
According to SGML, that's not right: unquoted attribute values must look like NMTOKENs (but have a different length bound). In particular, elements like <foo a=bar>zoo</foo> <foo a=ugh/zip/ are perfectly legal, so scanning an unquoted attribute value MUST stop at a '/' or '>'. According to HTML practice, pretty much any old junk will be accepted, and some HTML parsers will allow bare slashes in such an attribute. Typical HTML is *so* bad that it doesn't agree with *any* part of the HTML specifications (e.g., <FONT> is commonly wrapped around block-level elements, which has never been legal). It's not clear that there is much point in trying to accomodate bad HTML; if you really need to do that, use the free program HTML Tidy (from the http://www.w3c.org/ site) to clean up, and parse its output instead. However, in order to break as little as possible, the new (sgml-1.0.14) function accepts anything except > / \0 and blanks. JW: I decided to accept / as part of an unquoted in SGML-mode if shorttag is disabled as well as in XML mode if it is not the end of the begin-element */ static ichar const * itake_unquoted(dtd *dtd, ichar const *in, ichar *out, int len) { ichar const end2 = dtd->charfunc->func[CF_ETAGO2]; /* / */ ichar c; /* skip leading layout. Do NOT skip comments! --x-- is a value! */ while (c = *in, HasClass(dtd, c, CH_BLANK)) in++; /* copy the attribute to out[] */ while ( !HasClass(dtd, c, CH_BLANK) && c != '\0' ) { if ( c == end2 && (dtd->shorttag || (in[1] == '\0' && dtd->dialect != DL_SGML)) ) break; if ( --len > 0 ) *out++ = c; else if ( len == 0 ) gripe(ERC_REPRESENTATION, L"Attribute too long"); c = *++in; } *out = '\0'; /* skip trailing layout. While it is kind to skip comments here, it is technically wrong to do so. Tags may not contain comments. 
*/ return iskip_layout(dtd, in); } /******************************* * DTD * *******************************/ dtd * new_dtd(const ichar *doctype) { dtd *dtd = sgml_calloc(1, sizeof(*dtd)); STAT(dtd_created++); dtd->magic = SGML_DTD_MAGIC; dtd->implicit = TRUE; dtd->dialect = DL_SGML; if ( doctype ) dtd->doctype = istrdup(doctype); dtd->symbols = new_symbol_table(); dtd->charclass = new_charclass(); dtd->charfunc = new_charfunc(); dtd->space_mode = SP_SGML; dtd->ent_case_sensitive = TRUE; /* case-sensitive entities */ dtd->shorttag = TRUE; /* allow for <tag/value/ */ dtd->number_mode = NU_TOKEN; return dtd; } void free_dtd(dtd *dtd) { if ( --dtd->references == 0 ) { STAT(dtd_freed++); if ( dtd->doctype ) sgml_free(dtd->doctype); free_entity_list(dtd->entities); free_entity_list(dtd->pentities); free_notations(dtd->notations); free_shortrefs(dtd->shortrefs); free_elements(dtd->elements); free_symbol_table(dtd->symbols); sgml_free(dtd->charfunc); sgml_free(dtd->charclass); dtd->magic = 0; sgml_free(dtd); } } static const wchar_t *xml_entities[] = { L"lt CDATA \"<\"", /* < */ L"gt CDATA \">\"", /* > */ L"amp CDATA \"&\"", /* & */ L"apos CDATA \"'\"", /* ' */ L"quot CDATA \""\"", /* " */ NULL }; int set_dialect_dtd(dtd *dtd, dtd_dialect dialect) { if ( dtd->dialect != dialect ) { dtd->dialect = dialect; switch(dialect) { case DL_SGML: { dtd->case_sensitive = FALSE; dtd->space_mode = SP_SGML; dtd->shorttag = TRUE; break; } case DL_XML: case DL_XMLNS: { const ichar **el; dtd_parser p; dtd->case_sensitive = TRUE; dtd->encoding = SGML_ENC_UTF8; dtd->space_mode = SP_PRESERVE; dtd->shorttag = FALSE; memset(&p, 0, sizeof(p)); p.dtd = dtd; for(el = xml_entities; *el; el++) process_entity_declaration(&p, *el); break; } } } return TRUE; } int set_option_dtd(dtd *dtd, dtd_option option, int set) { switch(option) { case OPT_SHORTTAG: dtd->shorttag = set; break; } return TRUE; } static const ichar * baseurl(dtd_parser *p) { if ( p->location.type == IN_FILE && p->location.name.file ) 
{ return p->location.name.file; } return NULL; } static const ichar * process_entity_value_declaration(dtd_parser *p, const ichar *decl, dtd_entity *e) { dtd *dtd = p->dtd; const ichar *s; if ( e->type == ET_SYSTEM ) { if ( (s=itake_url(dtd, decl, &e->exturl)) ) { e->baseurl = istrdup(baseurl(p)); return s; } goto string_expected; } else { ichar *start; int len; ichar val[MAXSTRINGLEN]; if ( !(s = itake_string(dtd, decl, &start, &len)) ) goto string_expected; decl = s; expand_pentities(p, start, len, val, sizeof(val)/sizeof(ichar)); switch ( e->type ) { case ET_PUBLIC: { e->extid = istrdup(val); if ( isee_func(dtd, decl, CF_LIT) || isee_func(dtd, decl, CF_LITA) ) { if ( (s=itake_url(dtd, decl, &e->exturl)) ) { e->baseurl = istrdup(baseurl(p)); decl = s; } } return decl; } case ET_LITERAL: { e->value = istrdup(val); e->length = (int)wcslen(e->value); return decl; } default: assert(0); return NULL; } } string_expected: gripe(ERC_SYNTAX_ERROR, L"String expected", decl); return NULL; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - The sgml-standard tells us to accept the first definition of an entity, silently suppressing any further attempt to redefine the entity. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int process_entity_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; const ichar *s; dtd_symbol *id; dtd_entity *e; int isparam; int isdef = FALSE; /* parameter entity */ if ( (s=isee_func(dtd, decl, CF_PERO)) ) { isparam = TRUE; decl = s; } else isparam = FALSE; if ( !(s = itake_entity_name(dtd, decl, &id)) ) { if ( !(s = isee_identifier(dtd, decl, "#default")) ) return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); id = dtd_add_symbol(dtd, (ichar*)"#DEFAULT"); isdef = TRUE; } if ( isparam && find_pentity(dtd, id) ) { gripe(ERC_REDEFINED, L"parameter entity", id); return TRUE; /* already defined parameter entity */ } if ( id->entity ) { gripe(ERC_REDEFINED, L"entity", id); return TRUE; /* already defined normal entity */ } decl = iskip_layout(dtd, s); e = sgml_calloc(1, sizeof(*e)); e->name = id; e->catalog_location = (isparam ? CAT_PENTITY : CAT_ENTITY); if ( (s = isee_identifier(dtd, decl, "system")) ) { e->type = ET_SYSTEM; e->content = EC_SGML; decl = s; } else if ( (s = isee_identifier(dtd, decl, "public")) ) { e->type = ET_PUBLIC; e->content = EC_SGML; decl = s; } else { e->type = ET_LITERAL; if ( !isparam ) { if ( (s=isee_identifier(dtd, decl, "cdata")) ) { decl = s; e->content = EC_CDATA; } else if ( (s=isee_identifier(dtd, decl, "sdata")) ) { decl = s; e->content = EC_SDATA; } else if ( (s=isee_identifier(dtd, decl, "pi")) ) { decl = s; e->content = EC_PI; } else if ( (s=isee_identifier(dtd, decl, "starttag")) ) { decl = s; e->content = EC_STARTTAG; } else if ( (s=isee_identifier(dtd, decl, "endtag")) ) { decl = s; e->content = EC_ENDTAG; } else e->content = EC_SGML; } } if ( (decl=process_entity_value_declaration(p, decl, e)) ) { if ( e->type == ET_LITERAL ) { switch(e->content) { case EC_STARTTAG: { ichar *buf = sgml_malloc((e->length + 3)*sizeof(ichar)); buf[0] = dtd->charfunc->func[CF_STAGO]; istrcpy(&buf[1], e->value); buf[++e->length] = 
dtd->charfunc->func[CF_STAGC]; buf[++e->length] = 0; sgml_free(e->value); e->value = buf; e->content = EC_SGML; break; } case EC_ENDTAG: { ichar *buf = sgml_malloc((e->length + 4)*sizeof(ichar)); buf[0] = dtd->charfunc->func[CF_ETAGO1]; buf[1] = dtd->charfunc->func[CF_ETAGO2]; istrcpy(&buf[2], e->value); e->length++; buf[++e->length] = dtd->charfunc->func[CF_STAGC]; buf[++e->length] = 0; sgml_free(e->value); e->value = buf; e->content = EC_SGML; break; } default: break; } } else { if ( *decl ) { dtd_symbol *nname; if ( (s=isee_identifier(dtd, decl, "cdata")) ) { decl = s; e->content = EC_CDATA; } else if ( (s=isee_identifier(dtd, decl, "sdata")) ) { decl = s; e->content = EC_SDATA; } else if ( (s=isee_identifier(dtd, decl, "ndata")) ) { decl = s; e->content = EC_NDATA; } else return gripe(ERC_SYNTAX_ERROR, L"Bad datatype declaration", decl); if ( (s=itake_name(dtd, decl, &nname)) ) /* what is this? */ { decl = s; } else return gripe(ERC_SYNTAX_ERROR, L"Bad notation declaration", decl); } } if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl); } if ( isparam ) { e->next = dtd->pentities; dtd->pentities = e; } else { e->name->entity = e; e->next = dtd->entities; dtd->entities = e; } if ( isdef ) dtd->default_entity = e; return TRUE; } /******************************* * NOTATIONS * *******************************/ static dtd_notation * find_notation(dtd *dtd, dtd_symbol *name) { dtd_notation *n; for(n=dtd->notations; n; n = n->next) { if ( n->name == name ) return n; } return NULL; } static void add_notation(dtd *dtd, dtd_notation *not) { dtd_notation **n = &dtd->notations; for( ; *n; n = &(*n)->next) ; *n = not; } static int process_notation_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *nname; const ichar *s; ichar *system = NULL, *public = NULL; dtd_notation *not; if ( !(s=itake_name(dtd, decl, &nname)) ) return gripe(ERC_SYNTAX_ERROR, L"Notation name expected", decl); decl = s; if ( find_notation(dtd, 
nname) ) { gripe(ERC_REDEFINED, L"notation", nname); return TRUE; } if ( (s=isee_identifier(dtd, decl, "system")) ) { ; } else if ( (s=isee_identifier(dtd, decl, "public")) ) { decl = s; if ( !(s=itake_dubbed_string(dtd, decl, &public)) ) return gripe(ERC_SYNTAX_ERROR, L"Public identifier expected", decl); } else return gripe(ERC_SYNTAX_ERROR, L"SYSTEM or PUBLIC expected", decl); decl = s; if ( (s=itake_dubbed_string(dtd, decl, &system)) ) decl = s; if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl); not = sgml_calloc(1, sizeof(*not)); not->name = nname; not->system = system; not->public = public; not->next = NULL; add_notation(dtd, not); return TRUE; } static void free_notations(dtd_notation *n) { dtd_notation *next; for( ; n; n=next) { next = n->next; sgml_free(n->system); sgml_free(n->public); sgml_free(n); } } /******************************* * SHORTREF * *******************************/ static void free_maps(dtd_map *map) { dtd_map *next; for( ; map; map=next) { next = map->next; if ( map->from ) sgml_free(map->from); sgml_free(map); } } static void free_shortrefs(dtd_shortref *sr) { dtd_shortref *next; for( ; sr; sr=next) { next = sr->next; free_maps(sr->map); sgml_free(sr); } } static const ichar * shortref_add_map(dtd *dtd, const ichar *decl, dtd_shortref *sr) { ichar *start; int len; ichar from[MAXMAPLEN]; ichar *f = from; dtd_symbol *to; const ichar *s; const ichar *end; dtd_map **p; dtd_map *m; if ( !(s=itake_string(dtd, decl, &start, &len)) ) { gripe(ERC_SYNTAX_ERROR, L"map-string expected", decl); return NULL; } decl = s; if ( !(s=itake_entity_name(dtd, decl, &to)) ) { gripe(ERC_SYNTAX_ERROR, L"map-to name expected", decl); return NULL; } end = s; for(decl=start; len > 0;) { if ( *decl == 'B' ) /* blank */ { if ( decl[1] == 'B' ) { *f++ = CHR_DBLANK; decl += 2; len -= 2; continue; } *f++ = CHR_BLANK; decl++; len--; } else { *f++ = *decl++; /* any other character */ len--; } } *f = 0; for(p=&sr->map; *p; p = 
&(*p)->next) ; m = sgml_calloc(1, sizeof(*m)); m->from = istrdup(from); m->len = (int)istrlen(from); m->to = to; *p = m; return end; } static dtd_shortref * def_shortref(dtd_parser *p, dtd_symbol *name) { dtd *dtd = p->dtd; dtd_shortref *sr, **pr; for(pr=&dtd->shortrefs; *pr; pr = &(*pr)->next) { dtd_shortref *r = *pr; if ( r->name == name ) return r; } sr = sgml_calloc(1, sizeof(*sr)); sr->name = name; *pr = sr; return sr; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Create an array with TRUE in any character that can be the last of the shortref map. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static void compile_map(dtd *dtd, dtd_shortref *sr) { dtd_map *map; for(map = sr->map; map; map = map->next) { ichar last = map->from[map->len-1]; switch( last ) { case CHR_BLANK: case CHR_DBLANK: { wint_t i; for( i=0; i< SHORTMAP_SIZE; i++) { if ( HasClass(dtd, i, CH_BLANK) ) sr->ends[i] = TRUE; } } default: sr->ends[last] = TRUE; } } } static int process_shortref_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; ichar buf[MAXDECL]; dtd_shortref *sr; dtd_symbol *name; const ichar *s; if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = buf; if ( !(s=itake_name(dtd, decl, &name)) ) return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); decl = s; sr = def_shortref(p, name); if ( sr->defined ) { gripe(ERC_REDEFINED, L"shortref", name); return TRUE; } sr->defined = TRUE; while( *(decl = iskip_layout(dtd, decl)) != '\0' && (s=shortref_add_map(dtd, decl, sr)) ) decl = s; compile_map(dtd, sr); if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Map expected", decl); return TRUE; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Find named name. 
The name NULL stands for the #empty map - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static dtd_shortref * find_map(dtd *dtd, dtd_symbol *name) { dtd_shortref *sr; if ( !name ) { static dtd_shortref *empty; if ( !empty ) { empty = sgml_calloc(1, sizeof(*empty)); empty->name = dtd_add_symbol(dtd, (ichar*)"#EMPTY"); empty->defined = TRUE; } return empty; } for( sr = dtd->shortrefs; sr; sr = sr->next ) { if ( sr->name == name ) { if ( !sr->defined ) break; return sr; } } return NULL; } static void set_map_element(dtd_element *e, void *closure) { e->map = closure; } static int process_usemap_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; ichar buf[MAXDECL]; dtd_symbol *name; const ichar *s; dtd_symbol *ename; dtd_element *e; dtd_shortref *map; if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = buf; if ( !(s=itake_name(dtd, decl, &name)) ) { if ( (s=isee_identifier(dtd, decl, "#empty")) ) name = NULL; else return gripe(ERC_SYNTAX_ERROR, L"map-name expected", decl); } decl = s; if ( !(map = find_map(dtd, name)) ) map = def_shortref(p, name); /* make undefined map */ if ( isee_func(dtd, decl, CF_GRPO) ) /* ( */ { dtd_model *model; if ( (model = make_model(dtd, decl, &s)) ) { for_elements_in_model(model, set_map_element, map); free_model(model); decl = s; } else return FALSE; } else if ( (s=itake_name(dtd, decl, &ename)) ) { e = find_element(dtd, ename); e->map = map; decl = s; } else if ( p->environments ) { if ( !map->defined ) gripe(ERC_EXISTENCE, L"map", name->name); p->environments->map = map; p->map = p->environments->map; } else return gripe(ERC_SYNTAX_ERROR, L"element-name expected", decl); if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Unparsed", decl); return TRUE; } static int match_map(dtd *dtd, dtd_map *map, ocharbuf *buf) { wchar_t *data = buf->data.w; wchar_t *e = data+buf->size-1; ichar *m = map->from+map->len-1; while( m >= map->from ) { if ( e < data ) 
return 0; if ( *m == *e ) { m--; e--; continue; } if ( *m == CHR_DBLANK ) { if ( e>data && HasClass(dtd, *e, CH_WHITE) ) e--; else return FALSE; goto wblank; } if ( *m == CHR_BLANK ) { wblank: while( e>data && HasClass(dtd, *e, CH_WHITE) ) e--; m--; continue; } return 0; } return (int)(data+buf->size-1-e); } static int match_shortref(dtd_parser *p) { dtd_map *map; for(map = p->map->map; map; map = map->next) { int len; if ( (len=match_map(p->dtd, map, p->cdata)) ) { p->cdata->size -= len; if ( p->cdata_must_be_empty ) { int blank = TRUE; const wchar_t *s; int i; for(s = p->cdata->data.w, i=0; i++ < p->cdata->size; s++) { if ( !iswspace(*s) ) { blank = FALSE; break; } } p->blank_cdata = blank; } WITH_CLASS(p, EV_SHORTREF, { sgml_cplocation(&p->startloc, &p->location); p->startloc.charpos -= len; p->startloc.linepos -= len; if ( p->startloc.linepos < 0 ) { p->startloc.line--; p->startloc.linepos = 0; /* not correct! */ } DEBUG(printf("%d-%d: Matched map '%s' --> %s, len = %d\n", p->startloc.charpos, p->location.charpos, map->from, map->to->name, len)); process_entity(p, map->to->name); }) /* TBD: optimise */ return TRUE; } } return FALSE; } /******************************* * ELEMENTS * *******************************/ static void add_submodel(dtd_model *m, dtd_model *sub) { dtd_model **d; for( d = &m->content.group; *d; d = &(*d)->next ) ; *d = sub; } /* for_elements_in_model() Walk along the model, calling f(e, closure) for any element found in the model. 
Used for <!SHORTREF name model> */ static void for_elements_in_model(dtd_model *m, void (*f)(dtd_element *e, void *closure), void *closure) { switch(m->type) { case MT_SEQ: case MT_AND: case MT_OR: { dtd_model *sub = m->content.group; for(; sub; sub = sub->next) for_elements_in_model(sub, f, closure); break; } case MT_ELEMENT: (*f)(m->content.element, closure); break; default: ; } } static void free_model(dtd_model *m) { switch(m->type) { case MT_SEQ: case MT_AND: case MT_OR: { dtd_model *sub = m->content.group; dtd_model *next; for(; sub; sub = next) { next = sub->next; free_model(sub); } } default: ; } sgml_free(m); } static dtd_model * make_model(dtd *dtd, const ichar *decl, const ichar **end) { const ichar *s; dtd_model *m = sgml_calloc(1, sizeof(*m)); dtd_symbol *id; decl = iskip_layout(dtd, decl); if ( (s=isee_identifier(dtd, decl, "#pcdata")) ) { m->type = MT_PCDATA; m->cardinality = MC_ONE; /* actually don't care */ *end = s; return m; } if ( (s=itake_name(dtd, decl, &id)) ) { m->type = MT_ELEMENT; m->content.element = find_element(dtd, id); decl = s; } else { if ( !(s=isee_func(dtd, decl, CF_GRPO)) ) { gripe(ERC_SYNTAX_ERROR, L"Name group expected", decl); free_model(m); return NULL; } decl = s; for(;;) { dtd_model *sub; modeltype mt; if ( !(sub = make_model(dtd, decl, &s)) ) { free_model(sub); return NULL; } decl = s; add_submodel(m, sub); if ( (s = isee_func(dtd, decl, CF_OR)) ) { decl = s; mt = MT_OR; } else if ( (s = isee_func(dtd, decl, CF_SEQ)) ) { decl = s; mt = MT_SEQ; } else if ( (s = isee_func(dtd, decl, CF_AND)) ) { decl = s; mt = MT_AND; } else if ( (s = isee_func(dtd, decl, CF_GRPC)) ) { decl = s; break; } else { gripe(ERC_SYNTAX_ERROR, L"Connector ('|', ',' or '&') expected", decl); free_model(m); return NULL; } decl = iskip_layout(dtd, decl); if ( m->type != mt ) { if ( !m->type ) m->type = mt; else { gripe(ERC_SYNTAX_ERROR, L"Different connector types in model", decl); free_model(m); return NULL; } } } } if ( (s = isee_func(dtd, decl, 
CF_OPT)) ) { decl = s; m->cardinality = MC_OPT; } else if ( (s=isee_func(dtd, decl, CF_REP)) ) { decl = s; m->cardinality = MC_REP; } else if ( (s=isee_func(dtd, decl, CF_PLUS)) ) { /* ROK: watch out for (x) +(y) */ if ( isee_func(dtd, iskip_layout(dtd, s), CF_GRPO) == NULL ) { decl = s; m->cardinality = MC_PLUS; } } else m->cardinality = MC_ONE; if ( m->type == MT_UNDEF ) /* simplify (e+), etc. */ { dtd_model *sub = m->content.group; modelcard card; assert(!sub->next); if ( sub->cardinality == MC_ONE ) card = m->cardinality; else if ( m->cardinality == MC_ONE ) card = sub->cardinality; else { m->type = MT_OR; goto out; } *m = *sub; m->cardinality = card; sgml_free(sub); } out: *end = iskip_layout(dtd, decl); return m; } static const ichar * process_model(dtd *dtd, dtd_edef *e, const ichar *decl) { const ichar *s; decl = iskip_layout(dtd, decl); if ( (s = isee_identifier(dtd, decl, "empty")) ) { e->type = C_EMPTY; return s; } if ( (s = isee_identifier(dtd, decl, "cdata")) ) { e->type = C_CDATA; return s; } if ( (s = isee_identifier(dtd, decl, "rcdata")) ) { e->type = C_RCDATA; return s; } if ( (s = isee_identifier(dtd, decl, "any")) ) { e->type = C_ANY; return s; } e->type = C_PCDATA; if ( !(e->content = make_model(dtd, decl, &decl)) ) return FALSE; return decl; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - See a name-group separator. As long as we haven't decided, this can be CF_NG. If we have decided they must all be the same. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * isee_ngsep(dtd *dtd, const ichar *decl, charfunc *sep) { const ichar *s; if ( (s=isee_func(dtd, decl, *sep)) ) return iskip_layout(dtd, s); if ( *sep == CF_NG ) /* undecided */ { static const charfunc ng[] = { CF_SEQ, CF_OR, CF_AND }; int n; for(n=0; n<3; n++) { if ( (s=isee_func(dtd, decl, ng[n])) ) { *sep = ng[n]; return iskip_layout(dtd, s); } } } return NULL; } static const ichar * itake_namegroup(dtd *dtd, const ichar *decl, dtd_symbol **names, int *n) { const ichar *s; int en = 0; if ( (s=isee_func(dtd, decl, CF_GRPO)) ) { charfunc ngs = CF_NG; for(;;) { if ( !(decl=itake_name(dtd, s, &names[en++])) ) { gripe(ERC_SYNTAX_ERROR, L"Name expected", s); return NULL; } if ( (s=isee_ngsep(dtd, decl, &ngs)) ) { decl = iskip_layout(dtd, s); continue; } if ( (s=isee_func(dtd, decl, CF_GRPC)) ) { *n = en; decl = s; return iskip_layout(dtd, decl); } gripe(ERC_SYNTAX_ERROR, L"Bad name-group", decl); return NULL; } } return NULL; } typedef struct { dtd_symbol **list; int size; } namelist; static void add_list_element(dtd_element *e, void *closure) { namelist *nl = closure; nl->list[nl->size++] = e->name; } static const ichar * itake_el_or_model_element_list(dtd *dtd, const ichar *decl, dtd_symbol **names, int *n) { const ichar *s; if ( isee_func(dtd, decl, CF_GRPO) ) { dtd_model *model; if ( (model = make_model(dtd, decl, &s)) ) { namelist nl; nl.list = names; nl.size = 0; for_elements_in_model(model, add_list_element, &nl); free_model(model); *n = nl.size; return s; } else return NULL; } else { if ( !(s = itake_name(dtd, decl, &names[0])) ) { gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); return NULL; } *n = 1; return s; } } static void add_element_list(dtd_element_list **l, dtd_element *e) { dtd_element_list *n = sgml_calloc(1, sizeof(*n)); n->value = e; for( ; *l; l = &(*l)->next ) ; *l = n; } static int process_element_declaraction(dtd_parser *p, const ichar *decl) { dtd 
*dtd = p->dtd; ichar buf[MAXDECL]; const ichar *s; dtd_symbol *eid[MAXATTELEM]; dtd_edef *def; int en; int i; /* expand parameter entities */ if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = buf; if ( !(s=itake_el_or_model_element_list(dtd, decl, eid, &en)) ) return gripe(ERC_SYNTAX_ERROR, L"Name or name-group expected", decl); decl = s; if ( en == 0 ) return TRUE; /* 0 elements */ STAT(edefs_decl++); def = new_element_definition(dtd); for(i=0; i<en; i++) { find_element(dtd, eid[i]); assert(eid[i]->element->structure == NULL); eid[i]->element->structure = def; eid[i]->element->undefined = FALSE; } def->references = en; /* for GC */ /* omitted tag declarations (opt) */ if ( (s = isee_identifier(dtd, decl, "-")) ) { def->omit_close = FALSE; goto seeclose; } else if ( (s = isee_identifier(dtd, decl, "o")) ) { def->omit_open = TRUE; seeclose: decl = s; if ( (s = isee_identifier(dtd, decl, "-")) ) { def->omit_close = FALSE; } else if ( (s = isee_identifier(dtd, decl, "o")) ) { for(i=0; i<en; i++) def->omit_close = TRUE; } else return gripe(ERC_SYNTAX_ERROR, L"Bad omit-tag declaration", decl); decl = s; } /* content model */ if ( !(decl=process_model(dtd, def, decl)) ) return FALSE; /* in/excluded elements */ if ( decl[0] == '-' || decl[0] == '+' ) { dtd_symbol *ng[MAXNAMEGROUP]; int ns; dtd_element_list **l; if ( decl[0] == '-' ) l = &def->excluded; else l = &def->included; decl++; if ( (s=itake_namegroup(dtd, decl, ng, &ns)) ) { int i; decl = s; for(i=0; i<ns; i++) add_element_list(l, find_element(dtd, ng[i])); } else { return gripe(ERC_SYNTAX_ERROR, L"Name group expected", decl); } } if (*decl) return gripe(ERC_SYNTAX_ERROR, L"Unexpected end of declaration", decl); return TRUE; } static void add_name_list(dtd_name_list **nl, dtd_symbol *s) { dtd_name_list *n = sgml_calloc(1, sizeof(*n)); n->value = s; for( ; *nl; nl = &(*nl)->next ) ; *nl = n; } static void set_element_properties(dtd_element *e, dtd_attr *a) { if ( 
istreq(a->name->name, L"xml:space") ) { switch(a->def) { case AT_FIXED: case AT_DEFAULT: break; default: return; } switch (a->type ) { case AT_NAMEOF: case AT_NAME: case AT_NMTOKEN: e->space_mode = istr_to_space_mode(a->att_def.name->name); break; case AT_CDATA: e->space_mode = istr_to_space_mode((ichar *)a->att_def.cdata); break; default: break; } } } static void add_attribute(dtd *dtd, dtd_element *e, dtd_attr *a) { dtd_attr_list **l; dtd_attr_list *n; for(l = &e->attributes; *l; l = &(*l)->next) { if ( (*l)->attribute->name == a->name ) { gripe(ERC_REDEFINED, L"attribute", a->name); a->references++; /* attempt to redefine attribute: */ free_attribute(a); /* first wins according to standard */ return; } } n = sgml_calloc(1, sizeof(*n)); n->attribute = a; a->references++; *l = n; set_element_properties(e, a); } static int process_attlist_declaraction(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *eid[MAXATTELEM]; int i, en; ichar buf[MAXDECL]; const ichar *s; /* expand parameter entities */ if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = iskip_layout(dtd, buf); DEBUG(printf("Expanded to %s\n", decl)); if ( !(decl=itake_el_or_model_element_list(dtd, decl, eid, &en)) ) return FALSE; /* fetch attributes */ while(*decl) { dtd_attr *at = sgml_calloc(1, sizeof(*at)); at->references = REFS_VIRGIN; /* name of attribute */ if ( !(s = itake_name(dtd, decl, &at->name)) ) { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); } decl = s; /* (name1|name2|...) 
type */ if ( (s=isee_func(dtd, decl, CF_GRPO)) ) { charfunc ngs = CF_NG; at->type = AT_NAMEOF; decl=s; for(;;) { dtd_symbol *nm; if ( !(s = itake_nmtoken(dtd, decl, &nm)) ) { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); } decl = s; add_name_list(&at->typeex.nameof, nm); if ( (s=isee_ngsep(dtd, decl, &ngs)) ) { decl = s; continue; } if ( (s = isee_func(dtd, decl, CF_GRPC)) ) { decl=s; decl = iskip_layout(dtd, decl); break; } free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Illegal name-group", decl); } } else if ( (s=isee_identifier(dtd, decl, "cdata")) ) { decl = s; at->type = AT_CDATA; } else if ( (s=isee_identifier(dtd, decl, "entity")) ) { decl = s; at->type = AT_ENTITY; } else if ( (s=isee_identifier(dtd, decl, "entities")) ) { decl = s; at->type = AT_ENTITIES; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "id")) ) { decl = s; at->type = AT_ID; } else if ( (s=isee_identifier(dtd, decl, "idref")) ) { decl = s; at->type = AT_IDREF; } else if ( (s=isee_identifier(dtd, decl, "idrefs")) ) { decl = s; at->type = AT_IDREFS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "name")) ) { decl = s; at->type = AT_NAME; } else if ( (s=isee_identifier(dtd, decl, "names")) ) { decl = s; at->type = AT_NAMES; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "nmtoken")) ) { decl = s; at->type = AT_NMTOKEN; } else if ( (s=isee_identifier(dtd, decl, "nmtokens")) ) { decl = s; at->type = AT_NMTOKENS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "number")) ) { decl = s; at->type = AT_NUMBER; } else if ( (s=isee_identifier(dtd, decl, "numbers")) ) { decl = s; at->type = AT_NUMBERS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "nutoken")) ) { decl = s; at->type = AT_NUTOKEN; } else if ( (s=isee_identifier(dtd, decl, "nutokens")) ) { decl = s; at->type = AT_NUTOKENS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "notation")) ) { dtd_symbol *ng[MAXNAMEGROUP]; int ns; 
at->type = AT_NOTATION; decl=s; if ( (s=itake_namegroup(dtd, decl, ng, &ns)) ) { decl = s; for(i=0; i<ns; i++) add_name_list(&at->typeex.nameof, ng[i]); } else { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"name-group expected", decl); } } else { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Attribute-type expected", decl); } /* Attribute Defaults */ if ( (s=isee_identifier(dtd, decl, "#fixed")) ) { decl = s; at->def = AT_FIXED; } else if ( (s=isee_identifier(dtd, decl, "#required")) ) { decl = s; at->def = AT_REQUIRED; } else if ( (s=isee_identifier(dtd, decl, "#current")) ) { decl = s; at->def = AT_CURRENT; } else if ( (s=isee_identifier(dtd, decl, "#conref")) ) { decl = s; at->def = AT_CONREF; } else if ( (s=isee_identifier(dtd, decl, "#implied")) ) { decl = s; at->def = AT_IMPLIED; } else /* real default */ at->def = AT_DEFAULT; if ( at->def == AT_DEFAULT || at->def == AT_FIXED ) { ichar buf[MAXSTRINGLEN]; ichar *start; int len; const ichar *end; if ( !(end=itake_string(dtd, decl, &start, &len)) ) { end=itake_nmtoken_chars(dtd, decl, buf, sizeof(buf)/sizeof(ichar)); start = buf; len = (int)istrlen(buf); } if ( !end ) return gripe(ERC_SYNTAX_ERROR, L"Bad attribute default", decl); /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Note: itake_name(), etc. work on nul-terminated strings. The result of itake_string() is a pointer in a nul-terminated string and these functions will stop scanning at the quote anyway, so we can use the length of the parsed data to verify we parsed all of it. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ switch(at->type) { case AT_CDATA: { at->att_def.cdata = istrndup(start, len); break; } case AT_ENTITY: case AT_NOTATION: case AT_NAME: { if ( !(s=itake_name(dtd, start, &at->att_def.name)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"name", decl); break; } case AT_NMTOKEN: case AT_NAMEOF: { if ( !(s=itake_nmtoken(dtd, start, &at->att_def.name)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"nmtoken", decl); break; } case AT_NUTOKEN: { if ( !(s=itake_nutoken(dtd, start, &at->att_def.name)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"nutoken", decl); break; } case AT_NUMBER: { if ( !(s=itake_number(dtd, start, at)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"number", decl); break; } case AT_NAMES: case AT_ENTITIES: case AT_IDREFS: case AT_NMTOKENS: case AT_NUMBERS: case AT_NUTOKENS: { at->att_def.list = istrndup(buf, len); break; } default: { free_attribute(at); return gripe(ERC_REPRESENTATION, L"No default for type"); } } decl = end; } /* add to list */ at->references = 0; for(i=0; i<en; i++) { dtd_element *e = def_element(dtd, eid[i]); add_attribute(dtd, e, at); } } return TRUE; } /******************************* * GENERIC TAG PROCESSING * *******************************/ typedef enum { IE_NORMAL, IE_INCLUDED, /* is included */ IE_EXCLUDED /* is excluded */ } includetype; static includetype in_or_excluded(sgml_environment *env, dtd_element *e) { for(; env; env=env->parent) { if ( env->element->structure ) { dtd_edef *def = env->element->structure; dtd_element_list *el; for(el=def->excluded; el; el=el->next) { if ( el->value == e ) return IE_EXCLUDED; } for(el=def->included; el; el=el->next) { if ( el->value == e ) return IE_INCLUDED; } } } return IE_NORMAL; } static int complete(sgml_environment *env) { if ( env->element->structure && !env->element->undefined && env->element->structure->type != C_ANY ) { dtd_edef *def = env->element->structure; if ( 
!same_state(def->final_state, env->state) ) return FALSE; } return TRUE; } static void validate_completeness(sgml_environment *env) { if ( !complete(env) ) { wchar_t buf[MAXNMLEN+50]; swprintf(buf, MAXNMLEN+50, L"Incomplete element: <%s>", env->element->name->name); gripe(ERC_VALIDATE, buf); /* TBD: expected */ } } static sgml_environment * push_element(dtd_parser *p, dtd_element *e, int callback) { if ( e != CDATA_ELEMENT ) { sgml_environment *env = sgml_calloc(1, sizeof(*env)); emit_cdata(p, FALSE); env->element = e; env->state = make_state_engine(e); env->space_mode = (p->environments ? p->environments->space_mode : p->dtd->space_mode); env->parent = p->environments; p->environments = env; if ( p->dtd->shorttag ) { env->saved_waiting_for_net = p->waiting_for_net; if ( p->event_class == EV_SHORTTAG ) { p->waiting_for_net = TRUE; env->wants_net = TRUE; } else { env->wants_net = FALSE; if ( e->structure && e->structure->omit_close == FALSE ) p->waiting_for_net = FALSE; } } if ( e->map ) p->map = env->map = e->map; else if ( env->parent ) p->map = env->map = env->parent->map; p->first = TRUE; if ( callback && p->on_begin_element ) { sgml_attribute atts[MAXATTRIBUTES]; int natts = 0; if ( !(p->flags & SGML_PARSER_NODEFS) ) natts = add_default_attributes(p, e, natts, atts); (*p->on_begin_element)(p, e, natts, atts); } if ( e->structure ) { if ( e->structure->type == C_CDATA || e->structure->type == C_RCDATA ) { p->state = (e->structure->type == C_CDATA ? S_CDATA : S_RCDATA); p->cdata_state = p->state; p->etag = e->name->name; p->etaglen = (int)istrlen(p->etag); sgml_cplocation(&p->startcdata, &p->location); } else p->cdata_state = S_PCDATA; } } return p->environments; } static void free_environment(sgml_environment *env) { #ifdef XMLNS if ( env->xmlns ) xmlns_free(env); #endif sgml_free(env); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Pop the stack, closing all environment uptil `to'. 
The close was initiated by pushing the element `e'. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int pop_to(dtd_parser *p, sgml_environment *to, dtd_element *e0) { sgml_environment *env, *parent; for(env = p->environments; env != to; env=parent) { dtd_element *e = env->element; validate_completeness(env); parent = env->parent; if ( e->structure && !e->structure->omit_close ) gripe(ERC_OMITTED_CLOSE, e->name->name); if ( e0 != CDATA_ELEMENT ) emit_cdata(p, TRUE); p->first = FALSE; p->environments = env; if ( p->dtd->shorttag ) p->waiting_for_net = env->saved_waiting_for_net; WITH_CLASS(p, EV_OMITTED, if ( p->on_end_element ) (*p->on_end_element)(p, e)); free_environment(env); } p->environments = to; p->map = to->map; return TRUE; } static void allow_for(dtd_element *in, dtd_element *e) { dtd_edef *def = in->structure; dtd_model *g; if ( def->type == C_EMPTY ) { def->type = C_PCDATA; def->content = sgml_calloc(1, sizeof(*def->content)); def->content->type = MT_OR; def->content->cardinality = MC_REP; } assert(def->content->type == MT_OR); g = def->content->content.group; if ( e == CDATA_ELEMENT ) { dtd_model *m; for(; g; g = g->next) { if ( g->type == MT_PCDATA ) return; } m = sgml_calloc(1, sizeof(*m)); m->type = MT_PCDATA; m->cardinality = MC_ONE; /* ignored */ add_submodel(def->content, m); } else { dtd_model *m; for(; g; g = g->next) { if ( g->type == MT_ELEMENT && g->content.element == e ) return; } m = sgml_calloc(1, sizeof(*m)); m->type = MT_ELEMENT; m->cardinality = MC_ONE; /* ignored */ m->content.element = e; add_submodel(def->content, m); } } static int open_element(dtd_parser *p, dtd_element *e, int warn) { if ( !p->environments && p->enforce_outer_element ) { dtd_element *f = p->enforce_outer_element->element; if ( f && f != e ) { if ( !f->structure || !f->structure->omit_open ) gripe(ERC_OMITTED_OPEN, f->name->name); WITH_CLASS(p, EV_OMITTED, { open_element(p, f, TRUE); if ( p->on_begin_element ) { sgml_attribute 
atts[MAXATTRIBUTES]; int natts = 0; if ( !(p->flags & SGML_PARSER_NODEFS) ) natts = add_default_attributes(p, f, natts, atts); (*p->on_begin_element)(p, f, natts, atts); } }); } } /* no DTD available yet */ if ( !p->environments && !p->dtd->doctype && e != CDATA_ELEMENT ) { const ichar *file; file = find_in_catalogue(CAT_DOCTYPE, e->name->name, NULL, NULL, p->dtd->dialect != DL_SGML); if ( file ) { dtd_parser *clone = clone_dtd_parser(p); gripe(ERC_NO_DOCTYPE, e->name->name, file); if ( load_dtd_from_file(clone, file) ) p->dtd->doctype = istrdup(e->name->name); else gripe(ERC_EXISTENCE, L"file", file); free_dtd_parser(clone); } } if ( p->environments ) { sgml_environment *env = p->environments; if ( env->element->undefined ) { allow_for(env->element, e); /* <!ELEMENT x - - (model) +(y)> */ push_element(p, e, FALSE); return TRUE; } if ( env->element->structure && env->element->structure->type == C_ANY ) { if ( e != CDATA_ELEMENT && e->undefined ) gripe(ERC_EXISTENCE, L"Element", e->name->name); push_element(p, e, FALSE); return TRUE; } switch(in_or_excluded(env, e)) { case IE_INCLUDED: push_element(p, e, FALSE); return TRUE; case IE_EXCLUDED: if ( warn ) gripe(ERC_NOT_ALLOWED, e->name->name); /*FALLTHROUGH*/ case IE_NORMAL: for(; env; env=env->parent) { dtd_state *new; if ( (new = make_dtd_transition(env->state, e)) ) { env->state = new; pop_to(p, env, e); push_element(p, e, FALSE); return TRUE; } else { dtd_element *oe[MAXOMITTED]; /* omitted open */ int olen; int i; if ( (olen=find_omitted_path(env->state, e, oe)) > 0 ) { pop_to(p, env, e); WITH_CLASS(p, EV_OMITTED, for(i=0; i<olen; i++) { env->state = make_dtd_transition(env->state, oe[i]); env = push_element(p, oe[i], TRUE); }) env->state = make_dtd_transition(env->state, e); push_element(p, e, FALSE); return TRUE; } } if ( !env->element->structure || !env->element->structure->omit_close ) break; } } if ( warn ) { if ( e == CDATA_ELEMENT ) gripe(ERC_VALIDATE, L"#PCDATA not allowed here"); else if ( e->undefined 
) gripe(ERC_EXISTENCE, L"Element", e->name->name); else gripe(ERC_NOT_ALLOWED, e->name->name); } } if ( warn ) { push_element(p, e, FALSE); return TRUE; } else return FALSE; } static int close_element(dtd_parser *p, dtd_element *e, int conref) { sgml_environment *env; for(env = p->environments; env; env=env->parent) { if ( env->element == e ) /* element is open */ { sgml_environment *parent; for(env = p->environments; ; env=parent) { dtd_element *ce = env->element; if ( !(conref && env == p->environments) ) validate_completeness(env); parent = env->parent; p->first = FALSE; if ( p->on_end_element ) (*p->on_end_element)(p, env->element); free_environment(env); p->environments = parent; if ( ce == e ) /* closing current element */ { p->map = (parent ? parent->map : NULL); return TRUE; } else /* omited close */ { if ( ce->structure && !ce->structure->omit_close ) gripe(ERC_OMITTED_CLOSE, ce->name->name); } } } } return gripe(ERC_NOT_OPEN, e->name->name); } static int close_current_element(dtd_parser *p) { if ( p->environments ) { dtd_element *e = p->environments->element; emit_cdata(p, TRUE); return close_element(p, e, FALSE); } return gripe(ERC_SYNTAX_ERROR, L"No element to close", ""); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - get_attribute_value() Get the value for an attribute. Once I thought this was simple, but Richard O'Keefe pointed to the complex handling of white-space in SGML attributes. Basically, if the attribute is quoted, we need: * If CDATA, map all blank to space characters, then expand entities * If !CDATA expand all entities, canonise white space by deleting leading and trailing space and squishing multiple space characters to a single (lower for us) case. This almost, but not completely matches the XML definition. This however is so complex we will ignore it for now. [Rewritten by Richard O'Keefe with these addional comments] Reads a value, the attribute name and value indicator having been processed already. 
It calls itake_string() to read quoted values, and itake_unquoted() to read unquoted values. itake_string(dtd, in, buf, size) - skips layout INCLUDING comments, - returns NULL if the next character is not ' or ", - copies characters from in to buf until a matching ' or " is found, - adds a terminating \0, - skips more layout INCLUDING comments, and - returns the new input position. It is quite wrong to skip leading comments here. In the tag <foo bar = --ugh-- zoo> the characters "--ugh--" *are the value*. They are not a comment. Comments are not in fact allowed inside tags, unfortunately. This tag is equivalent to <foo bar="--ugh--" something="zoo"> where something is an attribute that has zoo as one of its enumerals. Because itake_string() is called in many other places, this bug has not yet been fixed. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static ichar const * get_attribute_value(dtd_parser *p, ichar const *decl, sgml_attribute *att) { ichar tmp[MAXSTRINGLEN]; ichar *buf = tmp; ichar const *s; ichar c; dtd *dtd = p->dtd; ichar const *end; ichar *start; int len; enum { DIG_FIRST = 8, /* any token start with digit? */ NAM_FIRST = 4, /* any token start with non-digit name char? */ NAM_LATER = 2, /* any token have non-digit name char later? */ ANY_OTHER = 1, /* any token have illegal character? */ YET_EMPTY = 0 } token = YET_EMPTY; att->value.textW = NULL; /* UCS text */ att->value.number = 0; att->flags = 0; end = itake_string(dtd, decl, &start, &len); if ( end != NULL ) { ocharbuf out; init_ocharbuf(&out); expand_entities(p, start, len, &out); if ( att->definition->type == AT_CDATA ) { malloc_ocharbuf(&out); att->value.number = out.size; att->value.textW = out.data.w; return end; } else { ichar *d; buf = out.data.w; /* canonicalise blanks */ s = buf; while ((c = *s++) != '\0' && HasClass(dtd, c, CH_BLANK)) ; d = buf; while ( c != '\0' ) { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST : HasClass(dtd, c, CH_NAME) ? 
NAM_FIRST : /* oops! */ ANY_OTHER; if ( d != buf ) *d++ = ' '; if ( dtd->case_sensitive ) { *d++ = c; while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK)) { token |= HasClass(dtd, c, CH_DIGIT) ? 0 : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; *d++ = c; } } else { *d++ = towlower(c); while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK)) { token |= HasClass(dtd, c, CH_DIGIT) ? 0 : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; *d++ = towlower(c); } } while (c != '\0' && HasClass(dtd, c, CH_BLANK)) c = *s++; } *d = '\0'; } } else { end = itake_unquoted(dtd, decl, tmp, sizeof(tmp)/sizeof(ichar)); if (end == NULL) return NULL; s = buf; c = *s++; if (c != '\0') { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST : HasClass(dtd, c, CH_NAME) ? NAM_FIRST : /* oops! */ ANY_OTHER; while ((c = *s++) != 0) { token |= HasClass(dtd, c, CH_DIGIT) ? 0 : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; } } if ( token == YET_EMPTY || (token & ANY_OTHER) != 0) gripe(ERC_SYNTAX_WARNING, L"Attribute value requires quotes", buf); if (!dtd->case_sensitive && att->definition->type != AT_CDATA) istrlower(buf); } switch (att->definition->type) { case AT_NUMBER: /* number */ if (token != DIG_FIRST) { gripe(ERC_SYNTAX_WARNING, L"NUMBER expected", decl); } else if (dtd->number_mode == NU_INTEGER) { (void) istrtol(buf, &att->value.number); } else { att->value.textW = istrdup(buf); att->value.number = (long)istrlen(buf); } return end; case AT_CDATA: /* CDATA attribute */ att->value.textW = istrdup(buf); att->value.number = (long)istrlen(buf); return end; case AT_ID: /* identifier */ case AT_IDREF: /* identifier reference */ case AT_NAME: /* name token */ case AT_NOTATION: /* notation-name */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NAME expected", decl); break; case AT_NAMEOF: /* one of these names */ case AT_NMTOKEN: /* name-token */ if (token == YET_EMPTY || (token & ANY_OTHER) != 0) 
gripe(ERC_SYNTAX_WARNING, L"NMTOKEN expected", decl); if ( att->definition->type == AT_NAMEOF ) { dtd_name_list *nl; for(nl=att->definition->typeex.nameof; nl; nl = nl->next) { if ( istreq(nl->value->name, buf) ) goto passed; } gripe(ERC_SYNTAX_WARNING, L"unexpected value", decl); } break; case AT_NUTOKEN: /* number token */ if ((token & (NAM_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NUTOKEN expected", decl); break; case AT_ENTITY: /* entity-name */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"entity NAME expected", decl); break; case AT_NAMES: /* list of names */ case AT_IDREFS: /* list of identifier references */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NAMES expected", decl); break; case AT_ENTITIES: /* entity-name list */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"entity NAMES expected", decl); break; case AT_NMTOKENS: /* name-token list */ if (token == YET_EMPTY || (token & ANY_OTHER) != 0) gripe(ERC_SYNTAX_WARNING, L"NMTOKENS expected", decl); break; case AT_NUMBERS: /* number list */ if (token != DIG_FIRST) gripe(ERC_SYNTAX_WARNING, L"NUMBERS expected", decl); break; case AT_NUTOKENS: if ((token & (NAM_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NUTOKENS expected", decl); break; default: assert(0); return NULL; } passed: att->value.textW = istrdup(buf); /* TBD: more validation */ att->value.number = (long)istrlen(buf); return end; } static const ichar * process_attributes(dtd_parser *p, dtd_element *e, const ichar *decl, sgml_attribute *atts, int *argc) { int attn = 0; dtd *dtd = p->dtd; decl = iskip_layout(dtd, decl); while(decl && *decl) { dtd_symbol *nm; const ichar *s; if ( (s=itake_nmtoken(dtd, decl, &nm)) ) { decl = s; if ( (s=isee_func(dtd, decl, CF_VI)) ) /* name= */ { dtd_attr *a; if ( !HasClass(dtd, nm->name[0], CH_NMSTART) ) gripe(ERC_SYNTAX_WARNING, "Illegal start of 
attribute-name", decl); decl = s; if ( !(a=find_attribute(e, nm)) ) { a = sgml_calloc(1, sizeof(*a)); a->name = nm; a->type = AT_CDATA; a->def = AT_IMPLIED; add_attribute(dtd, e, a); if ( !e->undefined && !(dtd->dialect != DL_SGML && (istreq(L"xmlns", nm->name) || istrprefix(L"xmlns:", nm->name))) ) gripe(ERC_NO_ATTRIBUTE, e->name->name, nm->name); } atts[attn].definition = a; if ( (decl=get_attribute_value(p, decl, atts+attn)) ) { attn++; continue; } } else if ( e->structure ) { dtd_attr_list *al; /* value shorthand */ for(al=e->attributes; al; al=al->next) { dtd_attr *a = al->attribute; if ( a->type == AT_NAMEOF || a->type == AT_NOTATION ) { dtd_name_list *nl; for(nl=a->typeex.nameof; nl; nl = nl->next) { if ( nl->value == nm ) { if ( dtd->dialect != DL_SGML ) gripe(ERC_SYNTAX_WARNING, "Value short-hand in XML mode", decl); atts[attn].flags = 0; atts[attn].definition = a; atts[attn].value.textW = istrdup(nm->name); atts[attn].value.number = (long)istrlen(nm->name); attn++; goto next; } } } } gripe(ERC_NO_ATTRIBUTE_VALUE, e->name->name, nm->name); decl = s; } else { gripe(ERC_SYNTAX_ERROR, L"Bad attribute", decl); decl = s; } } else { *argc = attn; return decl; } next: ; } *argc = attn; return decl; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - sgml_add_default_attributes() This function adds attributes for omitted default and fixed attributes. These attributes are added to the end of the attribute list. This function returns the new number of attributes. The `atts' array is assumed to be MAXATTRIBUTES long, normally passed from process_begin_element. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int add_default_attributes(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts) { dtd_attr_list *al; if ( e == CDATA_ELEMENT ) return natts; for(al=e->attributes; al; al=al->next) { dtd_attr *a = al->attribute; switch(a->def) { case AT_REQUIRED: /* TBD: check if present */ case AT_CURRENT: /* TBD: register in DTD and reuse */ case AT_CONREF: case AT_IMPLIED: goto next; case AT_FIXED: case AT_DEFAULT: { int i; sgml_attribute *ap; for(i=0, ap=atts; i<natts; i++, ap++) { if ( ap->definition == a ) goto next; } ap->definition = a; ap->value.textW = NULL; ap->value.number = 0; ap->flags = SGML_AT_DEFAULT; switch(a->type) { case AT_CDATA: ap->value.textW = a->att_def.cdata; ap->value.number = (long)istrlen(ap->value.textW); break; case AT_NUMBER: if ( p->dtd->number_mode == NU_TOKEN ) { ap->value.textW = (ichar*)a->att_def.name->name; ap->value.number = (long)istrlen(ap->value.textW); } else { ap->value.number = a->att_def.number; } break; default: if ( a->islist ) { ap->value.textW = a->att_def.list; } else { ap->value.textW = (ichar*)a->att_def.name->name; } ap->value.number = (long)istrlen(ap->value.textW); } natts++; } } next:; } return natts; } static void free_attribute_values(int argc, sgml_attribute *argv) { int i; for(i=0; i<argc; i++, argv++) { if ( (argv->flags & SGML_AT_DEFAULT) ) continue; /* shared with the DTD */ if ( argv->value.textW ) sgml_free(argv->value.textW); } } static int process_begin_element(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *id; const ichar *s; if ( (s=itake_name(dtd, decl, &id)) ) { sgml_attribute atts[MAXATTRIBUTES]; int natts; dtd_element *e = find_element(dtd, id); int empty = FALSE; int conref = FALSE; if ( !e->structure ) { dtd_edef *def; e->undefined = TRUE; STAT(edefs_implicit++); def_element(dtd, id); def = e->structure; def->type = C_EMPTY; } open_element(p, e, TRUE); decl=s; if ( (s=process_attributes(p, e, 
decl, atts, &natts)) ) decl=s; if ( dtd->dialect != DL_SGML ) { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) ) { empty = TRUE; /* XML <tag/> */ decl = s; } #ifdef XMLNS if ( dtd->dialect == DL_XMLNS ) update_xmlns(p, e, natts, atts); #endif if ( dtd->dialect != DL_SGML ) update_space_mode(p, e, natts, atts); } else { int i; for(i=0; i<natts; i++) { if ( atts[i].definition->def == AT_CONREF ) { empty = TRUE; conref = TRUE; } } } if ( *decl ) gripe(ERC_SYNTAX_ERROR, L"Bad attribute list", decl); if ( !(p->flags & SGML_PARSER_NODEFS) ) natts = add_default_attributes(p, e, natts, atts); if ( empty || (dtd->dialect == DL_SGML && e->structure && e->structure->type == C_EMPTY && !e->undefined) ) p->empty_element = e; else p->empty_element = NULL; if ( p->on_begin_element ) (*p->on_begin_element)(p, e, natts, atts); free_attribute_values(natts, atts); if ( p->empty_element ) { p->empty_element = NULL; close_element(p, e, conref); if ( conref ) /* might be S_CDATA due to declared content */ p->cdata_state = p->state = S_PCDATA; } return TRUE; } return gripe(ERC_SYNTAX_ERROR, L"Bad open-element tag", decl); } static int process_end_element(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *id; const ichar *s; emit_cdata(p, TRUE); if ( (s=itake_name(dtd, decl, &id)) && *s == '\0' ) return close_element(p, find_element(dtd, id), FALSE); if ( p->dtd->shorttag && *decl == '\0' ) /* </>: close current element */ return close_current_element(p); return gripe(ERC_SYNTAX_ERROR, L"Bad close-element tag", decl); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - process_net(dtd_parser *p) We've seen a / of a shorttag element. Close this one. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int process_net(dtd_parser *p) { sgml_environment *env; prepare_cdata(p); for(env = p->environments; env; env=env->parent) { if ( env->wants_net ) { sgml_environment *parent; pop_to(p, env, NULL); /* close parents */ validate_completeness(env); parent = env->parent; emit_cdata(p, TRUE); p->first = FALSE; if ( p->on_end_element ) { WITH_CLASS(p, EV_SHORTTAG, (*p->on_end_element)(p, env->element)); } free_environment(env); p->environments = parent; p->map = (parent ? parent->map : NULL); return TRUE; } } return FALSE; } static int /* <!DOCTYPE ...> */ process_doctype(dtd_parser *p, const ichar *decl, const ichar *decl0) { dtd *dtd = p->dtd; dtd_symbol *id; const ichar *s; dtd_entity *et = NULL; if ( !(s=itake_name(dtd, decl, &id)) ) return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); decl = s; if ( (s=isee_identifier(dtd, decl, "system")) ) { et = sgml_calloc(1, sizeof(*et)); et->type = ET_SYSTEM; decl = s; } else if ( (s=isee_identifier(dtd, decl, "public")) ) { et = sgml_calloc(1, sizeof(*et)); et->type = ET_PUBLIC; decl = s; } else if ( isee_func(dtd, decl, CF_DSO) ) goto local; if ( et ) { et->name = id; et->catalog_location = CAT_DOCTYPE; if ( !(s=process_entity_value_declaration(p, decl, et)) ) return FALSE; decl = s; } if ( !dtd->doctype ) /* i.e. anonymous DTD */ { ichar *file; dtd_parser *clone; dtd->doctype = istrdup(id->name); /* Fill it */ if ( et ) file = entity_file(dtd, et); else file = istrdup(find_in_catalogue(CAT_DOCTYPE, dtd->doctype, NULL, NULL, dtd->dialect != DL_SGML)); if ( !file ) { gripe(ERC_EXISTENCE, L"DTD", dtd->doctype); } else { clone = clone_dtd_parser(p); if ( !load_dtd_from_file(clone, file) ) gripe(ERC_EXISTENCE, L"file", file); free_dtd_parser(clone); sgml_free(file); } } if ( et ) free_entity_list(et); local: if ( (s=isee_func(dtd, decl, CF_DSO)) ) /* [...] 
*/ { int grouplevel = 1; data_mode oldmode = p->dmode; dtdstate oldstate = p->state; locbuf oldloc; const ichar *q; icharbuf *saved_ibuf = p->buffer; push_location(p, &oldloc); /* try to find start-location. */ /* fails if there is comment before */ /* the []! */ sgml_cplocation(&p->location, &p->startloc); inc_location(&p->location, '<'); for(q=decl0; q < s; q++) inc_location(&p->location, *q); p->dmode = DM_DTD; p->state = S_PCDATA; p->buffer = new_icharbuf(); for( ; *s; s++ ) { if ( isee_func(dtd, s, CF_LIT) || /* skip quoted strings */ isee_func(dtd, s, CF_LITA) ) { ichar q = *s; putchar_dtd_parser(p, *s++); /* pass open quote */ for( ; *s && *s != q; s++ ) putchar_dtd_parser(p, *s); if ( *s == q ) /* pass closing quote */ putchar_dtd_parser(p, *s); continue; } if ( isee_func(dtd, s, CF_DSO) ) grouplevel++; else if ( isee_func(dtd, s, CF_DSC) && --grouplevel == 0 ) break; putchar_dtd_parser(p, *s); } p->dtd->implicit = FALSE; p->state = oldstate; p->dmode = oldmode; free_icharbuf(p->buffer); p->buffer = saved_ibuf; pop_location(p, &oldloc); } p->enforce_outer_element = id; /* make this the outer element */ return TRUE; } static void init_decoding(dtd_parser *p) { #ifdef UTF8 int decode; dtd *dtd = p->dtd; if ( dtd->encoding == SGML_ENC_UTF8 && p->encoded == TRUE ) decode = TRUE; else decode = FALSE; if ( p->utf8_decode != decode ) { DEBUG(fprintf(stderr, "%s UTF-8 decoding on %p\n", decode ? "Enable" : "Disable", p)); p->utf8_decode = decode; } #endif } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - xml_set_encoding() is the public interface to set the encoding for the parser. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int /* strcasecmp() with C locale */ posix_strcasecmp(const char *s1, const char *s2) { for(; *s1 && *s2; s1++, s2++) { int c1 = *s1&0xff; int c2 = *s2&0xff; if ( c1 >= 'A' && c1 <= 'Z' ) c1 += 'a'-'A'; if ( c2 >= 'A' && c2 <= 'Z' ) c2 += 'a'-'A'; if ( c1 != c2 ) return c1-c2; } return *s1 - *s2; } int xml_set_encoding(dtd_parser *p, const char *enc) { dtd *dtd = p->dtd; if ( posix_strcasecmp(enc, "iso-8859-1") == 0 ) { dtd->encoding = SGML_ENC_ISO_LATIN1; } else if ( posix_strcasecmp(enc, "us-ascii") == 0 ) { dtd->encoding = SGML_ENC_ISO_LATIN1; /* doesn't make a difference */ } else if ( posix_strcasecmp(enc, "utf-8") == 0 ) { dtd->encoding = SGML_ENC_UTF8; } else return FALSE; init_decoding(p); return TRUE; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - set_encoding() sets the encoding from the encoding="..." field of the XML header. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static void set_encoding(dtd_parser *p, const ichar *enc) { char buf[32]; char *e = buf+sizeof(buf)-1; char *o; const ichar *i; for(i=enc, o=buf; *i; ) { if ( *i < 128 && o < e ) { *o++ = (char)*i++; } else { goto error; } } *o = '\0'; if ( !xml_set_encoding(p, buf) ) { error: gripe(ERC_EXISTENCE, L"character encoding", enc); } } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Process <? ... ?> Should deal with character encoding for XML documents. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int process_pi(dtd_parser *p, const ichar *decl) { const ichar *s; dtd *dtd = p->dtd; if ( (s=isee_identifier(dtd, decl, "xml")) ) /* <?xml version="1.0"?> */ { decl = s; switch(dtd->dialect) { case DL_SGML: set_dialect_dtd(dtd, DL_XML); break; case DL_XML: case DL_XMLNS: break; } while(*decl) { dtd_symbol *nm; if ( (s=itake_name(dtd, decl, &nm)) && (s=isee_func(dtd, s, CF_VI)) ) /* = */ { ichar *start; int len; ichar buf[MAXSTRINGLEN]; const ichar *end; if ( !(end=itake_string(dtd, s, &start, &len)) ) { end=itake_nmtoken_chars(dtd, s, buf, sizeof(buf)/sizeof(ichar)); start = buf; len = (int)istrlen(buf); } if ( end ) { decl = end; if ( istrcaseeq(nm->name, L"encoding") ) { ichar tmp[32]; if ( len < (int)(sizeof(tmp)/sizeof(ichar)-1) ) { istrncpy(tmp, start, len); tmp[len] = 0; set_encoding(p, tmp); } else { gripe(ERC_SYNTAX_ERROR, L"Unterminated encoding?", decl); } } /* fprintf(stderr, "XML %s = %s\n", nm->name, buf); */ continue; } } gripe(ERC_SYNTAX_ERROR, L"Illegal XML parameter", decl); break; } return TRUE; } if ( p->on_pi ) (*p->on_pi)(p, decl); return FALSE; /* Warn? */ } static int process_sgml_declaration(dtd_parser *p, const ichar *decl) { return gripe(ERC_SYNTAX_WARNING, L"Ignored <!SGML ...> declaration", NULL); } static int process_declaration(dtd_parser *p, const ichar *decl) { const ichar *s; dtd *dtd = p->dtd; if ( p->dmode != DM_DTD ) { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) ) /* </ ... > */ { return process_end_element(p, s); } else if ( HasClass(dtd, *decl, CH_NAME) ) /* <letter */ { return process_begin_element(p, decl); } } if ( (s=isee_func(dtd, decl, CF_MDO2)) ) /* <! ... 
>*/ { decl = s; if ( p->on_decl ) (*p->on_decl)(p, decl); if ( (s = isee_identifier(dtd, decl, "entity")) ) process_entity_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "element")) ) process_element_declaraction(p, s); else if ( (s = isee_identifier(dtd, decl, "attlist")) ) process_attlist_declaraction(p, s); else if ( (s = isee_identifier(dtd, decl, "notation")) ) process_notation_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "shortref")) ) process_shortref_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "usemap")) ) process_usemap_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "sgml")) ) process_sgml_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "doctype")) ) { if ( p->dmode != DM_DTD ) process_doctype(p, s, decl-1); } else { s = iskip_layout(dtd, decl); if ( *s ) gripe(ERC_SYNTAX_ERROR, L"Invalid declaration", s); } return TRUE; } return gripe(ERC_SYNTAX_ERROR, L"Invalid declaration", decl); } /******************************* * STREAM BINDING * *******************************/ static dtd_parser *current_parser; /* For gripes */ void set_file_dtd_parser(dtd_parser *p, input_type type, const ichar *name) { p->location.type = type; p->location.name.file = name; p->location.line = 1; p->location.linepos = 0; p->location.charpos = 0; } static void set_src_dtd_parser(dtd_parser *p, input_type type, const ichar *name) { p->location.type = type; p->location.name.entity = name; p->location.line = 1; p->location.linepos = 0; p->location.charpos = 0; } void set_mode_dtd_parser(dtd_parser *p, data_mode m) { p->dmode = m; /* DM_DTD or DM_DATA */ p->state = S_PCDATA; p->blank_cdata = TRUE; } dtd_parser * new_dtd_parser(dtd *dtd) { dtd_parser *p = sgml_calloc(1, sizeof(*p)); if ( !dtd ) dtd = new_dtd(NULL); dtd->references++; p->magic = SGML_PARSER_MAGIC; p->dtd = dtd; p->state = S_PCDATA; p->mark_state = MS_INCLUDE; p->dmode = DM_DTD; p->encoded = TRUE; /* encoded octet stream */ p->buffer = 
new_icharbuf(); p->cdata = new_ocharbuf(); p->event_class = EV_EXPLICIT; set_src_dtd_parser(p, IN_NONE, NULL); return p; } static dtd_parser * clone_dtd_parser(dtd_parser *p) { dtd_parser *clone = sgml_calloc(1, sizeof(*p)); *clone = *p; clone->dtd->references++; clone->environments = NULL; clone->marked = NULL; clone->etag = NULL; clone->grouplevel = 0; clone->state = S_PCDATA; clone->mark_state = MS_INCLUDE; clone->dmode = DM_DTD; clone->buffer = new_icharbuf(); clone->cdata = new_ocharbuf(); return clone; } void free_dtd_parser(dtd_parser *p) { free_icharbuf(p->buffer); free_ocharbuf(p->cdata); free_dtd(p->dtd); sgml_free(p); } static int process_chars(dtd_parser *p, input_type in, const ichar *name, const ichar *s) { locbuf old; push_location(p, &old); set_src_dtd_parser(p, in, name); empty_icharbuf(p->buffer); /* dubious */ for(; *s; s++) putchar_dtd_parser(p, *s); pop_location(p, &old); return TRUE; } static int process_include(dtd_parser *p, const ichar *entity_name) { dtd_symbol *id; dtd_entity *pe; dtd *dtd = p->dtd; if ( (id=dtd_find_entity_symbol(dtd, entity_name)) && (pe=find_pentity(p->dtd, id)) ) { ichar *file; if ( (file = entity_file(dtd, pe)) ) { int rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT); sgml_free(file); return rc; } else { const ichar *text = entity_value(p, pe, NULL); if ( !text ) return gripe(ERC_NO_VALUE, pe->name->name); return process_chars(p, IN_ENTITY, entity_name, text); } } return gripe(ERC_EXISTENCE, L"parameter entity", entity_name); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Process <![ KEYWORD [ Switches ->mark_state according to KEYWORD. 
Processes the rest in normal S_PCDATA style, which pops the mark-stack on
seeing ]]>

For the purpose of <!DOCTYPE spec [additions]>  we  switch to S_GROUP if
the buffered declaration starts with <! but is not followed by a [.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static void
process_marked_section(dtd_parser *p)
{ ichar expanded[MAXDECL];
  dtd *dtd = p->dtd;
  const ichar *after_mdo;               /* just behind `!', or NULL */
  const ichar *after_dso = NULL;        /* just behind `![', or NULL */

  after_mdo = isee_func(dtd, p->buffer->data, CF_MDO2);
  if ( after_mdo )
    after_dso = isee_func(dtd, after_mdo, CF_DSO);

  if ( after_dso &&
       expand_pentities(p, after_dso, ZERO_TERM_LEN,
                        expanded, sizeof(expanded)/sizeof(ichar)) )
  { dtd_symbol *keyword;
    const ichar *rest = itake_name(dtd, expanded, &keyword);

    if ( rest && isee_func(dtd, rest, CF_DSO) ) /* KEYWORD [ */
    { dtd_marked *section = sgml_calloc(1, sizeof(*section));

      section->keyword = keyword;       /* push on the stack */
      section->parent  = p->marked;
      p->marked        = section;

      /* INCLUDE, TEMP and any unknown keyword mean MS_INCLUDE */
      section->type = MS_INCLUDE;
      if ( istrcaseeq(keyword->name, L"IGNORE") )
        section->type = MS_IGNORE;
      else if ( istrcaseeq(keyword->name, L"INCLUDE") )
        section->type = MS_INCLUDE;
      else if ( istrcaseeq(keyword->name, L"TEMP") )
        section->type = MS_INCLUDE;
      else if ( istrcaseeq(keyword->name, L"CDATA") )
        section->type = MS_CDATA;
      else if ( istrcaseeq(keyword->name, L"RCDATA") )
        section->type = MS_RCDATA;

      empty_icharbuf(p->buffer);
      p->state = (section->type == MS_CDATA ? S_MSCDATA : S_PCDATA);

      /* an enclosing IGNORE section keeps everything inside ignored */
      if ( p->mark_state != MS_IGNORE )
        p->mark_state = section->type;
    }

    return;
  }

  if ( after_mdo && !after_dso )        /* <! not followed by [ */
  { p->state = S_GROUP;
    p->grouplevel = 1;
  }
}


/* Pop the innermost marked section, if any, and restore ->mark_state to
   the type of the enclosing section, or MS_INCLUDE if none remains.
*/

static void
pop_marked_section(dtd_parser *p)
{ dtd_marked *top = p->marked;

  if ( !top )
    return;

  p->marked = top->parent;
  sgml_free(top);

  if ( p->marked )
    p->mark_state = p->marked->type;
  else
    p->mark_state = MS_INCLUDE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Update the space-mode for the current element.  The space mode defines
how spaces are handled in the CDATA output.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static dtd_space_mode istr_to_space_mode(const ichar *val) { if ( istreq(val, L"default") ) return SP_DEFAULT; if ( istreq(val, L"preserve") ) return SP_PRESERVE; if ( istreq(val, L"sgml") ) return SP_SGML; if ( istreq(val, L"remove") ) return SP_REMOVE; return SP_INHERIT; /* interpret as error */ } static void update_space_mode(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts) { for( ; natts-- > 0; atts++ ) { const ichar *name = atts->definition->name->name; if ( istreq(name, L"xml:space") && atts->definition->type == AT_CDATA && atts->value.textW ) { dtd_space_mode m = istr_to_space_mode(atts->value.textW); if ( m != SP_INHERIT ) p->environments->space_mode = m; else gripe(ERC_EXISTENCE, L"xml:space-mode", atts->value.textW); return; } } if ( e->space_mode != SP_INHERIT ) p->environments->space_mode = e->space_mode; } static void empty_cdata(dtd_parser *p) { if ( p->dmode == DM_DATA ) { empty_ocharbuf(p->cdata); p->blank_cdata = TRUE; p->cdata_must_be_empty = FALSE; } } static void cb_cdata(dtd_parser *p, ocharbuf *buf, int offset, int size) { if ( p->on_data ) (*p->on_data)(p, EC_CDATA, size, buf->data.w+offset); } static int emit_cdata(dtd_parser *p, int last) { dtd *dtd = p->dtd; locbuf locsafe; ocharbuf *cdata = p->cdata; int offset = 0; int size = cdata->size; if ( size == 0 ) return TRUE; /* empty or done */ push_location(p, &locsafe); sgml_cplocation(&p->location, &p->startloc); /* start of markup */ sgml_cplocation(&p->startloc, &p->startcdata); /* real start of CDATA */ if ( p->environments ) { switch(p->environments->space_mode) { case SP_SGML: case SP_DEFAULT: if ( p->first ) { wint_t c = fetch_ocharbuf(cdata, offset); if ( HasClass(dtd, c, CH_RE) ) { inc_location(&p->startloc, c); offset++; size--; c = fetch_ocharbuf(cdata, offset); } if ( HasClass(dtd, c, CH_RS) ) { inc_location(&p->startloc, c); offset++; size--; } } if ( last && size > 0 ) { wint_t c = 
fetch_ocharbuf(cdata, offset+size-1); if ( HasClass(dtd, c, CH_RS) ) { dec_location(&p->location, c); size--; poke_ocharbuf(cdata, offset+size, '\0'); if ( size > 0 ) c = fetch_ocharbuf(cdata, offset+size-1); else c = 0; /* HasClass(CH_RE) must fail */ } if ( HasClass(dtd, c, CH_RE) ) { dec_location(&p->location, c); size--; poke_ocharbuf(cdata, offset+size, '\0'); } } if ( p->environments->space_mode == SP_DEFAULT ) { int o = 0; int i; for(i=0; i<size; i++) { wint_t c = fetch_ocharbuf(cdata, offset+i); if ( HasClass(dtd, c, CH_BLANK) ) { for(i++; i<size; i++) { wint_t c = fetch_ocharbuf(cdata, offset+i); if ( !HasClass(dtd, c, CH_BLANK) ) break; } i--; poke_ocharbuf(cdata, o++, ' '); continue; } poke_ocharbuf(cdata, o++, c); } poke_ocharbuf(cdata, o, '\0'); offset = 0; /* wrote new output from offset=0 */ size = o; } break; case SP_REMOVE: { int o = 0; int i; int end = 0; for(i=0; i<size; i++) { wint_t c = fetch_ocharbuf(cdata, offset+i); if ( HasClass(dtd, c, CH_BLANK) ) inc_location(&p->startloc, c); else break; } if ( i<size ) { for(; i<size; i++) { wint_t c = fetch_ocharbuf(cdata, offset+i); if ( HasClass(dtd, c, CH_BLANK) ) { i++; while(i<size && HasClass(dtd, (wint_t)fetch_ocharbuf(cdata, offset+i), CH_BLANK)) i++; i--; poke_ocharbuf(cdata, o++, ' '); continue; } poke_ocharbuf(cdata, o++, c); end = o; } } /* TBD: adjust end */ poke_ocharbuf(cdata, end, '\0'); size = end; break; } case SP_PRESERVE: break; case SP_INHERIT: assert(0); return FALSE; } } if ( size == 0 ) { pop_location(p, &locsafe); empty_cdata(p); return TRUE; } assert(size > 0); if ( !p->blank_cdata ) { if ( p->cdata_must_be_empty ) { gripe(ERC_NOT_ALLOWED_PCDATA, p->cdata); /* TBD: now passes buffer! 
*/ } cb_cdata(p, cdata, offset, size); } else if ( p->environments ) { sgml_environment *env = p->environments; dtd_state *new; /* If an element is not in the DTD we must */ /* assume mixed content and emit spaces */ if ( (new=make_dtd_transition(env->state, CDATA_ELEMENT)) ) { env->state = new; cb_cdata(p, cdata, offset, size); } else if ( env->element->undefined && p->environments->space_mode == SP_PRESERVE ) { cb_cdata(p, cdata, offset, size); } } pop_location(p, &locsafe); empty_cdata(p); return TRUE; } static int prepare_cdata(dtd_parser *p) { if ( p->cdata->size == 0 ) return TRUE; terminate_ocharbuf(p->cdata); if ( p->mark_state == MS_INCLUDE ) { dtd *dtd = p->dtd; if ( p->environments ) /* needed for <img> <img> */ { dtd_element *e = p->environments->element; if ( e->structure && e->structure->type == C_EMPTY && !e->undefined ) close_element(p, e, FALSE); } if ( p->blank_cdata == TRUE ) { int blank = TRUE; int i; for(i=0; i<p->cdata->size; i++) { wint_t c = fetch_ocharbuf(p->cdata, i); if ( !HasClass(dtd, c, CH_BLANK) ) { blank = FALSE; break; } } p->blank_cdata = blank; if ( !blank ) { if ( p->dmode == DM_DTD ) gripe(ERC_SYNTAX_ERROR, L"CDATA in DTD", p->cdata->data); else open_element(p, CDATA_ELEMENT, TRUE); } } } return TRUE; } static int process_cdata(dtd_parser *p, int last) { int rc; WITH_PARSER(p, (prepare_cdata(p), rc=emit_cdata(p, last))); return rc; } static int process_entity(dtd_parser *p, const ichar *name) { if ( name[0] == '#' ) /* #charcode: character entity */ { int v = char_entity_value(name); if ( v <= 0 ) return gripe(ERC_SYNTAX_ERROR, L"Bad character entity", name); add_ocharbuf(p->cdata, v); } else { dtd_symbol *id; dtd_entity *e; dtd *dtd = p->dtd; int len; const ichar *text; const ichar *s; int chr; ichar *file; if ( !(id=dtd_find_entity_symbol(dtd, name)) || !(e=id->entity) ) { if ( dtd->default_entity ) e = dtd->default_entity; else return gripe(ERC_EXISTENCE, L"entity", name); } if ( !e->value && e->content == EC_SGML && 
(file=entity_file(p->dtd, e)) ) { int rc; empty_icharbuf(p->buffer); /* dubious */ rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT); sgml_free(file); return rc; } if ( !(text = entity_value(p, e, &len)) ) return gripe(ERC_NO_VALUE, e->name->name); switch ( e->content ) { case EC_SGML: case EC_CDATA: if ( (s=isee_character_entity(dtd, text, &chr)) && *s == '\0' ) { if ( chr == 0 ) return gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", text); if ( p->blank_cdata == TRUE && !HasClass(dtd, (wint_t)chr, CH_BLANK) ) { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); p->blank_cdata = FALSE; } add_ocharbuf(p->cdata, chr); return TRUE; } if ( e->content == EC_SGML ) { locbuf oldloc; int decode = p->utf8_decode; push_location(p, &oldloc); p->utf8_decode = FALSE; set_src_dtd_parser(p, IN_ENTITY, e->name->name); empty_icharbuf(p->buffer); /* dubious */ for(s=text; *s; s++) putchar_dtd_parser(p, *s); p->utf8_decode = decode; pop_location(p, &oldloc); } else if ( *text ) { const ichar *o; if ( p->blank_cdata == TRUE ) { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); p->blank_cdata = FALSE; } for(o=text; *o; o++) add_ocharbuf(p->cdata, *o); } break; case EC_SDATA: case EC_NDATA: process_cdata(p, FALSE); if ( p->on_data ) (*p->on_data)(p, e->content, len, text); break; case EC_PI: process_cdata(p, FALSE); if ( p->on_pi ) (*p->on_pi)(p, text); case EC_STARTTAG: #if 0 prepare_cdata(p); process_begin_element(p, text); #endif break; case EC_ENDTAG: #if 0 prepare_cdata(p); process_end_element(p, text); #endif break; } return TRUE; } return TRUE; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Deal with end of input. We should give a proper error message depending on the state and the start-location of the error. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int end_document_dtd_parser_(dtd_parser *p) { int rval; switch(p->state) { case S_RCDATA: case S_CDATA: case S_PCDATA: rval = TRUE; break; case S_CMT: case S_CMT1: case S_CMTE0: case S_CMTE1: case S_DECLCMT0: case S_DECLCMT: case S_DECLCMTE0: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in comment", L""); break; case S_ECDATA1: case S_ECDATA2: case S_EMSC1: case S_EMSC2: case S_DECL0: case S_DECL: case S_MDECL0: case S_STRING: case S_CMTO: case S_GROUP: case S_PENT: case S_ENT: case S_ENT0: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file", L""); break; #ifdef UTF8 case S_UTF8: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in UTF-8 sequence", L""); break; #endif case S_MSCDATA: case S_EMSCDATA1: case S_EMSCDATA2: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in CDATA marked section", L""); break; case S_PI: case S_PI2: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in processing instruction", L""); break; default: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in ???"); break; } if ( p->dmode == DM_DATA ) { sgml_environment *env; if ( p->cdata->size > 0 && fetch_ocharbuf(p->cdata, p->cdata->size-1) == CR ) del_ocharbuf(p->cdata); process_cdata(p, TRUE); if ( (env=p->environments) ) { dtd_element *e; while(env->parent) env = env->parent; pop_to(p, env, CDATA_ELEMENT); e = env->element; if ( e->structure && !e->structure->omit_close ) gripe(ERC_OMITTED_CLOSE, e->name->name); close_element(p, e, FALSE); } } return rval; } int end_document_dtd_parser(dtd_parser *p) { int rval; WITH_PARSER(p, rval = end_document_dtd_parser_(p)); return rval; } int begin_document_dtd_parser(dtd_parser *p) { init_decoding(p); return TRUE; } void reset_document_dtd_parser(dtd_parser *p) { if ( p->environments ) { sgml_environment *env, *parent; for(env = p->environments; env; env=parent) { parent = env->parent; free_environment(env); } p->environments 
= NULL; } while(p->marked) pop_marked_section(p); empty_icharbuf(p->buffer); empty_ocharbuf(p->cdata); p->mark_state = MS_INCLUDE; p->state = S_PCDATA; p->grouplevel = 0; p->blank_cdata = TRUE; p->event_class = EV_EXPLICIT; p->dmode = DM_DATA; begin_document_dtd_parser(p); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set the UTF-8 state - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #ifdef UTF8 static void process_utf8(dtd_parser *p, int chr) { int bytes; int mask; for( bytes=1, mask=0x20; chr&mask; bytes++, mask >>= 1 ) ; mask--; /* 0x20 --> 0x1f */ p->utf8_saved_state = p->state; /* state to return to */ p->state = S_UTF8; p->utf8_char = chr & mask; p->utf8_left = bytes; } #endif /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - add_cdata() adds a character to the output data. It also maps \r\n onto a single \n for Windows newline conventions. There is a problem here in shortref handling. We open the CDATA_ELEMENT as soon as we find a character as this may open other elements through omitted tags and thus install a new shortref map. If, at a later stage, all CDATA read sofar turns out to be a shortref we have incorrectly opened the CDATA_ELEMENT. As `undoing' the open_element() is not an option (it may already have caused `events' on omitted tags) we are in trouble. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* Append `chr' to the CDATA buffer; only acts in MS_INCLUDE state.  The
   first non-blank character opens CDATA_ELEMENT.  Returns early, without
   the CR LF rewrite below, when match_shortref() reports a match.
*/

static void
add_cdata(dtd_parser *p, int chr)
{ if ( p->mark_state == MS_INCLUDE )
  { ocharbuf *buf = p->cdata;

    if ( p->blank_cdata == TRUE &&
         !HasClass(p->dtd, (wint_t)chr, CH_BLANK) )
    { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE);
      p->blank_cdata = FALSE;
    }

    if ( chr == '\n' )                  /* insert missing CR */
    { int sz;

      if ( (sz=buf->size) == 0 ||
           fetch_ocharbuf(buf, sz-1) != CR )
        add_cdata(p, CR);
    }

    add_ocharbuf(buf, chr);

    /* only characters <= 0xff are looked up in map->ends[] */
    if ( p->map &&
         chr <= 0xff && p->map->ends[chr] &&
         match_shortref(p) )
      return;

    if ( chr == '\n' )                  /* dubious.  Should we do that */
    { int sz;                           /* here or in space-handling? */

      /* replace a trailing CR LF by a single LF */
      if ( (sz=buf->size) > 1 &&
           fetch_ocharbuf(buf, sz-1) == LF &&
           fetch_ocharbuf(buf, sz-2) == CR )
      { poke_ocharbuf(buf, sz-2, LF);
        buf->size--;
      }
    }
  }
}


/* Append `chr' to the CDATA buffer unless inside an MS_IGNORE section.
   Unlike add_cdata() no CR is inserted and shortrefs are not matched; a
   '\r' directly before '\n' is dropped.  The first non-blank character
   opens CDATA_ELEMENT.
*/

static void
add_verbatim_cdata(dtd_parser *p, int chr)
{ if ( p->mark_state != MS_IGNORE )
  { ocharbuf *buf = p->cdata;

    if ( p->blank_cdata == TRUE &&
         !HasClass(p->dtd, (wint_t)chr, CH_BLANK) )
    { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE);
      p->blank_cdata = FALSE;
    }

    if ( chr == '\n' && buf->size > 0 &&
         fetch_ocharbuf(buf, buf->size-1) == '\r' )
      buf->size--;                      /* drop the CR of CR LF */

    add_ocharbuf(buf, chr);
  }
}


/* We discovered illegal markup and now process it as normal CDATA:
   the saved character and the buffered input are re-added through
   add_cdata() and the parser returns to S_PCDATA. */

static void
recover_parser(dtd_parser *p)
{ const ichar *s;

  terminate_icharbuf(p->buffer);
  add_cdata(p, p->saved);
  for(s=p->buffer->data; *s; s++)
    add_cdata(p, *s);
  p->state = S_PCDATA;
}


/* Fill location `d' from `loc', but with the given line and line
   position and with the character position one less than loc's. */

static inline void
setlocation(dtd_srcloc *d, dtd_srcloc *loc, int line, int lpos)
{ d->line    = line;
  d->linepos = lpos;
  d->charpos = loc->charpos - 1;
  d->type    = loc->type;
  d->name    = loc->name;
}


void
putchar_dtd_parser(dtd_parser *p, int chr)
{ dtd *dtd = p->dtd;
  const ichar *f = dtd->charfunc->func;
  int line = p->location.line;
  int lpos = p->location.linepos;

  p->location.charpos++;                /* TBD: actually `bytepos' */

#ifdef UTF8
  if ( p->state == S_UTF8 )
  { if ( (chr & 0xc0) != 0x80 )         /* TBD:
recover */ gripe(ERC_SYNTAX_ERROR, L"Bad UTF-8 sequence", L""); p->utf8_char <<= 6; p->utf8_char |= (chr & ~0xc0); if ( --p->utf8_left == 0 ) { chr = p->utf8_char; p->state = p->utf8_saved_state; } else { return; } } else if ( ISUTF8_MB(chr) && p->utf8_decode ) { process_utf8(p, chr); return; } #endif if ( f[CF_RS] == chr ) { p->location.line++; p->location.linepos = 0; } else { if ( f[CF_RE] == chr ) p->location.linepos = 0; else p->location.linepos++; } reprocess: switch(p->state) { case S_PCDATA: { if ( f[CF_MDO1] == chr ) /* < */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_DECL0; empty_icharbuf(p->buffer); return; } if ( p->dmode == DM_DTD ) { if ( f[CF_PERO] == chr ) /* % */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_PENT; return; } } else { if ( f[CF_ERO] == chr ) /* & */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_ENT0; return; } } if ( p->marked && f[CF_DSC] == chr ) /* ] in marked section */ { empty_icharbuf(p->buffer); p->state = S_EMSC1; p->saved = chr; /* for recovery */ return; } if ( p->waiting_for_net && f[CF_ETAGO2] == chr ) /* shorttag */ { setlocation(&p->startloc, &p->location, line, lpos); WITH_PARSER(p, process_net(p)); return; } /* Real character data */ if ( p->cdata->size == 0 ) setlocation(&p->startcdata, &p->location, line, lpos); add_cdata(p, chr); return; } case S_ECDATA2: /* Seen </ in CDATA/RCDATA */ { if ( f[CF_MDC] == chr && p->etaglen == p->buffer->size && istrncaseeq(p->buffer->data, p->etag, p->etaglen) ) { p->cdata->size -= p->etaglen+2; /* 2 for </ */ terminate_ocharbuf(p->cdata); terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_cdata(p, TRUE); process_end_element(p, p->buffer->data)); empty_cdata(p); } empty_icharbuf(p->buffer); p->cdata_state = p->state = S_PCDATA; } else { add_verbatim_cdata(p, chr); if ( p->etaglen < p->buffer->size || !HasClass(dtd, (wint_t)chr, CH_NAME)) { empty_icharbuf(p->buffer); /* 
mismatch */ p->state = p->cdata_state; } else add_icharbuf(p->buffer, chr); } return; } case S_ECDATA1: /* seen < in CDATA */ { add_verbatim_cdata(p, chr); if ( f[CF_ETAGO2] == chr ) /* / */ { empty_icharbuf(p->buffer); p->state = S_ECDATA2; } else if ( f[CF_ETAGO1] != chr ) /* <: do not change state */ p->state = p->cdata_state; return; } case S_RCDATA: { if ( f[CF_ERO] == chr ) /* & */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_ENT0; return; } /*FALLTHROUGH*/ } case S_CDATA: { add_verbatim_cdata(p, chr); if ( f[CF_MDO1] == chr ) /* < */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_ECDATA1; } /* / in CDATA shorttag element */ if ( p->waiting_for_net && f[CF_ETAGO2] == chr ) { setlocation(&p->startloc, &p->location, line, lpos); p->cdata->size--; terminate_ocharbuf(p->cdata); terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_cdata(p, TRUE); process_net(p)); empty_cdata(p); } empty_icharbuf(p->buffer); p->cdata_state = p->state = S_PCDATA; } return; } case S_MSCDATA: { add_verbatim_cdata(p, chr); if ( f[CF_DSC] == chr ) /* ] */ p->state = S_EMSCDATA1; return; } case S_EMSCDATA1: { add_verbatim_cdata(p, chr); if ( f[CF_DSC] == chr ) /* ]] */ p->state = S_EMSCDATA2; else p->state = S_MSCDATA; return; } case S_EMSCDATA2: { add_verbatim_cdata(p, chr); if ( f[CF_MDC] == chr ) /* ]]> */ { p->cdata->size -= 3; /* Delete chars for ]] */ pop_marked_section(p); p->state = S_PCDATA; } else if ( f[CF_DSC] != chr ) /* if ]]], stay in this state */ p->state = S_MSCDATA; return; } case S_EMSC1: { if ( f[CF_DSC] == chr ) /* ]] in marked section */ { p->state = S_EMSC2; return; } else { add_icharbuf(p->buffer, chr); recover_parser(p); return; } } case S_EMSC2: { if ( f[CF_MDC] == chr ) /* ]]> in marked section */ { pop_marked_section(p); p->state = S_PCDATA; return; } else { add_icharbuf(p->buffer, chr); recover_parser(p); return; } } case S_PENT: /* %parameter entity; */ { if ( f[CF_ERC] == chr 
) { p->state = S_PCDATA; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_include(p, p->buffer->data)); } empty_icharbuf(p->buffer); return; } if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) { add_icharbuf(p->buffer, chr); return; } terminate_icharbuf(p->buffer); gripe(ERC_SYNTAX_ERROR, L"Illegal parameter entity", p->buffer->data); break; } case S_ENT0: /* Seen & */ { if ( chr == '#' || HasClass(dtd, (wint_t)chr, CH_NAME) ) { empty_icharbuf(p->buffer); add_icharbuf(p->buffer, chr); p->state = S_ENT; } else { if ( dtd->dialect != DL_SGML ) { wchar_t buf[3]; buf[0] = '&'; buf[1] = chr; buf[2] = '\0'; gripe(ERC_SYNTAX_ERROR, L"Illegal entity", buf); } add_cdata(p, f[CF_ERO]); p->state = p->cdata_state; goto reprocess; } return; } case S_ENT: /* &entity; */ { if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) { add_icharbuf(p->buffer, chr); return; } terminate_icharbuf(p->buffer); p->state = p->cdata_state; if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_entity(p, p->buffer->data)); } empty_icharbuf(p->buffer); if ( chr == CR ) p->state = S_ENTCR; else if ( f[CF_ERC] != chr && chr != '\n' ) goto reprocess; break; } case S_ENTCR: /* seen &entCR, eat the LF */ { p->state = p->cdata_state; if ( chr != LF ) goto reprocess; break; } case S_DECL0: /* Seen < */ { if ( f[CF_ETAGO2] == chr ) /* </ */ { add_icharbuf(p->buffer, chr); p->state = S_DECL; } else if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) /* <letter */ { add_icharbuf(p->buffer, chr); p->state = S_DECL; } else if ( f[CF_MDO2] == chr ) /* <! */ { p->state = S_MDECL0; } else if ( f[CF_PRO2] == chr ) /* <? */ { p->state = S_PI; } else /* recover */ { add_cdata(p, f[CF_MDO1]); add_cdata(p, chr); p->state = S_PCDATA; } return; } case S_MDECL0: /* Seen <! 
*/ { if ( f[CF_CMT] == chr ) /* <!- */ { p->state = S_CMTO; return; } add_icharbuf(p->buffer, f[CF_MDO2]); add_icharbuf(p->buffer, chr); p->state = S_DECL; return; } case S_DECL: /* <...> */ { if ( f[CF_MDC] == chr ) /* > */ { prepare_cdata(p); p->state = S_PCDATA; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_declaration(p, p->buffer->data)); } empty_icharbuf(p->buffer); return; } if ( dtd->shorttag && f[CF_ETAGO2] == chr && p->buffer->size > 0 ) { prepare_cdata(p); p->state = S_PCDATA; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_CLASS(p, EV_SHORTTAG, WITH_PARSER(p, process_declaration(p, p->buffer->data))); } empty_icharbuf(p->buffer); p->waiting_for_net = TRUE; return; } add_icharbuf(p->buffer, chr); if ( f[CF_LIT] == chr ) /* " */ { p->state = S_STRING; p->saved = chr; p->lit_saved_state = S_DECL; } else if ( f[CF_LITA] == chr ) /* ' */ { p->state = S_STRING; p->saved = chr; p->lit_saved_state = S_DECL; return; } else if ( f[CF_CMT] == chr && /* - */ p->buffer->data[0] == f[CF_MDO2] ) /* Started <! */ { p->state = S_DECLCMT0; } else if ( f[CF_DSO] == chr ) /* [: marked section */ { terminate_icharbuf(p->buffer); process_marked_section(p); } break; } case S_DECLCMT0: /* <...- */ { if ( f[CF_CMT] == chr ) { p->buffer->size--; p->state = S_DECLCMT; } else { add_icharbuf(p->buffer, chr); p->state = S_DECL; } break; } case S_DECLCMT: /* <...--.. */ { if ( f[CF_CMT] == chr ) p->state = S_DECLCMTE0; break; } case S_DECLCMTE0: /* <...--..- */ { if ( f[CF_CMT] == chr ) p->state = S_DECL; else p->state = S_DECLCMT; break; } case S_PI: { add_icharbuf(p->buffer, chr); if ( f[CF_PRO2] == chr ) /* <? ... ? */ p->state = S_PI2; if ( f[CF_PRC] == chr ) /* no ? 
is ok too (XML/SGML) */ goto pi; return; } case S_PI2: { if ( f[CF_PRC] == chr ) { pi: process_cdata(p, FALSE); p->state = S_PCDATA; p->buffer->size--; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_pi(p, p->buffer->data)); } empty_icharbuf(p->buffer); return; } add_icharbuf(p->buffer, chr); p->state = S_PI; return; } case S_STRING: { add_icharbuf(p->buffer, chr); if ( chr == p->saved ) p->state = p->lit_saved_state; break; } case S_CMTO: /* Seen <!- */ { if ( f[CF_CMT] == chr ) /* - */ { p->state = S_CMT1; return; } else { add_cdata(p, f[CF_MDO1]); add_cdata(p, f[CF_MDO2]); add_cdata(p, f[CF_CMT]); add_cdata(p, chr); p->state = S_PCDATA; return; } } case S_CMT1: /* <!-- */ { if ( f[CF_CMT] == chr ) /* <!--- */ { if ( dtd->dialect != DL_SGML ) gripe(ERC_SYNTAX_ERROR, L"Illegal comment", L"<!---"); } p->state = S_CMT; break; } case S_CMT: { if ( f[CF_CMT] == chr ) p->state = S_CMTE0; /* <!--...- */ break; } case S_CMTE0: /* <!--... -- */ { if ( f[CF_CMT] == chr ) p->state = S_CMTE1; else p->state = S_CMT; break; } case S_CMTE1: /* <!--...-- seen */ { if ( f[CF_MDC] == chr ) /* > */ { if ( p->on_decl ) (*p->on_decl)(p, (ichar*)""); p->state = S_PCDATA; } else { if ( dtd->dialect != DL_SGML ) gripe(ERC_SYNTAX_ERROR, L"Illegal comment", L""); if ( f[CF_CMT] != chr ) p->state = S_CMT; } break; } case S_GROUP: /* [...] 
in declaration */ { add_icharbuf(p->buffer, chr); if ( f[CF_DSO] == chr ) { p->grouplevel++; } else if ( f[CF_DSC] == chr ) { if ( --p->grouplevel == 0 ) p->state = S_DECL; } else if ( f[CF_LIT] == chr ) /* " */ { p->state = S_STRING; p->saved = chr; p->lit_saved_state = S_GROUP; } else if ( f[CF_LITA] == chr ) /* ' */ { p->state = S_STRING; p->saved = chr; p->lit_saved_state = S_GROUP; return; } break; } #ifdef UTF8 case S_UTF8: assert(0); break; #endif } } /******************************* * TOPLEVEL * *******************************/ int load_dtd_from_file(dtd_parser *p, const ichar *file) { FILE *fd; int rval; data_mode oldmode = p->dmode; dtdstate oldstate = p->state; locbuf oldloc; push_location(p, &oldloc); p->dmode = DM_DTD; p->state = S_PCDATA; empty_icharbuf(p->buffer); /* dubious */ set_file_dtd_parser(p, IN_FILE, file); if ( (fd = wfopen(file, "rb")) ) { int chr; while( (chr = getc(fd)) != EOF ) putchar_dtd_parser(p, chr); fclose(fd); p->dtd->implicit = FALSE; rval = TRUE; } else rval = FALSE; pop_location(p, &oldloc); p->dmode = oldmode; p->state = oldstate; return rval; } dtd * file_to_dtd(const ichar *file, const ichar *doctype, dtd_dialect dialect) { dtd_parser *p = new_dtd_parser(new_dtd(doctype)); set_dialect_dtd(p->dtd, dialect); if ( load_dtd_from_file(p, file) ) { dtd *dtd = p->dtd; dtd->references++; /* avoid deletion */ free_dtd_parser(p); return dtd; } else { free_dtd_parser(p); return NULL; } } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - SGML sees a file as [<LF>]Line 1<CR> <LF> Line 2<CR> I.e. the newline appearing just before the end-of-file should be ignored. In addition, Unix-style files are mapped to CR-LF. Thanks to Richard O'Keefe. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ int sgml_process_stream(dtd_parser *p, FILE *fd, unsigned flags) { int p0, p1; if ( (p0 = getc(fd)) == EOF ) return TRUE; if ( (p1 = getc(fd)) == EOF ) { putchar_dtd_parser(p, p0); return end_document_dtd_parser(p); } for(;;) { int p2 = getc(fd); if ( p2 == EOF ) { putchar_dtd_parser(p, p0); if ( p1 != LF ) putchar_dtd_parser(p, p1); else if ( p0 != CR ) putchar_dtd_parser(p, CR); if ( flags & SGML_SUB_DOCUMENT ) return TRUE; else return end_document_dtd_parser(p); } putchar_dtd_parser(p, p0); p0 = p1; p1 = p2; } } int sgml_process_file(dtd_parser *p, const ichar *file, unsigned flags) { FILE *fd; int rval; locbuf oldloc; push_location(p, &oldloc); set_file_dtd_parser(p, IN_FILE, file); if ( !(flags & SGML_SUB_DOCUMENT) ) set_mode_dtd_parser(p, DM_DATA); if ( (fd = wfopen(file, "rb")) ) { rval = sgml_process_stream(p, fd, flags); fclose(fd); } else rval = FALSE; pop_location(p, &oldloc); return rval; } /******************************* * ERRORS * *******************************/ static wchar_t * format_location(wchar_t *s, size_t len, dtd_srcloc *l) { int first = TRUE; if ( !l || l->type == IN_NONE ) return s; for( ; l && l->type != IN_NONE; l = l->parent, first = FALSE ) { if ( !first ) { swprintf(s, len, L" (from "); s += wcslen(s); } switch(l->type) { case IN_NONE: assert(0); case IN_FILE: swprintf(s, len, L"%ls:%d:%d", l->name.file, l->line, l->linepos); break; case IN_ENTITY: swprintf(s, len, L"&%ls;%d:%d", l->name.entity, l->line, l->linepos); break; } s += wcslen(s); if ( !first ) { *s++ = L')'; } } *s++ = L':'; *s++ = L' '; return s; } static void format_message(dtd_error *e) { wchar_t buf[1024]; wchar_t *s; int prefix_len; int left; switch(e->severity) { case ERS_ERROR: wcscpy(buf, L"Error: "); break; case ERS_WARNING: wcscpy(buf, L"Warning: "); break; default: buf[0] = '\0'; } s = buf+wcslen(buf); s = format_location(s, 1024-(s-buf), e->location); prefix_len = (int)(s-buf); left = 
1024-prefix_len; switch(e->id) { case ERC_REPRESENTATION: swprintf(s, left, L"Cannot represent due to %ls", e->argv[0]); break; case ERC_RESOURCE: swprintf(s, left, L"Insufficient %ls resources", e->argv[0]); break; case ERC_LIMIT: swprintf(s, left, L"%ls limit exceeded", e->argv[0]); break; case ERC_VALIDATE: swprintf(s, left, L"%ls", e->argv[0]); break; case ERC_SYNTAX_ERROR: swprintf(s, left, L"Syntax error: %ls", e->argv[0]); break; case ERC_EXISTENCE: swprintf(s, left, L"%ls \"%ls\" does not exist", e->argv[0], e->argv[1]); break; case ERC_REDEFINED: swprintf(s, left, L"Redefined %ls \"%ls\"", e->argv[0], e->argv[1]); break; default: ; } e->message = str2ring(buf); e->plain_message = e->message + prefix_len; } int gripe(dtd_error_id e, ...) { va_list args; wchar_t buf[1024]; dtd_error error; int dtdmode = FALSE; void *freeme = NULL; va_start(args, e); memset(&error, 0, sizeof(error)); error.minor = e; /* detailed error code */ if ( current_parser ) { error.location = ¤t_parser->location; if ( current_parser->dmode == DM_DTD ) dtdmode = TRUE; } else { error.location = NULL; } switch(e) { case ERC_REPRESENTATION: case ERC_RESOURCE: error.severity = ERS_ERROR; error.argv[0] = va_arg(args, wchar_t *); break; case ERC_LIMIT: error.severity = ERS_WARNING; error.argv[0] = va_arg(args, wchar_t *); break; case ERC_SYNTAX_ERROR: case ERC_SYNTAX_WARNING: { wchar_t *m = va_arg(args, wchar_t *); const wchar_t *s = va_arg(args, const wchar_t *); if ( s && *s ) { swprintf(buf, 1024, L"%ls, found \"%ls\"", m, str_summary(s, 25)); error.argv[0] = buf; } else error.argv[0] = m; error.severity = (e == ERC_SYNTAX_WARNING ? ERS_WARNING : ERS_ERROR); e = ERC_SYNTAX_ERROR; break; } case ERC_DOMAIN: { const wchar_t *expected = va_arg(args, const wchar_t *); const wchar_t *found = str_summary(va_arg(args, const wchar_t *), 25); swprintf(buf, 1024, L"Expected type %ls, found \"%ls\"", expected, found); error.argv[0] = buf; error.severity = ERS_ERROR; e = (dtdmode ? 
ERC_SYNTAX_ERROR : ERC_VALIDATE); break; } case ERC_REDEFINED: { dtd_symbol *name; error.argv[0] = va_arg(args, wchar_t *); /* type */ name = va_arg(args, dtd_symbol *); /* name */ error.argv[1] = (ichar*)name->name; error.severity = ERS_STYLE; break; } case ERC_EXISTENCE: { error.argv[0] = va_arg(args, wchar_t *); /* type */ error.argv[1] = va_arg(args, wchar_t *); /* name */ error.severity = ERS_ERROR; break; } case ERC_VALIDATE: { error.argv[0] = va_arg(args, wchar_t *); /* message */ error.severity = ERS_WARNING; break; } case ERC_OMITTED_CLOSE: { const wchar_t *element = va_arg(args, const wchar_t *); swprintf(buf, 1024, L"Inserted omitted end-tag for \"%ls\"", element); error.argv[0] = buf; error.severity = ERS_WARNING; e = ERC_VALIDATE; break; } case ERC_OMITTED_OPEN: { const wchar_t *element = va_arg(args, const wchar_t *); swprintf(buf, 1024, L"Inserted omitted start-tag for \"%ls\"", element); error.argv[0] = buf; error.severity = ERS_WARNING; e = ERC_VALIDATE; break; } case ERC_NOT_OPEN: { const wchar_t *element = va_arg(args, const wchar_t *); swprintf(buf, 1024, L"Ignored end-tag for \"%ls\" which is not open", element); error.argv[0] = buf; error.severity = ERS_WARNING; e = ERC_VALIDATE; break; } case ERC_NOT_ALLOWED: { const wchar_t *element = va_arg(args, const wchar_t *); swprintf(buf, 1024, L"Element \"%ls\" not allowed here", element); error.argv[0] = buf; error.severity = ERS_WARNING; e = ERC_VALIDATE; break; } case ERC_NOT_ALLOWED_PCDATA: { const ocharbuf *cdata = va_arg(args, const ocharbuf *); swprintf(buf, 1024, L"#PCDATA (\"%ls\") not allowed here", str_summary(cdata->data.w, 25)); error.argv[0] = buf; error.severity = ERS_WARNING; e = ERC_VALIDATE; break; } case ERC_NO_ATTRIBUTE: { const wchar_t *elem = va_arg(args, wchar_t *); /* element */ const wchar_t *attr = va_arg(args, wchar_t *); /* attribute */ swprintf(buf, 1024, L"Element \"%ls\" has no attribute \"%ls\"", elem, attr); error.argv[0] = buf; error.severity = ERS_WARNING; e = 
ERC_VALIDATE; break; } case ERC_NO_ATTRIBUTE_VALUE: { const wchar_t *elem = va_arg(args, wchar_t *); /* element */ const wchar_t *value = va_arg(args, wchar_t *); /* attribute value */ swprintf(buf, 1024, L"Element \"%ls\" has no attribute with value \"%ls\"", elem, value); error.argv[0] = buf; error.severity = ERS_WARNING; e = ERC_VALIDATE; break; } case ERC_NO_VALUE: { error.argv[0] = L"entity value"; error.argv[1] = va_arg(args, wchar_t *); /* entity */ error.severity = ERS_ERROR; e = ERC_EXISTENCE; break; } case ERC_NO_DOCTYPE: { const wchar_t *doctype = va_arg(args, wchar_t *); /* element */ const wchar_t *file = va_arg(args, wchar_t *); /* DTD file */ swprintf(buf, 1024, L"No <!DOCTYPE ...>, assuming \"%ls\" from DTD file \"%s\"", doctype, file); error.argv[0] = buf; error.severity = ERS_WARNING; e = ERC_VALIDATE; break; } case ERC_NO_CATALOGUE: { char *file = va_arg(args, char *); /* catalogue file */ error.argv[0] = L"catalogue file"; freeme = error.argv[1] = utf8towcs(file); error.severity = ERS_WARNING; e = ERC_EXISTENCE; break; } } error.id = e; format_message(&error); if ( current_parser && current_parser->on_error ) (*current_parser->on_error)(current_parser, &error); else fwprintf(stderr, L"SGML: %ls\n", error.message); if ( freeme ) sgml_free(freeme); va_end(args); return FALSE; }