/* $Id$ Part of SWI-Prolog Author: Jan Wielemaker E-mail: wielemak@science.uva.nl WWW: http://www.swi-prolog.org Copyright (C): 1985-2006, University of Amsterdam This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */ #define DTD_IMPLEMENTATION 1 #include #include #include "dtd.h" #include "model.h" #include "util.h" #include "catalog.h" #include "parser.h" #include #include #include #include #include #include "utf8.h" #include #include #include "xml_unicode.h" #define DEBUG(g) ((void)0) #define ZERO_TERM_LEN (-1) /* terminated by nul */ #ifdef __WINDOWS__ #define inline __inline #define swprintf _snwprintf #endif /******************************* * LOCAL TYPES * *******************************/ typedef struct locbuf { dtd_srcloc start; /* p->startloc */ dtd_srcloc here; /* p->location */ } locbuf; /******************************* * PROTOYPES * *******************************/ static const ichar * itake_name(dtd *dtd, const ichar *in, dtd_symbol **id); static const ichar * itake_entity_name(dtd *dtd, const ichar *in, dtd_symbol **id); static const ichar * itake_namegroup(dtd *dtd, const ichar *decl, dtd_symbol **names, int *n); static const ichar * iskip_layout(dtd *dtd, const ichar *in); static dtd_parser * clone_dtd_parser(dtd_parser *p); static void free_model(dtd_model *m); static int process_entity_declaration(dtd_parser *p, const ichar *decl); static void free_notations(dtd_notation *n); static void free_shortrefs(dtd_shortref *sr); static int process_cdata(dtd_parser *p, int last); static int process_entity(dtd_parser *p, const ichar *name); static int emit_cdata(dtd_parser *p, int last); static dtd_space_mode istr_to_space_mode(const ichar *val); static void update_space_mode(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts); static dtd_model * make_model(dtd *dtd, const ichar *decl, const ichar **end); static void for_elements_in_model(dtd_model *m, void (*f)(dtd_element *e, void *closure), void *closure); void putchar_dtd_parser(dtd_parser *p, int chr); void free_dtd_parser(dtd_parser *p); static const ichar * isee_character_entity(dtd *dtd, const ichar *in, int *chr); static int add_default_attributes(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts); static int prepare_cdata(dtd_parser *p); /******************************* * MACROS * *******************************/ #define WITH_CLASS(p, c, g) \ { sgml_event_class _oc = p->event_class; \ p->event_class = c; \ g; \ p->event_class = _oc; \ } #define WITH_PARSER(p, g) \ { dtd_parser *_old = p; \ current_parser = p; \ g; \ current_parser = _old; \ } /******************************* * STATISTICS * *******************************/ #ifdef O_STATISTICS int edefs_created = 0; int edefs_freed = 0; int edefs_implicit = 0; int edefs_atts = 0; int edefs_decl = 0; int dtd_created = 0; int dtd_freed = 0; void sgml_statistics(void) { fprintf(stderr, "EDEFS: created %d; freed %d\n", edefs_created, edefs_freed); fprintf(stderr, "EDEFS: implicit %d; atts %d; decl %d\n", edefs_implicit, edefs_atts, edefs_decl); fprintf(stderr, "DTDs: created: %d; freed: %d\n", dtd_created, dtd_freed); } #define STAT(g) g #else #define STAT(g) ((void)0) #endif /******************************* * SRC LOCATION * *******************************/ static void /* TBD: also handle startloc */ push_location(dtd_parser *p, locbuf *save) { save->here = p->location; save->start = p->startloc; p->location.parent = &save->here; p->startloc.parent = &save->start; } static void pop_location(dtd_parser *p, locbuf *saved) { p->location = saved->here; p->startloc = saved->start; } static inline void _sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc) { d->type = loc->type; d->name.file = loc->name.file; d->line = loc->line; d->linepos = loc->linepos; d->charpos = loc->charpos; /* but not the parent! */ } void sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc) { _sgml_cplocation(d, loc); } #define sgml_cplocation(d,s) _sgml_cplocation(d, s) static void inc_location(dtd_srcloc *l, int chr) { if ( chr == '\n' ) { l->linepos = 0; l->line++; } l->linepos++; l->charpos++; } static void dec_location(dtd_srcloc *l, int chr) { if ( chr == '\n' ) { l->linepos = 2; /* not good! */ l->line--; } l->linepos--; l->charpos--; } /******************************* * CLASSIFICATION PRIMITIVES * *******************************/ static inline int HasClass(dtd *dtd, wint_t chr, int mask) { if ( chr <= 0xff ) return (dtd->charclass->class[(chr)] & (mask)); else { switch(mask) { case CH_NAME: return ( xml_basechar(chr) || xml_digit(chr) || xml_ideographic(chr) || xml_combining_char(chr) || xml_extender(chr) ); case CH_NMSTART: return ( xml_basechar(chr) || xml_ideographic(chr) ); case CH_WHITE: return FALSE; /* only ' ' and '\t' */ case CH_BLANK: return iswspace(chr); case CH_DIGIT: return xml_digit(chr); case CH_RS: case CH_RE: return FALSE; default: assert(0); return FALSE; } } } static const ichar * isee_func(dtd *dtd, const ichar *in, charfunc func) { if ( dtd->charfunc->func[func] == *in ) return ++in; return NULL; } /******************************* * SYMBOLS * *******************************/ static dtd_symbol_table * new_symbol_table(void) { dtd_symbol_table *t = sgml_calloc(1, sizeof(*t)); t->size = SYMBOLHASHSIZE; t->entries = sgml_calloc(t->size, sizeof(dtd_symbol*)); return t; } static void free_symbol_table(dtd_symbol_table *t) { int i; for(i=0; isize; i++) { dtd_symbol *s, *next; for(s=t->entries[i]; s; s=next) { next = s->next; sgml_free((ichar*)s->name); sgml_free(s); } } sgml_free(t->entries); sgml_free(t); } dtd_symbol * dtd_find_symbol(dtd *dtd, const ichar *name) { dtd_symbol_table *t = dtd->symbols; if ( dtd->case_sensitive ) { int k = istrhash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istreq(s->name, name) ) return s; } } else { int k = istrcasehash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istrcaseeq(s->name, name) ) return s; } } return NULL; } static dtd_symbol * dtd_find_entity_symbol(dtd *dtd, const ichar *name) { dtd_symbol_table *t = dtd->symbols; if ( dtd->ent_case_sensitive ) { int k = istrhash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istreq(s->name, name) ) return s; } } else { int k = istrcasehash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istrcaseeq(s->name, name) ) return s; } } return NULL; } dtd_symbol * dtd_add_symbol(dtd *dtd, const ichar *name) { dtd_symbol_table *t = dtd->symbols; int k = istrhash(name, t->size); dtd_symbol *s; for(s=t->entries[k]; s; s = s->next) { if ( istreq(s->name, name) ) return s; } s = sgml_calloc(1, sizeof(*s)); s->name = istrdup(name); s->next = t->entries[k]; t->entries[k] = s; return s; } /******************************* * ENTITIES * *******************************/ static void free_entity_list(dtd_entity *e) { dtd_entity *next; for( ; e; e=next) { next = e->next; if ( e->value ) sgml_free(e->value); if ( e->extid ) sgml_free(e->extid); if ( e->exturl ) sgml_free(e->exturl); if ( e->baseurl ) sgml_free(e->baseurl); sgml_free(e); } } static dtd_entity * find_pentity(dtd *dtd, dtd_symbol *id) { dtd_entity *e; for(e = dtd->pentities; e; e=e->next) { if ( e->name == id ) return e; } return NULL; } /* returned path must be freed when done */ static ichar * entity_file(dtd *dtd, dtd_entity *e) { switch(e->type) { case ET_SYSTEM: case ET_PUBLIC: { const ichar *f; f = find_in_catalogue(e->catalog_location, e->name->name, e->extid, e->exturl, dtd->dialect != DL_SGML); if ( f ) /* owned by catalog */ { ichar *file; if ( is_absolute_path(f) || !e->baseurl ) file = istrdup(f); else file = localpath(e->baseurl, f); return file; } } default: return NULL; } } static const ichar * entity_value(dtd_parser *p, dtd_entity *e, int *len) { ichar *file; if ( !e->value && (file=entity_file(p->dtd, e)) ) { int normalise = (e->content == EC_SGML || e->content == EC_CDATA); size_t l; e->value = load_sgml_file_to_charp(file, normalise, &l); e->length = (long)l; sgml_free(file); } if ( len ) *len = e->length; return e->value; } static int expand_pentities(dtd_parser *p, const ichar *in, int ilen, ichar *out, int len) { dtd *dtd = p->dtd; int pero = dtd->charfunc->func[CF_PERO]; /* % */ int ero = dtd->charfunc->func[CF_ERO]; /* & */ const ichar *s; const ichar *end; if ( ilen == ZERO_TERM_LEN ) { end = in + wcslen(in); } else { end = &in[ilen]; } while(in < end) { if ( *in == pero ) { dtd_symbol *id; if ( (s = itake_entity_name(dtd, in+1, &id)) ) { dtd_entity *e = find_pentity(dtd, id); const ichar *eval; int l; in = s; if ( (s=isee_func(dtd, s, CF_ERC)) ) /* ; is not obligatory? */ in = s; if ( !e ) return gripe(ERC_EXISTENCE, L"parameter entity", id->name); if ( !(eval = entity_value(p, e, NULL)) ) return FALSE; if ( !expand_pentities(p, eval, ZERO_TERM_LEN, out, len) ) return FALSE; l = (int)istrlen(out); /* could be better */ out += l; len -= l; continue; } } if ( --len <= 0 ) { gripe(ERC_REPRESENTATION, L"Declaration too long"); return FALSE; } if ( *in == ero && in[1] == '#' ) /* &# */ { int chr; if ( (s=isee_character_entity(dtd, in, &chr)) ) { if ( chr == 0 ) { gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", in); } else { *out++ = chr; in = s; continue; } } } *out++ = *in++; } *out = '\0'; return TRUE; } static int char_entity_value(const ichar *decl) { if ( *decl == '#' ) { const ichar *s = decl+1; ichar *end; long v; /* do octal too? */ if ( s[0] == 'x' || s[0] == 'X' ) v = wcstoul(s+1, &end, 16); else v = wcstoul(s, &end, 10); if ( *end == '\0' ) { return (int)v; } else if ( istreq(s, L"RS") ) { return '\n'; } else if ( istreq(s, L"RE") ) { return '\r'; } else if ( istreq(s, L"TAB") ) { return '\t'; } else if ( istreq(s, L"SPACE") ) { return ' '; } } return -1; } static const ichar * isee_character_entity(dtd *dtd, const ichar *in, int *chr) { const ichar *s; if ( (s=isee_func(dtd, in, CF_ERO)) && *s == '#' ) { ichar e[32]; ichar *o = e; int v; *o++ = *s++; while(o < e+sizeof(e)/sizeof(ichar)-1 && HasClass(dtd, *s, CH_NAME)) *o++ = *s++; if ( isee_func(dtd, s, CF_ERC)) /* skip ; */ s++; *o = '\0'; if ( (v=char_entity_value(e)) >= 0 ) { *chr = v; return s; } } return NULL; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Expand entities in a string. Used to expand CDATA attribute values. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int expand_entities(dtd_parser *p, const ichar *in, int len, ocharbuf *out) { const ichar *s; const ichar *end = &in[len]; dtd *dtd = p->dtd; int ero = dtd->charfunc->func[CF_ERO]; /* & */ while(in < end) { if ( *in == ero ) { const ichar *estart = in; /* for recovery */ int chr; if ( (s=isee_character_entity(dtd, in, &chr)) ) { if ( chr == 0 ) gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", in); add_ocharbuf(out, chr); in = s; continue; } if ( HasClass(dtd, in[1], CH_NMSTART) ) { dtd_symbol *id; dtd_entity *e; const ichar *eval; if ( !(in = itake_name(dtd, in+1, &id)) ) { in = estart; goto recover; } if ( isee_func(dtd, in, CF_ERC) || *in == '\n' ) in++; if ( !(e = id->entity) && !(e=dtd->default_entity) ) { gripe(ERC_EXISTENCE, L"entity", id->name); in = estart; goto recover; } if ( !(eval = entity_value(p, e, NULL)) ) { gripe(ERC_NO_VALUE, e->name->name); in = estart; goto recover; } if ( e->content == EC_SGML ) { if ( !expand_entities(p, eval, (int)istrlen(eval), out) ) return FALSE; } else { const ichar *s; for(s=eval; *s; s++) add_ocharbuf(out, *s); } continue; } if ( dtd->dialect != DL_SGML ) gripe(ERC_SYNTAX_ERROR, L"Illegal entity", estart); } recover: if ( *in == CR && in[1] == LF ) in++; if ( HasClass(dtd, *in, CH_BLANK) ) { add_ocharbuf(out, ' '); in++; } else { add_ocharbuf(out, *in++); } } terminate_ocharbuf(out); return TRUE; } /******************************* * ELEMENTS * *******************************/ static dtd_element * find_element(dtd *dtd, dtd_symbol *id) { dtd_element *e; if ( id->element ) return id->element; /* must check */ e = sgml_calloc(1, sizeof(*e)); e->space_mode = SP_INHERIT; e->undefined = TRUE; e->name = id; id->element = e; e->next = dtd->elements; dtd->elements = e; return e; } static dtd_edef * new_element_definition(dtd *dtd) { dtd_edef *def = sgml_calloc(1, sizeof(*def)); STAT(edefs_created++); return def; } static dtd_element * def_element(dtd *dtd, dtd_symbol *id) { dtd_element *e = find_element(dtd, id); if ( !e->structure ) { e->structure = new_element_definition(dtd); e->structure->references = 1; e->structure->type = C_EMPTY; } return e; } static void free_name_list(dtd_name_list *nl) { dtd_name_list *next; for( ; nl; nl=next) { next = nl->next; sgml_free(nl); } } #define REFS_VIRGIN (-42) static void free_attribute(dtd_attr *a) { if ( a->references == REFS_VIRGIN || --a->references == 0 ) { switch(a->type) { case AT_NAMEOF: case AT_NOTATION: free_name_list(a->typeex.nameof); default: ; } switch(a->def) { case AT_DEFAULT: case AT_FIXED: { if ( a->islist ) sgml_free(a->att_def.list); else if ( a->type == AT_CDATA && a->att_def.cdata ) sgml_free(a->att_def.cdata); } default: ; } sgml_free(a); } } static void free_attribute_list(dtd_attr_list *l) { dtd_attr_list *next; for(; l; l=next) { next = l->next; free_attribute(l->attribute); sgml_free(l); } } static void free_element_list(dtd_element_list *l) { dtd_element_list *next; for( ; l; l=next) { next = l->next; sgml_free(l); } } static void free_element_definition(dtd_edef *def) { if ( --def->references == 0 ) { STAT(edefs_freed++); if ( def->content ) free_model(def->content); free_element_list(def->included); free_element_list(def->excluded); free_state_engine(def->initial_state); sgml_free(def); } } static void free_elements(dtd_element *e) { dtd_element *next; for( ; e; e=next) { next = e->next; if ( e->structure ) free_element_definition(e->structure); free_attribute_list(e->attributes); sgml_free(e); } } /******************************* * ATTRIBUTES * *******************************/ static dtd_attr * find_attribute(dtd_element *e, dtd_symbol *name) { dtd_attr_list *a; for(a=e->attributes; a; a=a->next) { if ( a->attribute->name == name ) return a->attribute; } return NULL; } /******************************* * PARSE PRIMITIVES * *******************************/ static const ichar * iskip_layout(dtd *dtd, const ichar *in) { ichar cmt = dtd->charfunc->func[CF_CMT]; /* also skips comment */ for( ; *in; in++ ) { if ( HasClass(dtd, *in, CH_BLANK) ) continue; if ( in[0] == cmt && in[1] == cmt ) { in += 2; for( ; *in; in++ ) { if ( in[0] == cmt && in[1] == cmt ) break; } in++; continue; } return in; } return in; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - See whether we are looking at identifier "id". "id" must be lowercase! This is only used for reserved words, and parsed case-insentive in both XML and SGML modes. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * isee_identifier(dtd *dtd, const ichar *in, char *id) { in = iskip_layout(dtd, in); /* match */ while (*id && (wint_t)*id == towlower(*in) ) id++, in++; if ( *id == 0 && !HasClass(dtd, *in, CH_NAME) ) return iskip_layout(dtd, in); return NULL; } static const ichar * itake_name(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NMSTART) ) return NULL; if ( dtd->case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"NAME too long"); return NULL; } *o++ = '\0'; *id = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, in); } static const ichar * itake_entity_name(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NMSTART) ) return NULL; if ( dtd->ent_case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"Entity NAME too long"); return NULL; } *o++ = '\0'; *id = dtd_add_symbol(dtd, buf); return in; } static const ichar * itake_nmtoken(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NAME) ) return NULL; if ( dtd->case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"NMTOKEN too long"); return NULL; } *o = '\0'; *id = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, in); } static const ichar * itake_nutoken(dtd *dtd, const ichar *in, dtd_symbol **id) { ichar buf[MAXNMLEN]; ichar *o = buf; ichar *e = &buf[MAXNMLEN]-1; in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_DIGIT) ) return NULL; if ( dtd->case_sensitive ) { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = *in++; } else { while( HasClass(dtd, *in, CH_NAME) && o < e ) *o++ = towlower(*in++); } if ( o == e ) { gripe(ERC_REPRESENTATION, L"NUTOKEN too long"); return NULL; } *o = '\0'; if ( o - buf > 8 ) gripe(ERC_LIMIT, L"nutoken length"); *id = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, in); } static const ichar * itake_number(dtd *dtd, const ichar *in, dtd_attr *at) { in = iskip_layout(dtd, in); switch(dtd->number_mode) { case NU_TOKEN: { ichar buf[MAXNMLEN]; ichar *o = buf; while( HasClass(dtd, *in, CH_DIGIT) ) *o++ = *in++; if ( o == buf ) return NULL; /* empty */ *o = '\0'; at->att_def.name = dtd_add_symbol(dtd, buf); return iskip_layout(dtd, (const ichar *)in); } case NU_INTEGER: { ichar *end; at->att_def.number = wcstol(in, &end, 10); if ( end > in && errno != ERANGE ) return iskip_layout(dtd, end); } } return NULL; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Get a quoted value. After successful return, *start points to the start of the string in the input and *len to the length. The data is *not* nul terminated. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * itake_string(dtd *dtd, const ichar *in, ichar **start, int *len) { in = iskip_layout(dtd, in); if ( isee_func(dtd, in, CF_LIT) || isee_func(dtd, in, CF_LITA) ) { ichar q = *in++; *start = (ichar *)in; while( *in && *in != q ) in++; if ( *in ) { *len = (int)(in - (*start)); return iskip_layout(dtd, ++in); } } return NULL; } static const ichar * itake_dubbed_string(dtd *dtd, const ichar *in, ichar **out) { ichar *start; int len; const ichar *end; if ( (end=itake_string(dtd, in, &start, &len)) ) *out = istrndup(start, len); return end; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - itake_url() is used to get the argument of a SYSTEM or 2nd argument of a PUBLIC reference. Once upon a time it tried to tag the argument as file:, but this job cannot be before lookup in the catalogue. It is now the same as itake_dubbed_string(), so we simply call this one. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * itake_url(dtd *dtd, const ichar *in, ichar **out) { return itake_dubbed_string(dtd, in, out); } static const ichar * itake_nmtoken_chars(dtd *dtd, const ichar *in, ichar *out, int len) { in = iskip_layout(dtd, in); if ( !HasClass(dtd, *in, CH_NAME) ) return NULL; while( HasClass(dtd, *in, CH_NAME) ) { if ( --len <= 0 ) gripe(ERC_REPRESENTATION, L"Name token too long"); *out++ = (dtd->case_sensitive ? *in++ : (ichar)towlower(*in++)); } *out++ = '\0'; return iskip_layout(dtd, in); } /* There used to be a function itake_nonblank_chars(dtd, in, out, len) -> new end which - skipped layout, - copied characters from in[] to out[] until layout or \0 was found, - added a terminating \0 to out[], - skipped any following layout, and - returned the new position. That function was only called by get_attribute_value(), which used it to parse an unquoted attribute value. According to SGML, that's not right: unquoted attribute values must look like NMTOKENs (but have a different length bound). In particular, elements like zoo '. According to HTML practice, pretty much any old junk will be accepted, and some HTML parsers will allow bare slashes in such an attribute. Typical HTML is *so* bad that it doesn't agree with *any* part of the HTML specifications (e.g., is commonly wrapped around block-level elements, which has never been legal). It's not clear that there is much point in trying to accomodate bad HTML; if you really need to do that, use the free program HTML Tidy (from the http://www.w3c.org/ site) to clean up, and parse its output instead. However, in order to break as little as possible, the new (sgml-1.0.14) function accepts anything except > / \0 and blanks. JW: I decided to accept / as part of an unquoted in SGML-mode if shorttag is disabled as well as in XML mode if it is not the end of the begin-element */ static ichar const * itake_unquoted(dtd *dtd, ichar const *in, ichar *out, int len) { ichar const end2 = dtd->charfunc->func[CF_ETAGO2]; /* / */ ichar c; /* skip leading layout. Do NOT skip comments! --x-- is a value! */ while (c = *in, HasClass(dtd, c, CH_BLANK)) in++; /* copy the attribute to out[] */ while ( !HasClass(dtd, c, CH_BLANK) && c != '\0' ) { if ( c == end2 && (dtd->shorttag || (in[1] == '\0' && dtd->dialect != DL_SGML)) ) break; if ( --len > 0 ) *out++ = c; else if ( len == 0 ) gripe(ERC_REPRESENTATION, L"Attribute too long"); c = *++in; } *out = '\0'; /* skip trailing layout. While it is kind to skip comments here, it is technically wrong to do so. Tags may not contain comments. */ return iskip_layout(dtd, in); } /******************************* * DTD * *******************************/ dtd * new_dtd(const ichar *doctype) { dtd *dtd = sgml_calloc(1, sizeof(*dtd)); STAT(dtd_created++); dtd->magic = SGML_DTD_MAGIC; dtd->implicit = TRUE; dtd->dialect = DL_SGML; if ( doctype ) dtd->doctype = istrdup(doctype); dtd->symbols = new_symbol_table(); dtd->charclass = new_charclass(); dtd->charfunc = new_charfunc(); dtd->space_mode = SP_SGML; dtd->ent_case_sensitive = TRUE; /* case-sensitive entities */ dtd->shorttag = TRUE; /* allow for number_mode = NU_TOKEN; return dtd; } void free_dtd(dtd *dtd) { if ( --dtd->references == 0 ) { STAT(dtd_freed++); if ( dtd->doctype ) sgml_free(dtd->doctype); free_entity_list(dtd->entities); free_entity_list(dtd->pentities); free_notations(dtd->notations); free_shortrefs(dtd->shortrefs); free_elements(dtd->elements); free_symbol_table(dtd->symbols); sgml_free(dtd->charfunc); sgml_free(dtd->charclass); dtd->magic = 0; sgml_free(dtd); } } static const wchar_t *xml_entities[] = { L"lt CDATA \"<\"", /* < */ L"gt CDATA \">\"", /* > */ L"amp CDATA \"&\"", /* & */ L"apos CDATA \"'\"", /* ' */ L"quot CDATA \""\"", /* " */ NULL }; int set_dialect_dtd(dtd *dtd, dtd_dialect dialect) { if ( dtd->dialect != dialect ) { dtd->dialect = dialect; switch(dialect) { case DL_SGML: { dtd->case_sensitive = FALSE; dtd->space_mode = SP_SGML; dtd->shorttag = TRUE; break; } case DL_XML: case DL_XMLNS: { const ichar **el; dtd_parser p; dtd->case_sensitive = TRUE; dtd->encoding = SGML_ENC_UTF8; dtd->space_mode = SP_PRESERVE; dtd->shorttag = FALSE; memset(&p, 0, sizeof(p)); p.dtd = dtd; for(el = xml_entities; *el; el++) process_entity_declaration(&p, *el); break; } } } return TRUE; } int set_option_dtd(dtd *dtd, dtd_option option, int set) { switch(option) { case OPT_SHORTTAG: dtd->shorttag = set; break; } return TRUE; } static const ichar * baseurl(dtd_parser *p) { if ( p->location.type == IN_FILE && p->location.name.file ) { return p->location.name.file; } return NULL; } static const ichar * process_entity_value_declaration(dtd_parser *p, const ichar *decl, dtd_entity *e) { dtd *dtd = p->dtd; const ichar *s; if ( e->type == ET_SYSTEM ) { if ( (s=itake_url(dtd, decl, &e->exturl)) ) { e->baseurl = istrdup(baseurl(p)); return s; } goto string_expected; } else { ichar *start; int len; ichar val[MAXSTRINGLEN]; if ( !(s = itake_string(dtd, decl, &start, &len)) ) goto string_expected; decl = s; expand_pentities(p, start, len, val, sizeof(val)/sizeof(ichar)); switch ( e->type ) { case ET_PUBLIC: { e->extid = istrdup(val); if ( isee_func(dtd, decl, CF_LIT) || isee_func(dtd, decl, CF_LITA) ) { if ( (s=itake_url(dtd, decl, &e->exturl)) ) { e->baseurl = istrdup(baseurl(p)); decl = s; } } return decl; } case ET_LITERAL: { e->value = istrdup(val); e->length = (int)wcslen(e->value); return decl; } default: assert(0); return NULL; } } string_expected: gripe(ERC_SYNTAX_ERROR, L"String expected", decl); return NULL; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - The sgml-standard tells us to accept the first definition of an entity, silently suppressing any further attempt to redefine the entity. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int process_entity_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; const ichar *s; dtd_symbol *id; dtd_entity *e; int isparam; int isdef = FALSE; /* parameter entity */ if ( (s=isee_func(dtd, decl, CF_PERO)) ) { isparam = TRUE; decl = s; } else isparam = FALSE; if ( !(s = itake_entity_name(dtd, decl, &id)) ) { if ( !(s = isee_identifier(dtd, decl, "#default")) ) return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); id = dtd_add_symbol(dtd, (ichar*)"#DEFAULT"); isdef = TRUE; } if ( isparam && find_pentity(dtd, id) ) { gripe(ERC_REDEFINED, L"parameter entity", id); return TRUE; /* already defined parameter entity */ } if ( id->entity ) { gripe(ERC_REDEFINED, L"entity", id); return TRUE; /* already defined normal entity */ } decl = iskip_layout(dtd, s); e = sgml_calloc(1, sizeof(*e)); e->name = id; e->catalog_location = (isparam ? CAT_PENTITY : CAT_ENTITY); if ( (s = isee_identifier(dtd, decl, "system")) ) { e->type = ET_SYSTEM; e->content = EC_SGML; decl = s; } else if ( (s = isee_identifier(dtd, decl, "public")) ) { e->type = ET_PUBLIC; e->content = EC_SGML; decl = s; } else { e->type = ET_LITERAL; if ( !isparam ) { if ( (s=isee_identifier(dtd, decl, "cdata")) ) { decl = s; e->content = EC_CDATA; } else if ( (s=isee_identifier(dtd, decl, "sdata")) ) { decl = s; e->content = EC_SDATA; } else if ( (s=isee_identifier(dtd, decl, "pi")) ) { decl = s; e->content = EC_PI; } else if ( (s=isee_identifier(dtd, decl, "starttag")) ) { decl = s; e->content = EC_STARTTAG; } else if ( (s=isee_identifier(dtd, decl, "endtag")) ) { decl = s; e->content = EC_ENDTAG; } else e->content = EC_SGML; } } if ( (decl=process_entity_value_declaration(p, decl, e)) ) { if ( e->type == ET_LITERAL ) { switch(e->content) { case EC_STARTTAG: { ichar *buf = sgml_malloc((e->length + 3)*sizeof(ichar)); buf[0] = dtd->charfunc->func[CF_STAGO]; istrcpy(&buf[1], e->value); buf[++e->length] = dtd->charfunc->func[CF_STAGC]; buf[++e->length] = 0; sgml_free(e->value); e->value = buf; e->content = EC_SGML; break; } case EC_ENDTAG: { ichar *buf = sgml_malloc((e->length + 4)*sizeof(ichar)); buf[0] = dtd->charfunc->func[CF_ETAGO1]; buf[1] = dtd->charfunc->func[CF_ETAGO2]; istrcpy(&buf[2], e->value); e->length++; buf[++e->length] = dtd->charfunc->func[CF_STAGC]; buf[++e->length] = 0; sgml_free(e->value); e->value = buf; e->content = EC_SGML; break; } default: break; } } else { if ( *decl ) { dtd_symbol *nname; if ( (s=isee_identifier(dtd, decl, "cdata")) ) { decl = s; e->content = EC_CDATA; } else if ( (s=isee_identifier(dtd, decl, "sdata")) ) { decl = s; e->content = EC_SDATA; } else if ( (s=isee_identifier(dtd, decl, "ndata")) ) { decl = s; e->content = EC_NDATA; } else return gripe(ERC_SYNTAX_ERROR, L"Bad datatype declaration", decl); if ( (s=itake_name(dtd, decl, &nname)) ) /* what is this? */ { decl = s; } else return gripe(ERC_SYNTAX_ERROR, L"Bad notation declaration", decl); } } if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl); } if ( isparam ) { e->next = dtd->pentities; dtd->pentities = e; } else { e->name->entity = e; e->next = dtd->entities; dtd->entities = e; } if ( isdef ) dtd->default_entity = e; return TRUE; } /******************************* * NOTATIONS * *******************************/ static dtd_notation * find_notation(dtd *dtd, dtd_symbol *name) { dtd_notation *n; for(n=dtd->notations; n; n = n->next) { if ( n->name == name ) return n; } return NULL; } static void add_notation(dtd *dtd, dtd_notation *not) { dtd_notation **n = &dtd->notations; for( ; *n; n = &(*n)->next) ; *n = not; } static int process_notation_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *nname; const ichar *s; ichar *system = NULL, *public = NULL; dtd_notation *not; if ( !(s=itake_name(dtd, decl, &nname)) ) return gripe(ERC_SYNTAX_ERROR, L"Notation name expected", decl); decl = s; if ( find_notation(dtd, nname) ) { gripe(ERC_REDEFINED, L"notation", nname); return TRUE; } if ( (s=isee_identifier(dtd, decl, "system")) ) { ; } else if ( (s=isee_identifier(dtd, decl, "public")) ) { decl = s; if ( !(s=itake_dubbed_string(dtd, decl, &public)) ) return gripe(ERC_SYNTAX_ERROR, L"Public identifier expected", decl); } else return gripe(ERC_SYNTAX_ERROR, L"SYSTEM or PUBLIC expected", decl); decl = s; if ( (s=itake_dubbed_string(dtd, decl, &system)) ) decl = s; if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl); not = sgml_calloc(1, sizeof(*not)); not->name = nname; not->system = system; not->public = public; not->next = NULL; add_notation(dtd, not); return TRUE; } static void free_notations(dtd_notation *n) { dtd_notation *next; for( ; n; n=next) { next = n->next; sgml_free(n->system); sgml_free(n->public); sgml_free(n); } } /******************************* * SHORTREF * *******************************/ static void free_maps(dtd_map *map) { dtd_map *next; for( ; map; map=next) { next = map->next; if ( map->from ) sgml_free(map->from); sgml_free(map); } } static void free_shortrefs(dtd_shortref *sr) { dtd_shortref *next; for( ; sr; sr=next) { next = sr->next; free_maps(sr->map); sgml_free(sr); } } static const ichar * shortref_add_map(dtd *dtd, const ichar *decl, dtd_shortref *sr) { ichar *start; int len; ichar from[MAXMAPLEN]; ichar *f = from; dtd_symbol *to; const ichar *s; const ichar *end; dtd_map **p; dtd_map *m; if ( !(s=itake_string(dtd, decl, &start, &len)) ) { gripe(ERC_SYNTAX_ERROR, L"map-string expected", decl); return NULL; } decl = s; if ( !(s=itake_entity_name(dtd, decl, &to)) ) { gripe(ERC_SYNTAX_ERROR, L"map-to name expected", decl); return NULL; } end = s; for(decl=start; len > 0;) { if ( *decl == 'B' ) /* blank */ { if ( decl[1] == 'B' ) { *f++ = CHR_DBLANK; decl += 2; len -= 2; continue; } *f++ = CHR_BLANK; decl++; len--; } else { *f++ = *decl++; /* any other character */ len--; } } *f = 0; for(p=&sr->map; *p; p = &(*p)->next) ; m = sgml_calloc(1, sizeof(*m)); m->from = istrdup(from); m->len = (int)istrlen(from); m->to = to; *p = m; return end; } static dtd_shortref * def_shortref(dtd_parser *p, dtd_symbol *name) { dtd *dtd = p->dtd; dtd_shortref *sr, **pr; for(pr=&dtd->shortrefs; *pr; pr = &(*pr)->next) { dtd_shortref *r = *pr; if ( r->name == name ) return r; } sr = sgml_calloc(1, sizeof(*sr)); sr->name = name; *pr = sr; return sr; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Create an array with TRUE in any character that can be the last of the shortref map. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static void compile_map(dtd *dtd, dtd_shortref *sr) { dtd_map *map; for(map = sr->map; map; map = map->next) { ichar last = map->from[map->len-1]; switch( last ) { case CHR_BLANK: case CHR_DBLANK: { wint_t i; for( i=0; i< SHORTMAP_SIZE; i++) { if ( HasClass(dtd, i, CH_BLANK) ) sr->ends[i] = TRUE; } } default: sr->ends[last] = TRUE; } } } static int process_shortref_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; ichar buf[MAXDECL]; dtd_shortref *sr; dtd_symbol *name; const ichar *s; if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = buf; if ( !(s=itake_name(dtd, decl, &name)) ) return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); decl = s; sr = def_shortref(p, name); if ( sr->defined ) { gripe(ERC_REDEFINED, L"shortref", name); return TRUE; } sr->defined = TRUE; while( *(decl = iskip_layout(dtd, decl)) != '\0' && (s=shortref_add_map(dtd, decl, sr)) ) decl = s; compile_map(dtd, sr); if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Map expected", decl); return TRUE; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Find named name. The name NULL stands for the #empty map - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static dtd_shortref * find_map(dtd *dtd, dtd_symbol *name) { dtd_shortref *sr; if ( !name ) { static dtd_shortref *empty; if ( !empty ) { empty = sgml_calloc(1, sizeof(*empty)); empty->name = dtd_add_symbol(dtd, (ichar*)"#EMPTY"); empty->defined = TRUE; } return empty; } for( sr = dtd->shortrefs; sr; sr = sr->next ) { if ( sr->name == name ) { if ( !sr->defined ) break; return sr; } } return NULL; } static void set_map_element(dtd_element *e, void *closure) { e->map = closure; } static int process_usemap_declaration(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; ichar buf[MAXDECL]; dtd_symbol *name; const ichar *s; dtd_symbol *ename; dtd_element *e; dtd_shortref *map; if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = buf; if ( !(s=itake_name(dtd, decl, &name)) ) { if ( (s=isee_identifier(dtd, decl, "#empty")) ) name = NULL; else return gripe(ERC_SYNTAX_ERROR, L"map-name expected", decl); } decl = s; if ( !(map = find_map(dtd, name)) ) map = def_shortref(p, name); /* make undefined map */ if ( isee_func(dtd, decl, CF_GRPO) ) /* ( */ { dtd_model *model; if ( (model = make_model(dtd, decl, &s)) ) { for_elements_in_model(model, set_map_element, map); free_model(model); decl = s; } else return FALSE; } else if ( (s=itake_name(dtd, decl, &ename)) ) { e = find_element(dtd, ename); e->map = map; decl = s; } else if ( p->environments ) { if ( !map->defined ) gripe(ERC_EXISTENCE, L"map", name->name); p->environments->map = map; p->map = p->environments->map; } else return gripe(ERC_SYNTAX_ERROR, L"element-name expected", decl); if ( *decl ) return gripe(ERC_SYNTAX_ERROR, L"Unparsed", decl); return TRUE; } static int match_map(dtd *dtd, dtd_map *map, ocharbuf *buf) { wchar_t *data = buf->data.w; wchar_t *e = data+buf->size-1; ichar *m = map->from+map->len-1; while( m >= map->from ) { if ( e < data ) return 0; if ( *m == *e ) { m--; e--; continue; } if ( *m == CHR_DBLANK ) { if ( e>data && HasClass(dtd, *e, CH_WHITE) ) e--; else return FALSE; goto wblank; } if ( *m == CHR_BLANK ) { wblank: while( e>data && HasClass(dtd, *e, CH_WHITE) ) e--; m--; continue; } return 0; } return (int)(data+buf->size-1-e); } static int match_shortref(dtd_parser *p) { dtd_map *map; for(map = p->map->map; map; map = map->next) { int len; if ( (len=match_map(p->dtd, map, p->cdata)) ) { p->cdata->size -= len; if ( p->cdata_must_be_empty ) { int blank = TRUE; const wchar_t *s; int i; for(s = p->cdata->data.w, i=0; i++ < p->cdata->size; s++) { if ( !iswspace(*s) ) { blank = FALSE; break; } } p->blank_cdata = blank; } WITH_CLASS(p, EV_SHORTREF, { sgml_cplocation(&p->startloc, &p->location); p->startloc.charpos -= len; p->startloc.linepos -= len; if ( p->startloc.linepos < 0 ) { p->startloc.line--; p->startloc.linepos = 0; /* not correct! */ } DEBUG(printf("%d-%d: Matched map '%s' --> %s, len = %d\n", p->startloc.charpos, p->location.charpos, map->from, map->to->name, len)); process_entity(p, map->to->name); }) /* TBD: optimise */ return TRUE; } } return FALSE; } /******************************* * ELEMENTS * *******************************/ static void add_submodel(dtd_model *m, dtd_model *sub) { dtd_model **d; for( d = &m->content.group; *d; d = &(*d)->next ) ; *d = sub; } /* for_elements_in_model() Walk along the model, calling f(e, closure) for any element found in the model. Used for */ static void for_elements_in_model(dtd_model *m, void (*f)(dtd_element *e, void *closure), void *closure) { switch(m->type) { case MT_SEQ: case MT_AND: case MT_OR: { dtd_model *sub = m->content.group; for(; sub; sub = sub->next) for_elements_in_model(sub, f, closure); break; } case MT_ELEMENT: (*f)(m->content.element, closure); break; default: ; } } static void free_model(dtd_model *m) { switch(m->type) { case MT_SEQ: case MT_AND: case MT_OR: { dtd_model *sub = m->content.group; dtd_model *next; for(; sub; sub = next) { next = sub->next; free_model(sub); } } default: ; } sgml_free(m); } static dtd_model * make_model(dtd *dtd, const ichar *decl, const ichar **end) { const ichar *s; dtd_model *m = sgml_calloc(1, sizeof(*m)); dtd_symbol *id; decl = iskip_layout(dtd, decl); if ( (s=isee_identifier(dtd, decl, "#pcdata")) ) { m->type = MT_PCDATA; m->cardinality = MC_ONE; /* actually don't care */ *end = s; return m; } if ( (s=itake_name(dtd, decl, &id)) ) { m->type = MT_ELEMENT; m->content.element = find_element(dtd, id); decl = s; } else { if ( !(s=isee_func(dtd, decl, CF_GRPO)) ) { gripe(ERC_SYNTAX_ERROR, L"Name group expected", decl); free_model(m); return NULL; } decl = s; for(;;) { dtd_model *sub; modeltype mt; if ( !(sub = make_model(dtd, decl, &s)) ) { free_model(sub); return NULL; } decl = s; add_submodel(m, sub); if ( (s = isee_func(dtd, decl, CF_OR)) ) { decl = s; mt = MT_OR; } else if ( (s = isee_func(dtd, decl, CF_SEQ)) ) { decl = s; mt = MT_SEQ; } else if ( (s = isee_func(dtd, decl, CF_AND)) ) { decl = s; mt = MT_AND; } else if ( (s = isee_func(dtd, decl, CF_GRPC)) ) { decl = s; break; } else { gripe(ERC_SYNTAX_ERROR, L"Connector ('|', ',' or '&') expected", decl); free_model(m); return NULL; } decl = iskip_layout(dtd, decl); if ( m->type != mt ) { if ( !m->type ) m->type = mt; else { gripe(ERC_SYNTAX_ERROR, L"Different connector types in model", decl); free_model(m); return NULL; } } } } if ( (s = isee_func(dtd, decl, CF_OPT)) ) { decl = s; m->cardinality = MC_OPT; } else if ( (s=isee_func(dtd, decl, CF_REP)) ) { decl = s; m->cardinality = MC_REP; } else if ( (s=isee_func(dtd, decl, CF_PLUS)) ) { /* ROK: watch out for (x) +(y) */ if ( isee_func(dtd, iskip_layout(dtd, s), CF_GRPO) == NULL ) { decl = s; m->cardinality = MC_PLUS; } } else m->cardinality = MC_ONE; if ( m->type == MT_UNDEF ) /* simplify (e+), etc. */ { dtd_model *sub = m->content.group; modelcard card; assert(!sub->next); if ( sub->cardinality == MC_ONE ) card = m->cardinality; else if ( m->cardinality == MC_ONE ) card = sub->cardinality; else { m->type = MT_OR; goto out; } *m = *sub; m->cardinality = card; sgml_free(sub); } out: *end = iskip_layout(dtd, decl); return m; } static const ichar * process_model(dtd *dtd, dtd_edef *e, const ichar *decl) { const ichar *s; decl = iskip_layout(dtd, decl); if ( (s = isee_identifier(dtd, decl, "empty")) ) { e->type = C_EMPTY; return s; } if ( (s = isee_identifier(dtd, decl, "cdata")) ) { e->type = C_CDATA; return s; } if ( (s = isee_identifier(dtd, decl, "rcdata")) ) { e->type = C_RCDATA; return s; } if ( (s = isee_identifier(dtd, decl, "any")) ) { e->type = C_ANY; return s; } e->type = C_PCDATA; if ( !(e->content = make_model(dtd, decl, &decl)) ) return FALSE; return decl; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - See a name-group separator. As long as we haven't decided, this can be CF_NG. If we have decided they must all be the same. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static const ichar * isee_ngsep(dtd *dtd, const ichar *decl, charfunc *sep) { const ichar *s; if ( (s=isee_func(dtd, decl, *sep)) ) return iskip_layout(dtd, s); if ( *sep == CF_NG ) /* undecided */ { static const charfunc ng[] = { CF_SEQ, CF_OR, CF_AND }; int n; for(n=0; n<3; n++) { if ( (s=isee_func(dtd, decl, ng[n])) ) { *sep = ng[n]; return iskip_layout(dtd, s); } } } return NULL; } static const ichar * itake_namegroup(dtd *dtd, const ichar *decl, dtd_symbol **names, int *n) { const ichar *s; int en = 0; if ( (s=isee_func(dtd, decl, CF_GRPO)) ) { charfunc ngs = CF_NG; for(;;) { if ( !(decl=itake_name(dtd, s, &names[en++])) ) { gripe(ERC_SYNTAX_ERROR, L"Name expected", s); return NULL; } if ( (s=isee_ngsep(dtd, decl, &ngs)) ) { decl = iskip_layout(dtd, s); continue; } if ( (s=isee_func(dtd, decl, CF_GRPC)) ) { *n = en; decl = s; return iskip_layout(dtd, decl); } gripe(ERC_SYNTAX_ERROR, L"Bad name-group", decl); return NULL; } } return NULL; } typedef struct { dtd_symbol **list; int size; } namelist; static void add_list_element(dtd_element *e, void *closure) { namelist *nl = closure; nl->list[nl->size++] = e->name; } static const ichar * itake_el_or_model_element_list(dtd *dtd, const ichar *decl, dtd_symbol **names, int *n) { const ichar *s; if ( isee_func(dtd, decl, CF_GRPO) ) { dtd_model *model; if ( (model = make_model(dtd, decl, &s)) ) { namelist nl; nl.list = names; nl.size = 0; for_elements_in_model(model, add_list_element, &nl); free_model(model); *n = nl.size; return s; } else return NULL; } else { if ( !(s = itake_name(dtd, decl, &names[0])) ) { gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); return NULL; } *n = 1; return s; } } static void add_element_list(dtd_element_list **l, dtd_element *e) { dtd_element_list *n = sgml_calloc(1, sizeof(*n)); n->value = e; for( ; *l; l = &(*l)->next ) ; *l = n; } static int process_element_declaraction(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; ichar buf[MAXDECL]; const ichar *s; dtd_symbol *eid[MAXATTELEM]; dtd_edef *def; int en; int i; /* expand parameter entities */ if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = buf; if ( !(s=itake_el_or_model_element_list(dtd, decl, eid, &en)) ) return gripe(ERC_SYNTAX_ERROR, L"Name or name-group expected", decl); decl = s; if ( en == 0 ) return TRUE; /* 0 elements */ STAT(edefs_decl++); def = new_element_definition(dtd); for(i=0; ielement->structure == NULL); eid[i]->element->structure = def; eid[i]->element->undefined = FALSE; } def->references = en; /* for GC */ /* omitted tag declarations (opt) */ if ( (s = isee_identifier(dtd, decl, "-")) ) { def->omit_close = FALSE; goto seeclose; } else if ( (s = isee_identifier(dtd, decl, "o")) ) { def->omit_open = TRUE; seeclose: decl = s; if ( (s = isee_identifier(dtd, decl, "-")) ) { def->omit_close = FALSE; } else if ( (s = isee_identifier(dtd, decl, "o")) ) { for(i=0; iomit_close = TRUE; } else return gripe(ERC_SYNTAX_ERROR, L"Bad omit-tag declaration", decl); decl = s; } /* content model */ if ( !(decl=process_model(dtd, def, decl)) ) return FALSE; /* in/excluded elements */ if ( decl[0] == '-' || decl[0] == '+' ) { dtd_symbol *ng[MAXNAMEGROUP]; int ns; dtd_element_list **l; if ( decl[0] == '-' ) l = &def->excluded; else l = &def->included; decl++; if ( (s=itake_namegroup(dtd, decl, ng, &ns)) ) { int i; decl = s; for(i=0; ivalue = s; for( ; *nl; nl = &(*nl)->next ) ; *nl = n; } static void set_element_properties(dtd_element *e, dtd_attr *a) { if ( istreq(a->name->name, L"xml:space") ) { switch(a->def) { case AT_FIXED: case AT_DEFAULT: break; default: return; } switch (a->type ) { case AT_NAMEOF: case AT_NAME: case AT_NMTOKEN: e->space_mode = istr_to_space_mode(a->att_def.name->name); break; case AT_CDATA: e->space_mode = istr_to_space_mode((ichar *)a->att_def.cdata); break; default: break; } } } static void add_attribute(dtd *dtd, dtd_element *e, dtd_attr *a) { dtd_attr_list **l; dtd_attr_list *n; for(l = &e->attributes; *l; l = &(*l)->next) { if ( (*l)->attribute->name == a->name ) { gripe(ERC_REDEFINED, L"attribute", a->name); a->references++; /* attempt to redefine attribute: */ free_attribute(a); /* first wins according to standard */ return; } } n = sgml_calloc(1, sizeof(*n)); n->attribute = a; a->references++; *l = n; set_element_properties(e, a); } static int process_attlist_declaraction(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *eid[MAXATTELEM]; int i, en; ichar buf[MAXDECL]; const ichar *s; /* expand parameter entities */ if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) return FALSE; decl = iskip_layout(dtd, buf); DEBUG(printf("Expanded to %s\n", decl)); if ( !(decl=itake_el_or_model_element_list(dtd, decl, eid, &en)) ) return FALSE; /* fetch attributes */ while(*decl) { dtd_attr *at = sgml_calloc(1, sizeof(*at)); at->references = REFS_VIRGIN; /* name of attribute */ if ( !(s = itake_name(dtd, decl, &at->name)) ) { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); } decl = s; /* (name1|name2|...) type */ if ( (s=isee_func(dtd, decl, CF_GRPO)) ) { charfunc ngs = CF_NG; at->type = AT_NAMEOF; decl=s; for(;;) { dtd_symbol *nm; if ( !(s = itake_nmtoken(dtd, decl, &nm)) ) { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); } decl = s; add_name_list(&at->typeex.nameof, nm); if ( (s=isee_ngsep(dtd, decl, &ngs)) ) { decl = s; continue; } if ( (s = isee_func(dtd, decl, CF_GRPC)) ) { decl=s; decl = iskip_layout(dtd, decl); break; } free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Illegal name-group", decl); } } else if ( (s=isee_identifier(dtd, decl, "cdata")) ) { decl = s; at->type = AT_CDATA; } else if ( (s=isee_identifier(dtd, decl, "entity")) ) { decl = s; at->type = AT_ENTITY; } else if ( (s=isee_identifier(dtd, decl, "entities")) ) { decl = s; at->type = AT_ENTITIES; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "id")) ) { decl = s; at->type = AT_ID; } else if ( (s=isee_identifier(dtd, decl, "idref")) ) { decl = s; at->type = AT_IDREF; } else if ( (s=isee_identifier(dtd, decl, "idrefs")) ) { decl = s; at->type = AT_IDREFS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "name")) ) { decl = s; at->type = AT_NAME; } else if ( (s=isee_identifier(dtd, decl, "names")) ) { decl = s; at->type = AT_NAMES; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "nmtoken")) ) { decl = s; at->type = AT_NMTOKEN; } else if ( (s=isee_identifier(dtd, decl, "nmtokens")) ) { decl = s; at->type = AT_NMTOKENS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "number")) ) { decl = s; at->type = AT_NUMBER; } else if ( (s=isee_identifier(dtd, decl, "numbers")) ) { decl = s; at->type = AT_NUMBERS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "nutoken")) ) { decl = s; at->type = AT_NUTOKEN; } else if ( (s=isee_identifier(dtd, decl, "nutokens")) ) { decl = s; at->type = AT_NUTOKENS; at->islist = TRUE; } else if ( (s=isee_identifier(dtd, decl, "notation")) ) { dtd_symbol *ng[MAXNAMEGROUP]; int ns; at->type = AT_NOTATION; decl=s; if ( (s=itake_namegroup(dtd, decl, ng, &ns)) ) { decl = s; for(i=0; itypeex.nameof, ng[i]); } else { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"name-group expected", decl); } } else { free_attribute(at); return gripe(ERC_SYNTAX_ERROR, L"Attribute-type expected", decl); } /* Attribute Defaults */ if ( (s=isee_identifier(dtd, decl, "#fixed")) ) { decl = s; at->def = AT_FIXED; } else if ( (s=isee_identifier(dtd, decl, "#required")) ) { decl = s; at->def = AT_REQUIRED; } else if ( (s=isee_identifier(dtd, decl, "#current")) ) { decl = s; at->def = AT_CURRENT; } else if ( (s=isee_identifier(dtd, decl, "#conref")) ) { decl = s; at->def = AT_CONREF; } else if ( (s=isee_identifier(dtd, decl, "#implied")) ) { decl = s; at->def = AT_IMPLIED; } else /* real default */ at->def = AT_DEFAULT; if ( at->def == AT_DEFAULT || at->def == AT_FIXED ) { ichar buf[MAXSTRINGLEN]; ichar *start; int len; const ichar *end; if ( !(end=itake_string(dtd, decl, &start, &len)) ) { end=itake_nmtoken_chars(dtd, decl, buf, sizeof(buf)/sizeof(ichar)); start = buf; len = (int)istrlen(buf); } if ( !end ) return gripe(ERC_SYNTAX_ERROR, L"Bad attribute default", decl); /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Note: itake_name(), etc. work on nul-terminated strings. The result of itake_string() is a pointer in a nul-terminated string and these functions will stop scanning at the quote anyway, so we can use the length of the parsed data to verify we parsed all of it. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ switch(at->type) { case AT_CDATA: { at->att_def.cdata = istrndup(start, len); break; } case AT_ENTITY: case AT_NOTATION: case AT_NAME: { if ( !(s=itake_name(dtd, start, &at->att_def.name)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"name", decl); break; } case AT_NMTOKEN: case AT_NAMEOF: { if ( !(s=itake_nmtoken(dtd, start, &at->att_def.name)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"nmtoken", decl); break; } case AT_NUTOKEN: { if ( !(s=itake_nutoken(dtd, start, &at->att_def.name)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"nutoken", decl); break; } case AT_NUMBER: { if ( !(s=itake_number(dtd, start, at)) || (s-start) != len ) return gripe(ERC_DOMAIN, L"number", decl); break; } case AT_NAMES: case AT_ENTITIES: case AT_IDREFS: case AT_NMTOKENS: case AT_NUMBERS: case AT_NUTOKENS: { at->att_def.list = istrndup(buf, len); break; } default: { free_attribute(at); return gripe(ERC_REPRESENTATION, L"No default for type"); } } decl = end; } /* add to list */ at->references = 0; for(i=0; iparent) { if ( env->element->structure ) { dtd_edef *def = env->element->structure; dtd_element_list *el; for(el=def->excluded; el; el=el->next) { if ( el->value == e ) return IE_EXCLUDED; } for(el=def->included; el; el=el->next) { if ( el->value == e ) return IE_INCLUDED; } } } return IE_NORMAL; } static int complete(sgml_environment *env) { if ( env->element->structure && !env->element->undefined && env->element->structure->type != C_ANY ) { dtd_edef *def = env->element->structure; if ( !same_state(def->final_state, env->state) ) return FALSE; } return TRUE; } static void validate_completeness(sgml_environment *env) { if ( !complete(env) ) { wchar_t buf[MAXNMLEN+50]; swprintf(buf, MAXNMLEN+50, L"Incomplete element: <%s>", env->element->name->name); gripe(ERC_VALIDATE, buf); /* TBD: expected */ } } static sgml_environment * push_element(dtd_parser *p, dtd_element *e, int callback) { if ( e != CDATA_ELEMENT ) { sgml_environment *env = sgml_calloc(1, sizeof(*env)); emit_cdata(p, FALSE); env->element = e; env->state = make_state_engine(e); env->space_mode = (p->environments ? p->environments->space_mode : p->dtd->space_mode); env->parent = p->environments; p->environments = env; if ( p->dtd->shorttag ) { env->saved_waiting_for_net = p->waiting_for_net; if ( p->event_class == EV_SHORTTAG ) { p->waiting_for_net = TRUE; env->wants_net = TRUE; } else { env->wants_net = FALSE; if ( e->structure && e->structure->omit_close == FALSE ) p->waiting_for_net = FALSE; } } if ( e->map ) p->map = env->map = e->map; else if ( env->parent ) p->map = env->map = env->parent->map; p->first = TRUE; if ( callback && p->on_begin_element ) { sgml_attribute atts[MAXATTRIBUTES]; int natts = 0; if ( !(p->flags & SGML_PARSER_NODEFS) ) natts = add_default_attributes(p, e, natts, atts); (*p->on_begin_element)(p, e, natts, atts); } if ( e->structure ) { if ( e->structure->type == C_CDATA || e->structure->type == C_RCDATA ) { p->state = (e->structure->type == C_CDATA ? S_CDATA : S_RCDATA); p->cdata_state = p->state; p->etag = e->name->name; p->etaglen = (int)istrlen(p->etag); sgml_cplocation(&p->startcdata, &p->location); } else p->cdata_state = S_PCDATA; } } return p->environments; } static void free_environment(sgml_environment *env) { #ifdef XMLNS if ( env->xmlns ) xmlns_free(env); #endif sgml_free(env); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Pop the stack, closing all environment uptil `to'. The close was initiated by pushing the element `e'. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int pop_to(dtd_parser *p, sgml_environment *to, dtd_element *e0) { sgml_environment *env, *parent; for(env = p->environments; env != to; env=parent) { dtd_element *e = env->element; validate_completeness(env); parent = env->parent; if ( e->structure && !e->structure->omit_close ) gripe(ERC_OMITTED_CLOSE, e->name->name); if ( e0 != CDATA_ELEMENT ) emit_cdata(p, TRUE); p->first = FALSE; p->environments = env; if ( p->dtd->shorttag ) p->waiting_for_net = env->saved_waiting_for_net; WITH_CLASS(p, EV_OMITTED, if ( p->on_end_element ) (*p->on_end_element)(p, e)); free_environment(env); } p->environments = to; p->map = to->map; return TRUE; } static void allow_for(dtd_element *in, dtd_element *e) { dtd_edef *def = in->structure; dtd_model *g; if ( def->type == C_EMPTY ) { def->type = C_PCDATA; def->content = sgml_calloc(1, sizeof(*def->content)); def->content->type = MT_OR; def->content->cardinality = MC_REP; } assert(def->content->type == MT_OR); g = def->content->content.group; if ( e == CDATA_ELEMENT ) { dtd_model *m; for(; g; g = g->next) { if ( g->type == MT_PCDATA ) return; } m = sgml_calloc(1, sizeof(*m)); m->type = MT_PCDATA; m->cardinality = MC_ONE; /* ignored */ add_submodel(def->content, m); } else { dtd_model *m; for(; g; g = g->next) { if ( g->type == MT_ELEMENT && g->content.element == e ) return; } m = sgml_calloc(1, sizeof(*m)); m->type = MT_ELEMENT; m->cardinality = MC_ONE; /* ignored */ m->content.element = e; add_submodel(def->content, m); } } static int open_element(dtd_parser *p, dtd_element *e, int warn) { if ( !p->environments && p->enforce_outer_element ) { dtd_element *f = p->enforce_outer_element->element; if ( f && f != e ) { if ( !f->structure || !f->structure->omit_open ) gripe(ERC_OMITTED_OPEN, f->name->name); WITH_CLASS(p, EV_OMITTED, { open_element(p, f, TRUE); if ( p->on_begin_element ) { sgml_attribute atts[MAXATTRIBUTES]; int natts = 0; if ( !(p->flags & SGML_PARSER_NODEFS) ) natts = add_default_attributes(p, f, natts, atts); (*p->on_begin_element)(p, f, natts, atts); } }); } } /* no DTD available yet */ if ( !p->environments && !p->dtd->doctype && e != CDATA_ELEMENT ) { const ichar *file; file = find_in_catalogue(CAT_DOCTYPE, e->name->name, NULL, NULL, p->dtd->dialect != DL_SGML); if ( file ) { dtd_parser *clone = clone_dtd_parser(p); gripe(ERC_NO_DOCTYPE, e->name->name, file); if ( load_dtd_from_file(clone, file) ) p->dtd->doctype = istrdup(e->name->name); else gripe(ERC_EXISTENCE, L"file", file); free_dtd_parser(clone); } } if ( p->environments ) { sgml_environment *env = p->environments; if ( env->element->undefined ) { allow_for(env->element, e); /* */ push_element(p, e, FALSE); return TRUE; } if ( env->element->structure && env->element->structure->type == C_ANY ) { if ( e != CDATA_ELEMENT && e->undefined ) gripe(ERC_EXISTENCE, L"Element", e->name->name); push_element(p, e, FALSE); return TRUE; } switch(in_or_excluded(env, e)) { case IE_INCLUDED: push_element(p, e, FALSE); return TRUE; case IE_EXCLUDED: if ( warn ) gripe(ERC_NOT_ALLOWED, e->name->name); /*FALLTHROUGH*/ case IE_NORMAL: for(; env; env=env->parent) { dtd_state *new; if ( (new = make_dtd_transition(env->state, e)) ) { env->state = new; pop_to(p, env, e); push_element(p, e, FALSE); return TRUE; } else { dtd_element *oe[MAXOMITTED]; /* omitted open */ int olen; int i; if ( (olen=find_omitted_path(env->state, e, oe)) > 0 ) { pop_to(p, env, e); WITH_CLASS(p, EV_OMITTED, for(i=0; istate = make_dtd_transition(env->state, oe[i]); env = push_element(p, oe[i], TRUE); }) env->state = make_dtd_transition(env->state, e); push_element(p, e, FALSE); return TRUE; } } if ( !env->element->structure || !env->element->structure->omit_close ) break; } } if ( warn ) { if ( e == CDATA_ELEMENT ) gripe(ERC_VALIDATE, L"#PCDATA not allowed here"); else if ( e->undefined ) gripe(ERC_EXISTENCE, L"Element", e->name->name); else gripe(ERC_NOT_ALLOWED, e->name->name); } } if ( warn ) { push_element(p, e, FALSE); return TRUE; } else return FALSE; } static int close_element(dtd_parser *p, dtd_element *e, int conref) { sgml_environment *env; for(env = p->environments; env; env=env->parent) { if ( env->element == e ) /* element is open */ { sgml_environment *parent; for(env = p->environments; ; env=parent) { dtd_element *ce = env->element; if ( !(conref && env == p->environments) ) validate_completeness(env); parent = env->parent; p->first = FALSE; if ( p->on_end_element ) (*p->on_end_element)(p, env->element); free_environment(env); p->environments = parent; if ( ce == e ) /* closing current element */ { p->map = (parent ? parent->map : NULL); return TRUE; } else /* omited close */ { if ( ce->structure && !ce->structure->omit_close ) gripe(ERC_OMITTED_CLOSE, ce->name->name); } } } } return gripe(ERC_NOT_OPEN, e->name->name); } static int close_current_element(dtd_parser *p) { if ( p->environments ) { dtd_element *e = p->environments->element; emit_cdata(p, TRUE); return close_element(p, e, FALSE); } return gripe(ERC_SYNTAX_ERROR, L"No element to close", ""); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - get_attribute_value() Get the value for an attribute. Once I thought this was simple, but Richard O'Keefe pointed to the complex handling of white-space in SGML attributes. Basically, if the attribute is quoted, we need: * If CDATA, map all blank to space characters, then expand entities * If !CDATA expand all entities, canonise white space by deleting leading and trailing space and squishing multiple space characters to a single (lower for us) case. This almost, but not completely matches the XML definition. This however is so complex we will ignore it for now. [Rewritten by Richard O'Keefe with these addional comments] Reads a value, the attribute name and value indicator having been processed already. It calls itake_string() to read quoted values, and itake_unquoted() to read unquoted values. itake_string(dtd, in, buf, size) - skips layout INCLUDING comments, - returns NULL if the next character is not ' or ", - copies characters from in to buf until a matching ' or " is found, - adds a terminating \0, - skips more layout INCLUDING comments, and - returns the new input position. It is quite wrong to skip leading comments here. In the tag the characters "--ugh--" *are the value*. They are not a comment. Comments are not in fact allowed inside tags, unfortunately. This tag is equivalent to where something is an attribute that has zoo as one of its enumerals. Because itake_string() is called in many other places, this bug has not yet been fixed. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static ichar const * get_attribute_value(dtd_parser *p, ichar const *decl, sgml_attribute *att) { ichar tmp[MAXSTRINGLEN]; ichar *buf = tmp; ichar const *s; ichar c; dtd *dtd = p->dtd; ichar const *end; ichar *start; int len; enum { DIG_FIRST = 8, /* any token start with digit? */ NAM_FIRST = 4, /* any token start with non-digit name char? */ NAM_LATER = 2, /* any token have non-digit name char later? */ ANY_OTHER = 1, /* any token have illegal character? */ YET_EMPTY = 0 } token = YET_EMPTY; att->value.textW = NULL; /* UCS text */ att->value.number = 0; att->flags = 0; end = itake_string(dtd, decl, &start, &len); if ( end != NULL ) { ocharbuf out; init_ocharbuf(&out); expand_entities(p, start, len, &out); if ( att->definition->type == AT_CDATA ) { malloc_ocharbuf(&out); att->value.number = out.size; att->value.textW = out.data.w; return end; } else { ichar *d; buf = out.data.w; /* canonicalise blanks */ s = buf; while ((c = *s++) != '\0' && HasClass(dtd, c, CH_BLANK)) ; d = buf; while ( c != '\0' ) { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST : HasClass(dtd, c, CH_NAME) ? NAM_FIRST : /* oops! */ ANY_OTHER; if ( d != buf ) *d++ = ' '; if ( dtd->case_sensitive ) { *d++ = c; while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK)) { token |= HasClass(dtd, c, CH_DIGIT) ? 0 : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; *d++ = c; } } else { *d++ = towlower(c); while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK)) { token |= HasClass(dtd, c, CH_DIGIT) ? 0 : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; *d++ = towlower(c); } } while (c != '\0' && HasClass(dtd, c, CH_BLANK)) c = *s++; } *d = '\0'; } } else { end = itake_unquoted(dtd, decl, tmp, sizeof(tmp)/sizeof(ichar)); if (end == NULL) return NULL; s = buf; c = *s++; if (c != '\0') { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST : HasClass(dtd, c, CH_NAME) ? NAM_FIRST : /* oops! */ ANY_OTHER; while ((c = *s++) != 0) { token |= HasClass(dtd, c, CH_DIGIT) ? 0 : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER; } } if ( token == YET_EMPTY || (token & ANY_OTHER) != 0) gripe(ERC_SYNTAX_WARNING, L"Attribute value requires quotes", buf); if (!dtd->case_sensitive && att->definition->type != AT_CDATA) istrlower(buf); } switch (att->definition->type) { case AT_NUMBER: /* number */ if (token != DIG_FIRST) { gripe(ERC_SYNTAX_WARNING, L"NUMBER expected", decl); } else if (dtd->number_mode == NU_INTEGER) { (void) istrtol(buf, &att->value.number); } else { att->value.textW = istrdup(buf); att->value.number = (long)istrlen(buf); } return end; case AT_CDATA: /* CDATA attribute */ att->value.textW = istrdup(buf); att->value.number = (long)istrlen(buf); return end; case AT_ID: /* identifier */ case AT_IDREF: /* identifier reference */ case AT_NAME: /* name token */ case AT_NOTATION: /* notation-name */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NAME expected", decl); break; case AT_NAMEOF: /* one of these names */ case AT_NMTOKEN: /* name-token */ if (token == YET_EMPTY || (token & ANY_OTHER) != 0) gripe(ERC_SYNTAX_WARNING, L"NMTOKEN expected", decl); if ( att->definition->type == AT_NAMEOF ) { dtd_name_list *nl; for(nl=att->definition->typeex.nameof; nl; nl = nl->next) { if ( istreq(nl->value->name, buf) ) goto passed; } gripe(ERC_SYNTAX_WARNING, L"unexpected value", decl); } break; case AT_NUTOKEN: /* number token */ if ((token & (NAM_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NUTOKEN expected", decl); break; case AT_ENTITY: /* entity-name */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"entity NAME expected", decl); break; case AT_NAMES: /* list of names */ case AT_IDREFS: /* list of identifier references */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NAMES expected", decl); break; case AT_ENTITIES: /* entity-name list */ if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"entity NAMES expected", decl); break; case AT_NMTOKENS: /* name-token list */ if (token == YET_EMPTY || (token & ANY_OTHER) != 0) gripe(ERC_SYNTAX_WARNING, L"NMTOKENS expected", decl); break; case AT_NUMBERS: /* number list */ if (token != DIG_FIRST) gripe(ERC_SYNTAX_WARNING, L"NUMBERS expected", decl); break; case AT_NUTOKENS: if ((token & (NAM_FIRST | ANY_OTHER)) != 0) gripe(ERC_SYNTAX_WARNING, L"NUTOKENS expected", decl); break; default: assert(0); return NULL; } passed: att->value.textW = istrdup(buf); /* TBD: more validation */ att->value.number = (long)istrlen(buf); return end; } static const ichar * process_attributes(dtd_parser *p, dtd_element *e, const ichar *decl, sgml_attribute *atts, int *argc) { int attn = 0; dtd *dtd = p->dtd; decl = iskip_layout(dtd, decl); while(decl && *decl) { dtd_symbol *nm; const ichar *s; if ( (s=itake_nmtoken(dtd, decl, &nm)) ) { decl = s; if ( (s=isee_func(dtd, decl, CF_VI)) ) /* name= */ { dtd_attr *a; if ( !HasClass(dtd, nm->name[0], CH_NMSTART) ) gripe(ERC_SYNTAX_WARNING, "Illegal start of attribute-name", decl); decl = s; if ( !(a=find_attribute(e, nm)) ) { a = sgml_calloc(1, sizeof(*a)); a->name = nm; a->type = AT_CDATA; a->def = AT_IMPLIED; add_attribute(dtd, e, a); if ( !e->undefined && !(dtd->dialect != DL_SGML && (istreq(L"xmlns", nm->name) || istrprefix(L"xmlns:", nm->name))) ) gripe(ERC_NO_ATTRIBUTE, e->name->name, nm->name); } atts[attn].definition = a; if ( (decl=get_attribute_value(p, decl, atts+attn)) ) { attn++; continue; } } else if ( e->structure ) { dtd_attr_list *al; /* value shorthand */ for(al=e->attributes; al; al=al->next) { dtd_attr *a = al->attribute; if ( a->type == AT_NAMEOF || a->type == AT_NOTATION ) { dtd_name_list *nl; for(nl=a->typeex.nameof; nl; nl = nl->next) { if ( nl->value == nm ) { if ( dtd->dialect != DL_SGML ) gripe(ERC_SYNTAX_WARNING, "Value short-hand in XML mode", decl); atts[attn].flags = 0; atts[attn].definition = a; atts[attn].value.textW = istrdup(nm->name); atts[attn].value.number = (long)istrlen(nm->name); attn++; goto next; } } } } gripe(ERC_NO_ATTRIBUTE_VALUE, e->name->name, nm->name); decl = s; } else { gripe(ERC_SYNTAX_ERROR, L"Bad attribute", decl); decl = s; } } else { *argc = attn; return decl; } next: ; } *argc = attn; return decl; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - sgml_add_default_attributes() This function adds attributes for omitted default and fixed attributes. These attributes are added to the end of the attribute list. This function returns the new number of attributes. The `atts' array is assumed to be MAXATTRIBUTES long, normally passed from process_begin_element. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int add_default_attributes(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts) { dtd_attr_list *al; if ( e == CDATA_ELEMENT ) return natts; for(al=e->attributes; al; al=al->next) { dtd_attr *a = al->attribute; switch(a->def) { case AT_REQUIRED: /* TBD: check if present */ case AT_CURRENT: /* TBD: register in DTD and reuse */ case AT_CONREF: case AT_IMPLIED: goto next; case AT_FIXED: case AT_DEFAULT: { int i; sgml_attribute *ap; for(i=0, ap=atts; idefinition == a ) goto next; } ap->definition = a; ap->value.textW = NULL; ap->value.number = 0; ap->flags = SGML_AT_DEFAULT; switch(a->type) { case AT_CDATA: ap->value.textW = a->att_def.cdata; ap->value.number = (long)istrlen(ap->value.textW); break; case AT_NUMBER: if ( p->dtd->number_mode == NU_TOKEN ) { ap->value.textW = (ichar*)a->att_def.name->name; ap->value.number = (long)istrlen(ap->value.textW); } else { ap->value.number = a->att_def.number; } break; default: if ( a->islist ) { ap->value.textW = a->att_def.list; } else { ap->value.textW = (ichar*)a->att_def.name->name; } ap->value.number = (long)istrlen(ap->value.textW); } natts++; } } next:; } return natts; } static void free_attribute_values(int argc, sgml_attribute *argv) { int i; for(i=0; iflags & SGML_AT_DEFAULT) ) continue; /* shared with the DTD */ if ( argv->value.textW ) sgml_free(argv->value.textW); } } static int process_begin_element(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *id; const ichar *s; if ( (s=itake_name(dtd, decl, &id)) ) { sgml_attribute atts[MAXATTRIBUTES]; int natts; dtd_element *e = find_element(dtd, id); int empty = FALSE; int conref = FALSE; if ( !e->structure ) { dtd_edef *def; e->undefined = TRUE; STAT(edefs_implicit++); def_element(dtd, id); def = e->structure; def->type = C_EMPTY; } open_element(p, e, TRUE); decl=s; if ( (s=process_attributes(p, e, decl, atts, &natts)) ) decl=s; if ( dtd->dialect != DL_SGML ) { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) ) { empty = TRUE; /* XML */ decl = s; } #ifdef XMLNS if ( dtd->dialect == DL_XMLNS ) update_xmlns(p, e, natts, atts); #endif if ( dtd->dialect != DL_SGML ) update_space_mode(p, e, natts, atts); } else { int i; for(i=0; idef == AT_CONREF ) { empty = TRUE; conref = TRUE; } } } if ( *decl ) gripe(ERC_SYNTAX_ERROR, L"Bad attribute list", decl); if ( !(p->flags & SGML_PARSER_NODEFS) ) natts = add_default_attributes(p, e, natts, atts); if ( empty || (dtd->dialect == DL_SGML && e->structure && e->structure->type == C_EMPTY && !e->undefined) ) p->empty_element = e; else p->empty_element = NULL; if ( p->on_begin_element ) (*p->on_begin_element)(p, e, natts, atts); free_attribute_values(natts, atts); if ( p->empty_element ) { p->empty_element = NULL; close_element(p, e, conref); if ( conref ) /* might be S_CDATA due to declared content */ p->cdata_state = p->state = S_PCDATA; } return TRUE; } return gripe(ERC_SYNTAX_ERROR, L"Bad open-element tag", decl); } static int process_end_element(dtd_parser *p, const ichar *decl) { dtd *dtd = p->dtd; dtd_symbol *id; const ichar *s; emit_cdata(p, TRUE); if ( (s=itake_name(dtd, decl, &id)) && *s == '\0' ) return close_element(p, find_element(dtd, id), FALSE); if ( p->dtd->shorttag && *decl == '\0' ) /* : close current element */ return close_current_element(p); return gripe(ERC_SYNTAX_ERROR, L"Bad close-element tag", decl); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - process_net(dtd_parser *p) We've seen a / of a shorttag element. Close this one. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int process_net(dtd_parser *p) { sgml_environment *env; prepare_cdata(p); for(env = p->environments; env; env=env->parent) { if ( env->wants_net ) { sgml_environment *parent; pop_to(p, env, NULL); /* close parents */ validate_completeness(env); parent = env->parent; emit_cdata(p, TRUE); p->first = FALSE; if ( p->on_end_element ) { WITH_CLASS(p, EV_SHORTTAG, (*p->on_end_element)(p, env->element)); } free_environment(env); p->environments = parent; p->map = (parent ? parent->map : NULL); return TRUE; } } return FALSE; } static int /* */ process_doctype(dtd_parser *p, const ichar *decl, const ichar *decl0) { dtd *dtd = p->dtd; dtd_symbol *id; const ichar *s; dtd_entity *et = NULL; if ( !(s=itake_name(dtd, decl, &id)) ) return gripe(ERC_SYNTAX_ERROR, L"Name expected", decl); decl = s; if ( (s=isee_identifier(dtd, decl, "system")) ) { et = sgml_calloc(1, sizeof(*et)); et->type = ET_SYSTEM; decl = s; } else if ( (s=isee_identifier(dtd, decl, "public")) ) { et = sgml_calloc(1, sizeof(*et)); et->type = ET_PUBLIC; decl = s; } else if ( isee_func(dtd, decl, CF_DSO) ) goto local; if ( et ) { et->name = id; et->catalog_location = CAT_DOCTYPE; if ( !(s=process_entity_value_declaration(p, decl, et)) ) return FALSE; decl = s; } if ( !dtd->doctype ) /* i.e. anonymous DTD */ { ichar *file; dtd_parser *clone; dtd->doctype = istrdup(id->name); /* Fill it */ if ( et ) file = entity_file(dtd, et); else file = istrdup(find_in_catalogue(CAT_DOCTYPE, dtd->doctype, NULL, NULL, dtd->dialect != DL_SGML)); if ( !file ) { gripe(ERC_EXISTENCE, L"DTD", dtd->doctype); } else { clone = clone_dtd_parser(p); if ( !load_dtd_from_file(clone, file) ) gripe(ERC_EXISTENCE, L"file", file); free_dtd_parser(clone); sgml_free(file); } } if ( et ) free_entity_list(et); local: if ( (s=isee_func(dtd, decl, CF_DSO)) ) /* [...] */ { int grouplevel = 1; data_mode oldmode = p->dmode; dtdstate oldstate = p->state; locbuf oldloc; const ichar *q; icharbuf *saved_ibuf = p->buffer; push_location(p, &oldloc); /* try to find start-location. */ /* fails if there is comment before */ /* the []! */ sgml_cplocation(&p->location, &p->startloc); inc_location(&p->location, '<'); for(q=decl0; q < s; q++) inc_location(&p->location, *q); p->dmode = DM_DTD; p->state = S_PCDATA; p->buffer = new_icharbuf(); for( ; *s; s++ ) { if ( isee_func(dtd, s, CF_LIT) || /* skip quoted strings */ isee_func(dtd, s, CF_LITA) ) { ichar q = *s; putchar_dtd_parser(p, *s++); /* pass open quote */ for( ; *s && *s != q; s++ ) putchar_dtd_parser(p, *s); if ( *s == q ) /* pass closing quote */ putchar_dtd_parser(p, *s); continue; } if ( isee_func(dtd, s, CF_DSO) ) grouplevel++; else if ( isee_func(dtd, s, CF_DSC) && --grouplevel == 0 ) break; putchar_dtd_parser(p, *s); } p->dtd->implicit = FALSE; p->state = oldstate; p->dmode = oldmode; free_icharbuf(p->buffer); p->buffer = saved_ibuf; pop_location(p, &oldloc); } p->enforce_outer_element = id; /* make this the outer element */ return TRUE; } static void init_decoding(dtd_parser *p) { #ifdef UTF8 int decode; dtd *dtd = p->dtd; if ( dtd->encoding == SGML_ENC_UTF8 && p->encoded == TRUE ) decode = TRUE; else decode = FALSE; if ( p->utf8_decode != decode ) { DEBUG(fprintf(stderr, "%s UTF-8 decoding on %p\n", decode ? "Enable" : "Disable", p)); p->utf8_decode = decode; } #endif } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - xml_set_encoding() is the public interface to set the encoding for the parser. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int /* strcasecmp() with C locale */ posix_strcasecmp(const char *s1, const char *s2) { for(; *s1 && *s2; s1++, s2++) { int c1 = *s1&0xff; int c2 = *s2&0xff; if ( c1 >= 'A' && c1 <= 'Z' ) c1 += 'a'-'A'; if ( c2 >= 'A' && c2 <= 'Z' ) c2 += 'a'-'A'; if ( c1 != c2 ) return c1-c2; } return *s1 - *s2; } int xml_set_encoding(dtd_parser *p, const char *enc) { dtd *dtd = p->dtd; if ( posix_strcasecmp(enc, "iso-8859-1") == 0 ) { dtd->encoding = SGML_ENC_ISO_LATIN1; } else if ( posix_strcasecmp(enc, "us-ascii") == 0 ) { dtd->encoding = SGML_ENC_ISO_LATIN1; /* doesn't make a difference */ } else if ( posix_strcasecmp(enc, "utf-8") == 0 ) { dtd->encoding = SGML_ENC_UTF8; } else return FALSE; init_decoding(p); return TRUE; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - set_encoding() sets the encoding from the encoding="..." field of the XML header. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static void set_encoding(dtd_parser *p, const ichar *enc) { char buf[32]; char *e = buf+sizeof(buf)-1; char *o; const ichar *i; for(i=enc, o=buf; *i; ) { if ( *i < 128 && o < e ) { *o++ = (char)*i++; } else { goto error; } } *o = '\0'; if ( !xml_set_encoding(p, buf) ) { error: gripe(ERC_EXISTENCE, L"character encoding", enc); } } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Process Should deal with character encoding for XML documents. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int process_pi(dtd_parser *p, const ichar *decl) { const ichar *s; dtd *dtd = p->dtd; if ( (s=isee_identifier(dtd, decl, "xml")) ) /* */ { decl = s; switch(dtd->dialect) { case DL_SGML: set_dialect_dtd(dtd, DL_XML); break; case DL_XML: case DL_XMLNS: break; } while(*decl) { dtd_symbol *nm; if ( (s=itake_name(dtd, decl, &nm)) && (s=isee_func(dtd, s, CF_VI)) ) /* = */ { ichar *start; int len; ichar buf[MAXSTRINGLEN]; const ichar *end; if ( !(end=itake_string(dtd, s, &start, &len)) ) { end=itake_nmtoken_chars(dtd, s, buf, sizeof(buf)/sizeof(ichar)); start = buf; len = (int)istrlen(buf); } if ( end ) { decl = end; if ( istrcaseeq(nm->name, L"encoding") ) { ichar tmp[32]; if ( len < (int)(sizeof(tmp)/sizeof(ichar)-1) ) { istrncpy(tmp, start, len); tmp[len] = 0; set_encoding(p, tmp); } else { gripe(ERC_SYNTAX_ERROR, L"Unterminated encoding?", decl); } } /* fprintf(stderr, "XML %s = %s\n", nm->name, buf); */ continue; } } gripe(ERC_SYNTAX_ERROR, L"Illegal XML parameter", decl); break; } return TRUE; } if ( p->on_pi ) (*p->on_pi)(p, decl); return FALSE; /* Warn? */ } static int process_sgml_declaration(dtd_parser *p, const ichar *decl) { return gripe(ERC_SYNTAX_WARNING, L"Ignored declaration", NULL); } static int process_declaration(dtd_parser *p, const ichar *decl) { const ichar *s; dtd *dtd = p->dtd; if ( p->dmode != DM_DTD ) { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) ) /* */ { return process_end_element(p, s); } else if ( HasClass(dtd, *decl, CH_NAME) ) /* */ { decl = s; if ( p->on_decl ) (*p->on_decl)(p, decl); if ( (s = isee_identifier(dtd, decl, "entity")) ) process_entity_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "element")) ) process_element_declaraction(p, s); else if ( (s = isee_identifier(dtd, decl, "attlist")) ) process_attlist_declaraction(p, s); else if ( (s = isee_identifier(dtd, decl, "notation")) ) process_notation_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "shortref")) ) process_shortref_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "usemap")) ) process_usemap_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "sgml")) ) process_sgml_declaration(p, s); else if ( (s = isee_identifier(dtd, decl, "doctype")) ) { if ( p->dmode != DM_DTD ) process_doctype(p, s, decl-1); } else { s = iskip_layout(dtd, decl); if ( *s ) gripe(ERC_SYNTAX_ERROR, L"Invalid declaration", s); } return TRUE; } return gripe(ERC_SYNTAX_ERROR, L"Invalid declaration", decl); } /******************************* * STREAM BINDING * *******************************/ static dtd_parser *current_parser; /* For gripes */ void set_file_dtd_parser(dtd_parser *p, input_type type, const ichar *name) { p->location.type = type; p->location.name.file = name; p->location.line = 1; p->location.linepos = 0; p->location.charpos = 0; } static void set_src_dtd_parser(dtd_parser *p, input_type type, const ichar *name) { p->location.type = type; p->location.name.entity = name; p->location.line = 1; p->location.linepos = 0; p->location.charpos = 0; } void set_mode_dtd_parser(dtd_parser *p, data_mode m) { p->dmode = m; /* DM_DTD or DM_DATA */ p->state = S_PCDATA; p->blank_cdata = TRUE; } dtd_parser * new_dtd_parser(dtd *dtd) { dtd_parser *p = sgml_calloc(1, sizeof(*p)); if ( !dtd ) dtd = new_dtd(NULL); dtd->references++; p->magic = SGML_PARSER_MAGIC; p->dtd = dtd; p->state = S_PCDATA; p->mark_state = MS_INCLUDE; p->dmode = DM_DTD; p->encoded = TRUE; /* encoded octet stream */ p->buffer = new_icharbuf(); p->cdata = new_ocharbuf(); p->event_class = EV_EXPLICIT; set_src_dtd_parser(p, IN_NONE, NULL); return p; } static dtd_parser * clone_dtd_parser(dtd_parser *p) { dtd_parser *clone = sgml_calloc(1, sizeof(*p)); *clone = *p; clone->dtd->references++; clone->environments = NULL; clone->marked = NULL; clone->etag = NULL; clone->grouplevel = 0; clone->state = S_PCDATA; clone->mark_state = MS_INCLUDE; clone->dmode = DM_DTD; clone->buffer = new_icharbuf(); clone->cdata = new_ocharbuf(); return clone; } void free_dtd_parser(dtd_parser *p) { free_icharbuf(p->buffer); free_ocharbuf(p->cdata); free_dtd(p->dtd); sgml_free(p); } static int process_chars(dtd_parser *p, input_type in, const ichar *name, const ichar *s) { locbuf old; push_location(p, &old); set_src_dtd_parser(p, in, name); empty_icharbuf(p->buffer); /* dubious */ for(; *s; s++) putchar_dtd_parser(p, *s); pop_location(p, &old); return TRUE; } static int process_include(dtd_parser *p, const ichar *entity_name) { dtd_symbol *id; dtd_entity *pe; dtd *dtd = p->dtd; if ( (id=dtd_find_entity_symbol(dtd, entity_name)) && (pe=find_pentity(p->dtd, id)) ) { ichar *file; if ( (file = entity_file(dtd, pe)) ) { int rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT); sgml_free(file); return rc; } else { const ichar *text = entity_value(p, pe, NULL); if ( !text ) return gripe(ERC_NO_VALUE, pe->name->name); return process_chars(p, IN_ENTITY, entity_name, text); } } return gripe(ERC_EXISTENCE, L"parameter entity", entity_name); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Process mark_state according to KEYWORD. Processes the rest in normal S_PCDATA style, which pops the mark-stack on seeing ]]> For the purpose of we switch to S_GROUP if - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static void process_marked_section(dtd_parser *p) { ichar buf[MAXDECL]; dtd *dtd = p->dtd; const ichar *decl = p->buffer->data; const ichar *s; if ( (decl=isee_func(dtd, decl, CF_MDO2)) && /* ! */ (decl=isee_func(dtd, decl, CF_DSO)) && /* [ */ expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) ) { dtd_symbol *kwd; decl = buf; if ( (s=itake_name(dtd, decl, &kwd)) && isee_func(dtd, s, CF_DSO) ) /* [ */ { dtd_marked *m = sgml_calloc(1, sizeof(*m)); m->keyword = kwd; /* push on the stack */ m->parent = p->marked; p->marked = m; if ( istrcaseeq(kwd->name, L"IGNORE") ) m->type = MS_IGNORE; else if ( istrcaseeq(kwd->name, L"INCLUDE") ) m->type = MS_INCLUDE; else if ( istrcaseeq(kwd->name, L"TEMP") ) m->type = MS_INCLUDE; else if ( istrcaseeq(kwd->name, L"CDATA") ) m->type = MS_CDATA; else if ( istrcaseeq(kwd->name, L"RCDATA") ) m->type = MS_RCDATA; else m->type = MS_INCLUDE; /* default */ empty_icharbuf(p->buffer); if ( m->type == MS_CDATA ) p->state = S_MSCDATA; else p->state = S_PCDATA; if ( p->mark_state != MS_IGNORE ) p->mark_state = m->type; } } else { decl = p->buffer->data; if ( (decl=isee_func(dtd, decl, CF_MDO2)) && /* ! */ !isee_func(dtd, decl, CF_DSO) ) /* [ */ { p->state = S_GROUP; p->grouplevel = 1; } } } static void pop_marked_section(dtd_parser *p) { dtd_marked *m = p->marked; if ( m ) { p->marked = m->parent; sgml_free(m); p->mark_state = (p->marked ? p->marked->type : MS_INCLUDE); } } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Update the space-mode for the current element. The space mode defines how spaces are handled in the CDATA output. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static dtd_space_mode istr_to_space_mode(const ichar *val) { if ( istreq(val, L"default") ) return SP_DEFAULT; if ( istreq(val, L"preserve") ) return SP_PRESERVE; if ( istreq(val, L"sgml") ) return SP_SGML; if ( istreq(val, L"remove") ) return SP_REMOVE; return SP_INHERIT; /* interpret as error */ } static void update_space_mode(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts) { for( ; natts-- > 0; atts++ ) { const ichar *name = atts->definition->name->name; if ( istreq(name, L"xml:space") && atts->definition->type == AT_CDATA && atts->value.textW ) { dtd_space_mode m = istr_to_space_mode(atts->value.textW); if ( m != SP_INHERIT ) p->environments->space_mode = m; else gripe(ERC_EXISTENCE, L"xml:space-mode", atts->value.textW); return; } } if ( e->space_mode != SP_INHERIT ) p->environments->space_mode = e->space_mode; } static void empty_cdata(dtd_parser *p) { if ( p->dmode == DM_DATA ) { empty_ocharbuf(p->cdata); p->blank_cdata = TRUE; p->cdata_must_be_empty = FALSE; } } static void cb_cdata(dtd_parser *p, ocharbuf *buf, int offset, int size) { if ( p->on_data ) (*p->on_data)(p, EC_CDATA, size, buf->data.w+offset); } static int emit_cdata(dtd_parser *p, int last) { dtd *dtd = p->dtd; locbuf locsafe; ocharbuf *cdata = p->cdata; int offset = 0; int size = cdata->size; if ( size == 0 ) return TRUE; /* empty or done */ push_location(p, &locsafe); sgml_cplocation(&p->location, &p->startloc); /* start of markup */ sgml_cplocation(&p->startloc, &p->startcdata); /* real start of CDATA */ if ( p->environments ) { switch(p->environments->space_mode) { case SP_SGML: case SP_DEFAULT: if ( p->first ) { wint_t c = fetch_ocharbuf(cdata, offset); if ( HasClass(dtd, c, CH_RE) ) { inc_location(&p->startloc, c); offset++; size--; c = fetch_ocharbuf(cdata, offset); } if ( HasClass(dtd, c, CH_RS) ) { inc_location(&p->startloc, c); offset++; size--; } } if ( last && size > 0 ) { wint_t c = fetch_ocharbuf(cdata, offset+size-1); if ( HasClass(dtd, c, CH_RS) ) { dec_location(&p->location, c); size--; poke_ocharbuf(cdata, offset+size, '\0'); if ( size > 0 ) c = fetch_ocharbuf(cdata, offset+size-1); else c = 0; /* HasClass(CH_RE) must fail */ } if ( HasClass(dtd, c, CH_RE) ) { dec_location(&p->location, c); size--; poke_ocharbuf(cdata, offset+size, '\0'); } } if ( p->environments->space_mode == SP_DEFAULT ) { int o = 0; int i; for(i=0; istartloc, c); else break; } if ( i 0); if ( !p->blank_cdata ) { if ( p->cdata_must_be_empty ) { gripe(ERC_NOT_ALLOWED_PCDATA, p->cdata); /* TBD: now passes buffer! */ } cb_cdata(p, cdata, offset, size); } else if ( p->environments ) { sgml_environment *env = p->environments; dtd_state *new; /* If an element is not in the DTD we must */ /* assume mixed content and emit spaces */ if ( (new=make_dtd_transition(env->state, CDATA_ELEMENT)) ) { env->state = new; cb_cdata(p, cdata, offset, size); } else if ( env->element->undefined && p->environments->space_mode == SP_PRESERVE ) { cb_cdata(p, cdata, offset, size); } } pop_location(p, &locsafe); empty_cdata(p); return TRUE; } static int prepare_cdata(dtd_parser *p) { if ( p->cdata->size == 0 ) return TRUE; terminate_ocharbuf(p->cdata); if ( p->mark_state == MS_INCLUDE ) { dtd *dtd = p->dtd; if ( p->environments ) /* needed for */ { dtd_element *e = p->environments->element; if ( e->structure && e->structure->type == C_EMPTY && !e->undefined ) close_element(p, e, FALSE); } if ( p->blank_cdata == TRUE ) { int blank = TRUE; int i; for(i=0; icdata->size; i++) { wint_t c = fetch_ocharbuf(p->cdata, i); if ( !HasClass(dtd, c, CH_BLANK) ) { blank = FALSE; break; } } p->blank_cdata = blank; if ( !blank ) { if ( p->dmode == DM_DTD ) gripe(ERC_SYNTAX_ERROR, L"CDATA in DTD", p->cdata->data); else open_element(p, CDATA_ELEMENT, TRUE); } } } return TRUE; } static int process_cdata(dtd_parser *p, int last) { int rc; WITH_PARSER(p, (prepare_cdata(p), rc=emit_cdata(p, last))); return rc; } static int process_entity(dtd_parser *p, const ichar *name) { if ( name[0] == '#' ) /* #charcode: character entity */ { int v = char_entity_value(name); if ( v <= 0 ) return gripe(ERC_SYNTAX_ERROR, L"Bad character entity", name); add_ocharbuf(p->cdata, v); } else { dtd_symbol *id; dtd_entity *e; dtd *dtd = p->dtd; int len; const ichar *text; const ichar *s; int chr; ichar *file; if ( !(id=dtd_find_entity_symbol(dtd, name)) || !(e=id->entity) ) { if ( dtd->default_entity ) e = dtd->default_entity; else return gripe(ERC_EXISTENCE, L"entity", name); } if ( !e->value && e->content == EC_SGML && (file=entity_file(p->dtd, e)) ) { int rc; empty_icharbuf(p->buffer); /* dubious */ rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT); sgml_free(file); return rc; } if ( !(text = entity_value(p, e, &len)) ) return gripe(ERC_NO_VALUE, e->name->name); switch ( e->content ) { case EC_SGML: case EC_CDATA: if ( (s=isee_character_entity(dtd, text, &chr)) && *s == '\0' ) { if ( chr == 0 ) return gripe(ERC_SYNTAX_ERROR, L"Illegal character entity", text); if ( p->blank_cdata == TRUE && !HasClass(dtd, (wint_t)chr, CH_BLANK) ) { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); p->blank_cdata = FALSE; } add_ocharbuf(p->cdata, chr); return TRUE; } if ( e->content == EC_SGML ) { locbuf oldloc; int decode = p->utf8_decode; push_location(p, &oldloc); p->utf8_decode = FALSE; set_src_dtd_parser(p, IN_ENTITY, e->name->name); empty_icharbuf(p->buffer); /* dubious */ for(s=text; *s; s++) putchar_dtd_parser(p, *s); p->utf8_decode = decode; pop_location(p, &oldloc); } else if ( *text ) { const ichar *o; if ( p->blank_cdata == TRUE ) { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); p->blank_cdata = FALSE; } for(o=text; *o; o++) add_ocharbuf(p->cdata, *o); } break; case EC_SDATA: case EC_NDATA: process_cdata(p, FALSE); if ( p->on_data ) (*p->on_data)(p, e->content, len, text); break; case EC_PI: process_cdata(p, FALSE); if ( p->on_pi ) (*p->on_pi)(p, text); case EC_STARTTAG: #if 0 prepare_cdata(p); process_begin_element(p, text); #endif break; case EC_ENDTAG: #if 0 prepare_cdata(p); process_end_element(p, text); #endif break; } return TRUE; } return TRUE; } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Deal with end of input. We should give a proper error message depending on the state and the start-location of the error. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static int end_document_dtd_parser_(dtd_parser *p) { int rval; switch(p->state) { case S_RCDATA: case S_CDATA: case S_PCDATA: rval = TRUE; break; case S_CMT: case S_CMT1: case S_CMTE0: case S_CMTE1: case S_DECLCMT0: case S_DECLCMT: case S_DECLCMTE0: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in comment", L""); break; case S_ECDATA1: case S_ECDATA2: case S_EMSC1: case S_EMSC2: case S_DECL0: case S_DECL: case S_MDECL0: case S_STRING: case S_CMTO: case S_GROUP: case S_PENT: case S_ENT: case S_ENT0: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file", L""); break; #ifdef UTF8 case S_UTF8: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in UTF-8 sequence", L""); break; #endif case S_MSCDATA: case S_EMSCDATA1: case S_EMSCDATA2: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in CDATA marked section", L""); break; case S_PI: case S_PI2: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in processing instruction", L""); break; default: rval = gripe(ERC_SYNTAX_ERROR, L"Unexpected end-of-file in ???"); break; } if ( p->dmode == DM_DATA ) { sgml_environment *env; if ( p->cdata->size > 0 && fetch_ocharbuf(p->cdata, p->cdata->size-1) == CR ) del_ocharbuf(p->cdata); process_cdata(p, TRUE); if ( (env=p->environments) ) { dtd_element *e; while(env->parent) env = env->parent; pop_to(p, env, CDATA_ELEMENT); e = env->element; if ( e->structure && !e->structure->omit_close ) gripe(ERC_OMITTED_CLOSE, e->name->name); close_element(p, e, FALSE); } } return rval; } int end_document_dtd_parser(dtd_parser *p) { int rval; WITH_PARSER(p, rval = end_document_dtd_parser_(p)); return rval; } int begin_document_dtd_parser(dtd_parser *p) { init_decoding(p); return TRUE; } void reset_document_dtd_parser(dtd_parser *p) { if ( p->environments ) { sgml_environment *env, *parent; for(env = p->environments; env; env=parent) { parent = env->parent; free_environment(env); } p->environments = NULL; } while(p->marked) pop_marked_section(p); empty_icharbuf(p->buffer); empty_ocharbuf(p->cdata); p->mark_state = MS_INCLUDE; p->state = S_PCDATA; p->grouplevel = 0; p->blank_cdata = TRUE; p->event_class = EV_EXPLICIT; p->dmode = DM_DATA; begin_document_dtd_parser(p); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Set the UTF-8 state - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #ifdef UTF8 static void process_utf8(dtd_parser *p, int chr) { int bytes; int mask; for( bytes=1, mask=0x20; chr&mask; bytes++, mask >>= 1 ) ; mask--; /* 0x20 --> 0x1f */ p->utf8_saved_state = p->state; /* state to return to */ p->state = S_UTF8; p->utf8_char = chr & mask; p->utf8_left = bytes; } #endif /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - add_cdata() adds a character to the output data. It also maps \r\n onto a single \n for Windows newline conventions. There is a problem here in shortref handling. We open the CDATA_ELEMENT as soon as we find a character as this may open other elements through omitted tags and thus install a new shortref map. If, at a later stage, all CDATA read sofar turns out to be a shortref we have incorrectly opened the CDATA_ELEMENT. As `undoing' the open_element() is not an option (it may already have caused `events' on omitted tags) we are in trouble. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ static void add_cdata(dtd_parser *p, int chr) { if ( p->mark_state == MS_INCLUDE ) { ocharbuf *buf = p->cdata; if ( p->blank_cdata == TRUE && !HasClass(p->dtd, (wint_t)chr, CH_BLANK) ) { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); p->blank_cdata = FALSE; } if ( chr == '\n' ) /* insert missing CR */ { int sz; if ( (sz=buf->size) == 0 || fetch_ocharbuf(buf, sz-1) != CR ) add_cdata(p, CR); } add_ocharbuf(buf, chr); if ( p->map && chr <= 0xff && p->map->ends[chr] && match_shortref(p) ) return; if ( chr == '\n' ) /* dubious. Whould we do that */ { int sz; /* here or in space-handling? */ if ( (sz=buf->size) > 1 && fetch_ocharbuf(buf, sz-1) == LF && fetch_ocharbuf(buf, sz-2) == CR ) { poke_ocharbuf(buf, sz-2, LF); buf->size--; } } } } static void add_verbatim_cdata(dtd_parser *p, int chr) { if ( p->mark_state != MS_IGNORE ) { ocharbuf *buf = p->cdata; if ( p->blank_cdata == TRUE && !HasClass(p->dtd, (wint_t)chr, CH_BLANK) ) { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE); p->blank_cdata = FALSE; } if ( chr == '\n' && buf->size > 0 && fetch_ocharbuf(buf, buf->size-1) == '\r' ) buf->size--; add_ocharbuf(buf, chr); } } /* We discovered illegal markup and now process it as normal CDATA */ static void recover_parser(dtd_parser *p) { const ichar *s; terminate_icharbuf(p->buffer); add_cdata(p, p->saved); for(s=p->buffer->data; *s; s++) add_cdata(p, *s); p->state = S_PCDATA; } static inline void setlocation(dtd_srcloc *d, dtd_srcloc *loc, int line, int lpos) { d->line = line; d->linepos = lpos; d->charpos = loc->charpos - 1; d->type = loc->type; d->name = loc->name; } void putchar_dtd_parser(dtd_parser *p, int chr) { dtd *dtd = p->dtd; const ichar *f = dtd->charfunc->func; int line = p->location.line; int lpos = p->location.linepos; p->location.charpos++; /* TBD: actually `bytepos' */ #ifdef UTF8 if ( p->state == S_UTF8 ) { if ( (chr & 0xc0) != 0x80 ) /* TBD: recover */ gripe(ERC_SYNTAX_ERROR, L"Bad UTF-8 sequence", L""); p->utf8_char <<= 6; p->utf8_char |= (chr & ~0xc0); if ( --p->utf8_left == 0 ) { chr = p->utf8_char; p->state = p->utf8_saved_state; } else { return; } } else if ( ISUTF8_MB(chr) && p->utf8_decode ) { process_utf8(p, chr); return; } #endif if ( f[CF_RS] == chr ) { p->location.line++; p->location.linepos = 0; } else { if ( f[CF_RE] == chr ) p->location.linepos = 0; else p->location.linepos++; } reprocess: switch(p->state) { case S_PCDATA: { if ( f[CF_MDO1] == chr ) /* < */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_DECL0; empty_icharbuf(p->buffer); return; } if ( p->dmode == DM_DTD ) { if ( f[CF_PERO] == chr ) /* % */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_PENT; return; } } else { if ( f[CF_ERO] == chr ) /* & */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_ENT0; return; } } if ( p->marked && f[CF_DSC] == chr ) /* ] in marked section */ { empty_icharbuf(p->buffer); p->state = S_EMSC1; p->saved = chr; /* for recovery */ return; } if ( p->waiting_for_net && f[CF_ETAGO2] == chr ) /* shorttag */ { setlocation(&p->startloc, &p->location, line, lpos); WITH_PARSER(p, process_net(p)); return; } /* Real character data */ if ( p->cdata->size == 0 ) setlocation(&p->startcdata, &p->location, line, lpos); add_cdata(p, chr); return; } case S_ECDATA2: /* Seen etaglen == p->buffer->size && istrncaseeq(p->buffer->data, p->etag, p->etaglen) ) { p->cdata->size -= p->etaglen+2; /* 2 for cdata); terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_cdata(p, TRUE); process_end_element(p, p->buffer->data)); empty_cdata(p); } empty_icharbuf(p->buffer); p->cdata_state = p->state = S_PCDATA; } else { add_verbatim_cdata(p, chr); if ( p->etaglen < p->buffer->size || !HasClass(dtd, (wint_t)chr, CH_NAME)) { empty_icharbuf(p->buffer); /* mismatch */ p->state = p->cdata_state; } else add_icharbuf(p->buffer, chr); } return; } case S_ECDATA1: /* seen < in CDATA */ { add_verbatim_cdata(p, chr); if ( f[CF_ETAGO2] == chr ) /* / */ { empty_icharbuf(p->buffer); p->state = S_ECDATA2; } else if ( f[CF_ETAGO1] != chr ) /* <: do not change state */ p->state = p->cdata_state; return; } case S_RCDATA: { if ( f[CF_ERO] == chr ) /* & */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_ENT0; return; } /*FALLTHROUGH*/ } case S_CDATA: { add_verbatim_cdata(p, chr); if ( f[CF_MDO1] == chr ) /* < */ { setlocation(&p->startloc, &p->location, line, lpos); p->state = S_ECDATA1; } /* / in CDATA shorttag element */ if ( p->waiting_for_net && f[CF_ETAGO2] == chr ) { setlocation(&p->startloc, &p->location, line, lpos); p->cdata->size--; terminate_ocharbuf(p->cdata); terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_cdata(p, TRUE); process_net(p)); empty_cdata(p); } empty_icharbuf(p->buffer); p->cdata_state = p->state = S_PCDATA; } return; } case S_MSCDATA: { add_verbatim_cdata(p, chr); if ( f[CF_DSC] == chr ) /* ] */ p->state = S_EMSCDATA1; return; } case S_EMSCDATA1: { add_verbatim_cdata(p, chr); if ( f[CF_DSC] == chr ) /* ]] */ p->state = S_EMSCDATA2; else p->state = S_MSCDATA; return; } case S_EMSCDATA2: { add_verbatim_cdata(p, chr); if ( f[CF_MDC] == chr ) /* ]]> */ { p->cdata->size -= 3; /* Delete chars for ]] */ pop_marked_section(p); p->state = S_PCDATA; } else if ( f[CF_DSC] != chr ) /* if ]]], stay in this state */ p->state = S_MSCDATA; return; } case S_EMSC1: { if ( f[CF_DSC] == chr ) /* ]] in marked section */ { p->state = S_EMSC2; return; } else { add_icharbuf(p->buffer, chr); recover_parser(p); return; } } case S_EMSC2: { if ( f[CF_MDC] == chr ) /* ]]> in marked section */ { pop_marked_section(p); p->state = S_PCDATA; return; } else { add_icharbuf(p->buffer, chr); recover_parser(p); return; } } case S_PENT: /* %parameter entity; */ { if ( f[CF_ERC] == chr ) { p->state = S_PCDATA; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_include(p, p->buffer->data)); } empty_icharbuf(p->buffer); return; } if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) { add_icharbuf(p->buffer, chr); return; } terminate_icharbuf(p->buffer); gripe(ERC_SYNTAX_ERROR, L"Illegal parameter entity", p->buffer->data); break; } case S_ENT0: /* Seen & */ { if ( chr == '#' || HasClass(dtd, (wint_t)chr, CH_NAME) ) { empty_icharbuf(p->buffer); add_icharbuf(p->buffer, chr); p->state = S_ENT; } else { if ( dtd->dialect != DL_SGML ) { wchar_t buf[3]; buf[0] = '&'; buf[1] = chr; buf[2] = '\0'; gripe(ERC_SYNTAX_ERROR, L"Illegal entity", buf); } add_cdata(p, f[CF_ERO]); p->state = p->cdata_state; goto reprocess; } return; } case S_ENT: /* &entity; */ { if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) { add_icharbuf(p->buffer, chr); return; } terminate_icharbuf(p->buffer); p->state = p->cdata_state; if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_entity(p, p->buffer->data)); } empty_icharbuf(p->buffer); if ( chr == CR ) p->state = S_ENTCR; else if ( f[CF_ERC] != chr && chr != '\n' ) goto reprocess; break; } case S_ENTCR: /* seen &entCR, eat the LF */ { p->state = p->cdata_state; if ( chr != LF ) goto reprocess; break; } case S_DECL0: /* Seen < */ { if ( f[CF_ETAGO2] == chr ) /* buffer, chr); p->state = S_DECL; } else if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) /* buffer, chr); p->state = S_DECL; } else if ( f[CF_MDO2] == chr ) /* state = S_MDECL0; } else if ( f[CF_PRO2] == chr ) /* state = S_PI; } else /* recover */ { add_cdata(p, f[CF_MDO1]); add_cdata(p, chr); p->state = S_PCDATA; } return; } case S_MDECL0: /* Seen state = S_CMTO; return; } add_icharbuf(p->buffer, f[CF_MDO2]); add_icharbuf(p->buffer, chr); p->state = S_DECL; return; } case S_DECL: /* <...> */ { if ( f[CF_MDC] == chr ) /* > */ { prepare_cdata(p); p->state = S_PCDATA; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_declaration(p, p->buffer->data)); } empty_icharbuf(p->buffer); return; } if ( dtd->shorttag && f[CF_ETAGO2] == chr && p->buffer->size > 0 ) { prepare_cdata(p); p->state = S_PCDATA; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_CLASS(p, EV_SHORTTAG, WITH_PARSER(p, process_declaration(p, p->buffer->data))); } empty_icharbuf(p->buffer); p->waiting_for_net = TRUE; return; } add_icharbuf(p->buffer, chr); if ( f[CF_LIT] == chr ) /* " */ { p->state = S_STRING; p->saved = chr; p->lit_saved_state = S_DECL; } else if ( f[CF_LITA] == chr ) /* ' */ { p->state = S_STRING; p->saved = chr; p->lit_saved_state = S_DECL; return; } else if ( f[CF_CMT] == chr && /* - */ p->buffer->data[0] == f[CF_MDO2] ) /* Started state = S_DECLCMT0; } else if ( f[CF_DSO] == chr ) /* [: marked section */ { terminate_icharbuf(p->buffer); process_marked_section(p); } break; } case S_DECLCMT0: /* <...- */ { if ( f[CF_CMT] == chr ) { p->buffer->size--; p->state = S_DECLCMT; } else { add_icharbuf(p->buffer, chr); p->state = S_DECL; } break; } case S_DECLCMT: /* <...--.. */ { if ( f[CF_CMT] == chr ) p->state = S_DECLCMTE0; break; } case S_DECLCMTE0: /* <...--..- */ { if ( f[CF_CMT] == chr ) p->state = S_DECL; else p->state = S_DECLCMT; break; } case S_PI: { add_icharbuf(p->buffer, chr); if ( f[CF_PRO2] == chr ) /* state = S_PI2; if ( f[CF_PRC] == chr ) /* no ? is ok too (XML/SGML) */ goto pi; return; } case S_PI2: { if ( f[CF_PRC] == chr ) { pi: process_cdata(p, FALSE); p->state = S_PCDATA; p->buffer->size--; terminate_icharbuf(p->buffer); if ( p->mark_state == MS_INCLUDE ) { WITH_PARSER(p, process_pi(p, p->buffer->data)); } empty_icharbuf(p->buffer); return; } add_icharbuf(p->buffer, chr); p->state = S_PI; return; } case S_STRING: { add_icharbuf(p->buffer, chr); if ( chr == p->saved ) p->state = p->lit_saved_state; break; } case S_CMTO: /* Seen state = S_CMT1; return; } else { add_cdata(p, f[CF_MDO1]); add_cdata(p, f[CF_MDO2]); add_cdata(p, f[CF_CMT]); add_cdata(p, chr); p->state = S_PCDATA; return; } } case S_CMT1: /*