This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/sgml/parser.c

5599 lines
127 KiB
C
Raw Normal View History

/* $Id$
Part of SWI-Prolog
Author: Jan Wielemaker
E-mail: wielemak@science.uva.nl
WWW: http://www.swi-prolog.org
Copyright (C): 1985-2006, University of Amsterdam
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */
#define DTD_IMPLEMENTATION 1
#include <stdio.h>
#include <wchar.h>
#include "dtd.h"
#include "model.h"
#include "util.h"
#include "catalog.h"
#include "parser.h"
#include <stdlib.h>
#include <assert.h>
#include <stdarg.h>
#include <ctype.h>
#include <string.h>
#include "utf8.h"
#include <errno.h>
#include <wctype.h>
#include "xml_unicode.h"
#define DEBUG(g) ((void)0)
#define ZERO_TERM_LEN (-1) /* terminated by nul */
#ifdef __WINDOWS__
#define inline __inline
#define swprintf _snwprintf
#endif
/*******************************
* LOCAL TYPES *
*******************************/
/* Snapshot of the parser's two source locations.  Filled by
   push_location() and copied back by pop_location(); while pushed, the
   live locations point at these copies through their `parent' field. */
typedef struct locbuf
{ dtd_srcloc start; /* p->startloc */
dtd_srcloc here; /* p->location */
} locbuf;
/*******************************
* PROTOTYPES *
*******************************/
2010-05-06 10:59:09 +01:00
static const ichar * itake_name(dtd_parser *p,
const ichar *in, dtd_symbol **id);
static const ichar * itake_entity_name(dtd_parser *p, const ichar *in,
dtd_symbol **id);
2010-05-06 10:59:09 +01:00
static const ichar * itake_namegroup(dtd_parser *p, const ichar *decl,
dtd_symbol **names, int *n);
static const ichar * iskip_layout(dtd *dtd, const ichar *in);
static dtd_parser * clone_dtd_parser(dtd_parser *p);
static void free_model(dtd_model *m);
static int process_entity_declaration(dtd_parser *p,
const ichar *decl);
static void free_notations(dtd_notation *n);
static void free_shortrefs(dtd_shortref *sr);
static int process_cdata(dtd_parser *p, int last);
static int process_entity(dtd_parser *p, const ichar *name);
static int emit_cdata(dtd_parser *p, int last);
static dtd_space_mode istr_to_space_mode(const ichar *val);
static void update_space_mode(dtd_parser *p, dtd_element *e,
int natts, sgml_attribute *atts);
2010-05-06 10:59:09 +01:00
static dtd_model * make_model(dtd_parser *p, const ichar *decl,
const ichar **end);
static void for_elements_in_model(dtd_model *m,
void (*f)(dtd_element *e,
void *closure),
void *closure);
2010-05-06 10:59:09 +01:00
int putchar_dtd_parser(dtd_parser *p, int chr);
void free_dtd_parser(dtd_parser *p);
static const ichar * isee_character_entity(dtd *dtd, const ichar *in,
int *chr);
static int add_default_attributes(dtd_parser *p, dtd_element *e,
int natts,
sgml_attribute *atts);
static int prepare_cdata(dtd_parser *p);
/*******************************
* MACROS *
*******************************/
/* Run statement(s) `g' with p->event_class temporarily set to `c'; the
   previous class is restored afterwards.  Note that `p' is evaluated
   more than once and that an early exit from `g' (return/goto) skips
   the restore. */
#define WITH_CLASS(p, c, g) \
{ sgml_event_class _oc = p->event_class; \
p->event_class = c; \
g; \
p->event_class = _oc; \
}
/*******************************
* STATISTICS *
*******************************/
/* Optional allocation counters, compiled in only when O_STATISTICS is
   defined.  STAT(g) evaluates `g' in that case and is a no-op
   otherwise. */
#ifdef O_STATISTICS
int edefs_created = 0;
int edefs_freed = 0;
/* NOTE(review): the three counters below are not updated in the part
   of the file visible here; presumably maintained elsewhere. */
int edefs_implicit = 0;
int edefs_atts = 0;
int edefs_decl = 0;
int dtd_created = 0;
int dtd_freed = 0;
/* Print all counters to stderr. */
void
sgml_statistics(void)
{ fprintf(stderr, "EDEFS: created %d; freed %d\n", edefs_created, edefs_freed);
fprintf(stderr, "EDEFS: implicit %d; atts %d; decl %d\n",
edefs_implicit, edefs_atts, edefs_decl);
fprintf(stderr, "DTDs: created: %d; freed: %d\n", dtd_created, dtd_freed);
}
#define STAT(g) g
#else
#define STAT(g) ((void)0)
#endif
/*******************************
* SRC LOCATION *
*******************************/
/* Copy the parser's start and current locations into `save' and link
   the live locations to those copies through their `parent' field.
   TBD: also handle startloc */
static void
push_location(dtd_parser *p, locbuf *save)
{ save->start = p->startloc;
  save->here  = p->location;

  p->startloc.parent = &save->start;
  p->location.parent = &save->here;
}
/* Restore the locations previously stored by push_location(). */
static void
pop_location(dtd_parser *p, locbuf *saved)
{ p->startloc = saved->start;
  p->location = saved->here;
}
/* Copy the position fields of `loc' into `d'.  The `parent' link of
   `d' is deliberately left as it was. */
static inline void
_sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc)
{ d->charpos   = loc->charpos;
  d->linepos   = loc->linepos;
  d->line      = loc->line;
  d->name.file = loc->name.file;
  d->type      = loc->type;
}
/* Externally visible version of _sgml_cplocation(): copies everything
   except the parent link from `loc' to `d'. */
void
sgml_cplocation(dtd_srcloc *d, dtd_srcloc *loc)
{ _sgml_cplocation(d, loc);
}
/* From here on, uses of sgml_cplocation() in this file expand to the
   inline helper. */
#define sgml_cplocation(d,s) _sgml_cplocation(d, s)
/* Advance location `l' over character `chr'.  A newline bumps the line
   counter and restarts the line position (which ends up at 1 after the
   unconditional increment). */
static void
inc_location(dtd_srcloc *l, int chr)
{ if ( chr == '\n' )
  { l->line++;
    l->linepos = 0;
  }

  l->charpos++;
  l->linepos++;
}
/* Move location `l' back over character `chr'.  When stepping back
   over a newline the real column of the previous line is unknown, so
   the line position is set to a placeholder (2, becoming 1 after the
   decrement below). */
static void
dec_location(dtd_srcloc *l, int chr)
{ if ( chr == '\n' )
  { l->line--;
    l->linepos = 2;			/* not good! */
  }

  l->charpos--;
  l->linepos--;
}
/*******************************
* CLASSIFICATION PRIMITIVES *
*******************************/
/* Test whether character `chr' belongs to character class `mask'.
   Codes up to 0xff are looked up in the DTD's class table; larger
   codes are classified with the xml_*() predicates and iswspace().
   Returns non-zero when the character is in the class.  For codes
   above 0xff `mask' must be exactly one of the classes handled below. */
static inline int
HasClass(dtd *dtd, wint_t chr, int mask)
{ if ( chr > 0xff )
  { if ( mask == CH_NAME )
      return ( xml_basechar(chr) ||
	       xml_digit(chr) ||
	       xml_ideographic(chr) ||
	       xml_combining_char(chr) ||
	       xml_extender(chr)
	     );
    if ( mask == CH_NMSTART )
      return ( xml_basechar(chr) ||
	       xml_ideographic(chr) );
    if ( mask == CH_WHITE )
      return FALSE;			/* only ' ' and '\t' */
    if ( mask == CH_BLANK )
      return iswspace(chr);
    if ( mask == CH_DIGIT )
      return xml_digit(chr);
    if ( mask == CH_RS || mask == CH_RE )
      return FALSE;

    assert(0);
    return FALSE;
  }

  return (dtd->charclass->class[(chr)] & (mask));
}
/* If the character at `in' is the one the DTD assigns to function
   `func', return the position just after it; otherwise return NULL. */
static const ichar *
isee_func(dtd *dtd, const ichar *in, charfunc func)
{ return ( dtd->charfunc->func[func] == *in ) ? in+1 : NULL;
}
/*******************************
* SYMBOLS *
*******************************/
static dtd_symbol_table *
2010-05-06 10:59:09 +01:00
new_symbol_table()
{ dtd_symbol_table *t = sgml_calloc(1, sizeof(*t));
t->size = SYMBOLHASHSIZE;
t->entries = sgml_calloc(t->size, sizeof(dtd_symbol*));
return t;
}
/* Release every symbol (including its name string) in each bucket,
   then the bucket array and the table itself. */
static void
free_symbol_table(dtd_symbol_table *t)
{ int bucket = 0;

  while ( bucket < t->size )
  { dtd_symbol *sym = t->entries[bucket];

    while ( sym )
    { dtd_symbol *following = sym->next;

      sgml_free((ichar*)sym->name);
      sgml_free(sym);
      sym = following;
    }
    bucket++;
  }

  sgml_free(t->entries);
  sgml_free(t);
}
/* Look up `name' in the DTD's symbol table without creating it.
   Hashing and comparison are case-sensitive or case-insensitive
   depending on dtd->case_sensitive.  Returns the symbol or NULL. */
dtd_symbol *
dtd_find_symbol(dtd *dtd, const ichar *name)
{ dtd_symbol_table *table = dtd->symbols;
  dtd_symbol *sym;

  if ( dtd->case_sensitive )
  { int bucket = istrhash(name, table->size);

    for(sym = table->entries[bucket]; sym; sym = sym->next)
    { if ( istreq(sym->name, name) )
	return sym;
    }
    return NULL;
  }

  { int bucket = istrcasehash(name, table->size);

    for(sym = table->entries[bucket]; sym; sym = sym->next)
    { if ( istrcaseeq(sym->name, name) )
	return sym;
    }
  }

  return NULL;
}
/* Same as dtd_find_symbol(), but the choice between case-sensitive and
   case-insensitive lookup follows dtd->ent_case_sensitive. */
static dtd_symbol *
dtd_find_entity_symbol(dtd *dtd, const ichar *name)
{ dtd_symbol_table *table = dtd->symbols;
  dtd_symbol *sym;

  if ( dtd->ent_case_sensitive )
  { int bucket = istrhash(name, table->size);

    for(sym = table->entries[bucket]; sym; sym = sym->next)
    { if ( istreq(sym->name, name) )
	return sym;
    }
    return NULL;
  }

  { int bucket = istrcasehash(name, table->size);

    for(sym = table->entries[bucket]; sym; sym = sym->next)
    { if ( istrcaseeq(sym->name, name) )
	return sym;
    }
  }

  return NULL;
}
/* Return the symbol for `name', creating it when it does not exist
   yet.  The lookup is an exact (case-sensitive) match; a new symbol
   gets its own copy of the name and is put at the head of its bucket. */
dtd_symbol *
dtd_add_symbol(dtd *dtd, const ichar *name)
{ dtd_symbol_table *table = dtd->symbols;
  int bucket = istrhash(name, table->size);
  dtd_symbol *sym = table->entries[bucket];

  while ( sym )
  { if ( istreq(sym->name, name) )
      return sym;
    sym = sym->next;
  }

  sym = sgml_calloc(1, sizeof(*sym));
  sym->name = istrdup(name);
  sym->next = table->entries[bucket];
  table->entries[bucket] = sym;

  return sym;
}
/*******************************
* ENTITIES *
*******************************/
/* Free a chain of entities together with the strings each one owns
   (value, external id, external url and base url). */
static void
free_entity_list(dtd_entity *e)
{ while ( e )
  { dtd_entity *following = e->next;

    if ( e->value )   sgml_free(e->value);
    if ( e->extid )   sgml_free(e->extid);
    if ( e->exturl )  sgml_free(e->exturl);
    if ( e->baseurl ) sgml_free(e->baseurl);
    sgml_free(e);

    e = following;
  }
}
/* Find the parameter entity whose name symbol is `id'.  Returns NULL
   when the DTD has no such parameter entity. */
static dtd_entity *
find_pentity(dtd *dtd, dtd_symbol *id)
{ dtd_entity *ent = dtd->pentities;

  while ( ent && ent->name != id )
    ent = ent->next;

  return ent;
}
/* Resolve the file for a SYSTEM or PUBLIC entity through the
   catalogue.  Returns a newly allocated path, or NULL for other entity
   types or when the catalogue yields nothing.  The returned path must
   be freed when done. */
static ichar *
entity_file(dtd *dtd, dtd_entity *e)
{ const ichar *found;

  if ( e->type != ET_SYSTEM && e->type != ET_PUBLIC )
    return NULL;

  found = find_in_catalogue(e->catalog_location,
			    e->name->name,
			    e->extid,
			    e->exturl,
			    dtd->dialect != DL_SGML);
  if ( !found )
    return NULL;

  /* `found' is owned by the catalog: hand out a private copy */
  if ( is_absolute_path(found) || !e->baseurl )
    return istrdup(found);

  return localpath(e->baseurl, found);
}
/* Return the value of entity `e'.  If no value is stored yet and
   entity_file() finds a file for it, the file is loaded and the
   result is stored in the entity.  When `len' is non-NULL it receives
   e->length.  May return NULL. */
static const ichar *
entity_value(dtd_parser *p, dtd_entity *e, int *len)
{ if ( !e->value )
  { ichar *path = entity_file(p->dtd, e);

    if ( path )
    { int normalise = (e->content == EC_SGML || e->content == EC_CDATA);
      size_t loaded;

      e->value = load_sgml_file_to_charp(path, normalise, &loaded);
      e->length = (long)loaded;
      sgml_free(path);
    }
  }

  if ( len )
    *len = e->length;

  return e->value;
}
/* Copy `in' (of length `ilen', or nul-terminated when `ilen' is
   ZERO_TERM_LEN) to `out', expanding %name; parameter entity
   references recursively and replacing numeric character entities by
   the character they denote.  `len' is the room available in `out',
   counted in characters.  Returns TRUE on success; on failure the
   contents of `out' must not be relied upon. */
static int
expand_pentities(dtd_parser *p, const ichar *in, int ilen, ichar *out, int len)
{ dtd *dtd = p->dtd;
  const int pero = dtd->charfunc->func[CF_PERO];	/* % */
  const int ero  = dtd->charfunc->func[CF_ERO];		/* & */
  const ichar *stop = (ilen == ZERO_TERM_LEN) ? in + wcslen(in) : &in[ilen];
  const ichar *next;

  while ( in < stop )
  { if ( *in == pero )
    { dtd_symbol *id;

      next = itake_entity_name(p, in+1, &id);
      if ( next )
      { dtd_entity *ent = find_pentity(dtd, id);
	const ichar *text;
	int written;

	in = next;
	next = isee_func(dtd, in, CF_ERC);	/* ; is not obligatory? */
	if ( next )
	  in = next;

	if ( !ent )
	  return gripe(p, ERC_EXISTENCE, L"parameter entity", id->name);

	text = entity_value(p, ent, NULL);
	if ( !text )
	  return FALSE;

	if ( !expand_pentities(p, text, ZERO_TERM_LEN, out, len) )
	  return FALSE;

	written = (int)istrlen(out);		/* could be better */
	out += written;
	len -= written;
	continue;
      }
    }

    if ( --len <= 0 )
    { gripe(p, ERC_REPRESENTATION, L"Declaration too long");
      return FALSE;
    }

    if ( *in == ero && in[1] == '#' )		/* &# */
    { int chr;

      next = isee_character_entity(dtd, in, &chr);
      if ( next )
      { if ( chr != 0 )
	{ *out++ = chr;
	  in = next;
	  continue;
	}
	/* code 0: report it and copy the text literally below */
	gripe(p, ERC_SYNTAX_ERROR, L"Illegal character entity", in);
      }
    }

    *out++ = *in++;
  }

  *out = '\0';

  return TRUE;
}
/* Translate the text of a character reference, starting at its '#',
   into a character code.  Accepts decimal (#123), hexadecimal (#x7B)
   and the names RS, RE, TAB and SPACE.  Returns -1 when `decl' is
   not such a reference. */
static int
char_entity_value(const ichar *decl)
{ const ichar *body;
  ichar *tail;
  long code;

  if ( *decl != '#' )
    return -1;

  body = decl+1;
					/* do octal too? */
  if ( body[0] == 'x' || body[0] == 'X' )
    code = wcstoul(body+1, &tail, 16);
  else
    code = wcstoul(body, &tail, 10);

  if ( *tail == '\0' )
    return (int)code;

  if ( istreq(body, L"RS") )
    return '\n';
  if ( istreq(body, L"RE") )
    return '\r';
  if ( istreq(body, L"TAB") )
    return '\t';
  if ( istreq(body, L"SPACE") )
    return ' ';

  return -1;
}
/* Recognise a character entity reference (&#...;) at `in'.  On
   success store the character code in *chr and return the position
   after the reference, where the closing delimiter is optional.
   Returns NULL if `in' does not start such a reference or its text
   cannot be translated by char_entity_value(). */
static const ichar *
isee_character_entity(dtd *dtd, const ichar *in, int *chr)
{ ichar text[32];
  ichar *w = text;
  const ichar *s = isee_func(dtd, in, CF_ERO);
  int code;

  if ( !s || *s != '#' )
    return NULL;

  *w++ = *s++;				/* the '#' */
  while( w < text+sizeof(text)/sizeof(ichar)-1 && HasClass(dtd, *s, CH_NAME) )
    *w++ = *s++;
  if ( isee_func(dtd, s, CF_ERC) )	/* skip ; */
    s++;
  *w = '\0';

  code = char_entity_value(text);
  if ( code < 0 )
    return NULL;

  *chr = code;
  return s;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Expand entities in a string. Used to expand CDATA attribute values.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Append the first `len' characters of `in' to `out', replacing
   character entities and general entity references by their value.
   Values of EC_SGML entities are expanded recursively, other values
   are copied verbatim.  Text that cannot be expanded is copied
   literally, with blank characters (and CR LF pairs) turned into a
   single space.  The buffer is terminated before returning TRUE. */
static int
expand_entities(dtd_parser *p, const ichar *in, int len, ocharbuf *out)
{ dtd *dtd = p->dtd;
  const ichar *limit = &in[len];
  int ero = dtd->charfunc->func[CF_ERO];	/* & */

  while ( in < limit )
  { if ( *in == ero )
    { const ichar *amp = in;		/* for recovery */
      const ichar *after;
      int chr;

      after = isee_character_entity(dtd, in, &chr);
      if ( after )
      { if ( chr == 0 )
	  gripe(p, ERC_SYNTAX_ERROR, L"Illegal character entity", in);

	add_ocharbuf(out, chr);
	in = after;
	continue;
      }

      if ( HasClass(dtd, in[1], CH_NMSTART) )
      { dtd_symbol *id;
	dtd_entity *ent;
	const ichar *text;

	in = itake_name(p, in+1, &id);
	if ( !in )
	{ in = amp;
	  goto recover;
	}
	if ( isee_func(dtd, in, CF_ERC) || *in == '\n' )
	  in++;

	ent = id->entity;
	if ( !ent )
	  ent = dtd->default_entity;
	if ( !ent )
	{ gripe(p, ERC_EXISTENCE, L"entity", id->name);
	  in = amp;
	  goto recover;
	}

	text = entity_value(p, ent, NULL);
	if ( !text )
	{ gripe(p, ERC_NO_VALUE, ent->name->name);
	  in = amp;
	  goto recover;
	}

	if ( ent->content == EC_SGML )
	{ if ( !expand_entities(p, text, (int)istrlen(text), out) )
	    return FALSE;
	} else
	{ for( ; *text; text++ )
	    add_ocharbuf(out, *text);
	}

	continue;
      }

      if ( dtd->dialect != DL_SGML )
	gripe(p, ERC_SYNTAX_ERROR, L"Illegal entity", amp);
    }

  recover:
    if ( *in == CR && in[1] == LF )
      in++;

    if ( HasClass(dtd, *in, CH_BLANK) )
    { add_ocharbuf(out, ' ');
      in++;
    } else
    { add_ocharbuf(out, *in++);
    }
  }

  terminate_ocharbuf(out);

  return TRUE;
}
/*******************************
* ELEMENTS *
*******************************/
/* Return the element attached to symbol `id'.  If there is none yet,
   create one marked as undefined, attach it to the symbol and put it
   at the head of the DTD's element list. */
static dtd_element *
find_element(dtd *dtd, dtd_symbol *id)
{ dtd_element *el = id->element;

  if ( el )
    return el;				/* must check */

  el = sgml_calloc(1, sizeof(*el));
  el->name       = id;
  el->undefined  = TRUE;
  el->space_mode = SP_INHERIT;
  id->element    = el;

  el->next      = dtd->elements;
  dtd->elements = el;

  return el;
}
/* Allocate a zero-filled element definition.  The `dtd' argument is
   not used. */
static dtd_edef *
new_element_definition(dtd *dtd)
{ dtd_edef *edef = sgml_calloc(1, sizeof(*edef));

  STAT(edefs_created++);

  return edef;
}
/* Like find_element(), but also make sure the element has a structure:
   a fresh definition of type C_EMPTY with one reference is attached
   when none is present. */
static dtd_element *
def_element(dtd *dtd, dtd_symbol *id)
{ dtd_element *el = find_element(dtd, id);

  if ( el->structure )
    return el;

  el->structure = new_element_definition(dtd);
  el->structure->references = 1;
  el->structure->type = C_EMPTY;

  return el;
}
/* Free the cells of a name list. */
static void
free_name_list(dtd_name_list *nl)
{ while ( nl )
  { dtd_name_list *following = nl->next;

    sgml_free(nl);
    nl = following;
  }
}
#define REFS_VIRGIN (-42)
/* Drop one reference to attribute `a'.  The attribute is destroyed
   when its count is REFS_VIRGIN or reaches zero; destruction also
   releases the name list of NAMEOF/NOTATION attributes and the stored
   default of DEFAULT/FIXED attributes (list or CDATA string). */
static void
free_attribute(dtd_attr *a)
{ if ( a->references != REFS_VIRGIN && --a->references != 0 )
    return;				/* still in use */

  if ( a->type == AT_NAMEOF || a->type == AT_NOTATION )
    free_name_list(a->typeex.nameof);

  if ( a->def == AT_DEFAULT || a->def == AT_FIXED )
  { if ( a->islist )
      sgml_free(a->att_def.list);
    else if ( a->type == AT_CDATA && a->att_def.cdata )
      sgml_free(a->att_def.cdata);
  }

  sgml_free(a);
}
/* Free the cells of an attribute list, releasing one reference to
   each attribute on it. */
static void
free_attribute_list(dtd_attr_list *l)
{ while ( l )
  { dtd_attr_list *following = l->next;

    free_attribute(l->attribute);
    sgml_free(l);
    l = following;
  }
}
/* Free the cells of an element list; the elements themselves are not
   touched. */
static void
free_element_list(dtd_element_list *l)
{ while ( l )
  { dtd_element_list *following = l->next;

    sgml_free(l);
    l = following;
  }
}
/* Drop one reference to `def'.  When the count reaches zero, free the
   content model, the inclusion and exclusion lists, the state engine
   and the definition itself. */
static void
free_element_definition(dtd_edef *def)
{ if ( --def->references != 0 )
    return;

  STAT(edefs_freed++);

  if ( def->content )
    free_model(def->content);
  free_element_list(def->included);
  free_element_list(def->excluded);
  free_state_engine(def->initial_state);

  sgml_free(def);
}
/* Free a chain of elements: release each element's structure (if
   any) and attribute list, then the element record. */
static void
free_elements(dtd_element *e)
{ while ( e )
  { dtd_element *following = e->next;

    if ( e->structure )
      free_element_definition(e->structure);
    free_attribute_list(e->attributes);
    sgml_free(e);

    e = following;
  }
}
/*******************************
* ATTRIBUTES *
*******************************/
/* Return the attribute of element `e' whose name symbol is `name', or
   NULL if the element has no such attribute. */
static dtd_attr *
find_attribute(dtd_element *e, dtd_symbol *name)
{ dtd_attr_list *cell = e->attributes;

  while ( cell )
  { if ( cell->attribute->name == name )
      return cell->attribute;
    cell = cell->next;
  }

  return NULL;
}
/*******************************
* PARSE PRIMITIVES *
*******************************/
/* Skip blank characters and comments (text between two pairs of the
   DTD's comment character) starting at `in'.  Returns a pointer to the
   first character that is neither, or to the terminating nul.

   An unterminated comment extends to the end of the string: in that
   case the terminating nul is returned.  The scan must not step past
   that nul, because the following reads would be outside the string. */
static const ichar *
iskip_layout(dtd *dtd, const ichar *in)
{ ichar cmt = dtd->charfunc->func[CF_CMT]; /* also skips comment */

  for( ; *in; in++ )
  { if ( HasClass(dtd, *in, CH_BLANK) )
      continue;

    if ( in[0] == cmt && in[1] == cmt )
    { in += 2;				/* skip the opening pair */

      while ( *in && !(in[0] == cmt && in[1] == cmt) )
	in++;
      if ( !*in )			/* unterminated comment */
	return in;

      in++;				/* now at 2nd char of closing pair; */
      continue;				/* the loop increment steps past it */
    }

    return in;
  }

  return in;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
See whether we are looking at identifier "id". "id" must be lowercase!
This is only used for reserved words, and parsed case-insensitive in both
XML and SGML modes.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* After skipping layout, check whether the input spells the
   (lowercase) keyword `id', compared after lowercasing the input, and
   is not followed by another name character.  On a match, return the
   position after the keyword and any layout behind it; else NULL. */
static const ichar *
isee_identifier(dtd *dtd, const ichar *in, char *id)
{ const ichar *s = iskip_layout(dtd, in);

					/* match */
  for( ; *id && (wint_t)*id == towlower(*s); id++, s++ )
    ;

  if ( *id != 0 || HasClass(dtd, *s, CH_NAME) )
    return NULL;

  return iskip_layout(dtd, s);
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_name(dtd_parser *p, const ichar *in, dtd_symbol **id)
{ ichar buf[MAXNMLEN];
ichar *o = buf;
ichar *e = &buf[MAXNMLEN]-1;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
in = iskip_layout(dtd, in);
if ( !HasClass(dtd, *in, CH_NMSTART) )
return NULL;
if ( dtd->case_sensitive )
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = *in++;
} else
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = towlower(*in++);
}
if ( o == e )
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_REPRESENTATION, L"NAME too long");
return NULL;
}
*o++ = '\0';
*id = dtd_add_symbol(dtd, buf);
return iskip_layout(dtd, in);
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_entity_name(dtd_parser *p, const ichar *in, dtd_symbol **id)
{ ichar buf[MAXNMLEN];
ichar *o = buf;
ichar *e = &buf[MAXNMLEN]-1;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
in = iskip_layout(dtd, in);
if ( !HasClass(dtd, *in, CH_NMSTART) )
return NULL;
if ( dtd->ent_case_sensitive )
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = *in++;
} else
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = towlower(*in++);
}
if ( o == e )
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_REPRESENTATION, L"Entity NAME too long");
return NULL;
}
*o++ = '\0';
*id = dtd_add_symbol(dtd, buf);
return in;
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_nmtoken(dtd_parser *p, const ichar *in, dtd_symbol **id)
{ ichar buf[MAXNMLEN];
ichar *o = buf;
ichar *e = &buf[MAXNMLEN]-1;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
in = iskip_layout(dtd, in);
if ( !HasClass(dtd, *in, CH_NAME) )
return NULL;
if ( dtd->case_sensitive )
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = *in++;
} else
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = towlower(*in++);
}
if ( o == e )
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_REPRESENTATION, L"NMTOKEN too long");
return NULL;
}
*o = '\0';
*id = dtd_add_symbol(dtd, buf);
return iskip_layout(dtd, in);
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_nutoken(dtd_parser *p, const ichar *in, dtd_symbol **id)
{ ichar buf[MAXNMLEN];
ichar *o = buf;
ichar *e = &buf[MAXNMLEN]-1;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
in = iskip_layout(dtd, in);
if ( !HasClass(dtd, *in, CH_DIGIT) )
return NULL;
if ( dtd->case_sensitive )
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = *in++;
} else
{ while( HasClass(dtd, *in, CH_NAME) && o < e )
*o++ = towlower(*in++);
}
if ( o == e )
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_REPRESENTATION, L"NUTOKEN too long");
return NULL;
}
*o = '\0';
if ( o - buf > 8 )
2010-05-06 10:59:09 +01:00
gripe(p, ERC_LIMIT, L"nutoken length");
*id = dtd_add_symbol(dtd, buf);
return iskip_layout(dtd, in);
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_number(dtd_parser *p, const ichar *in, dtd_attr *at)
{ dtd *dtd = p->dtd;
in = iskip_layout(dtd, in);
switch(dtd->number_mode)
{ case NU_TOKEN:
{ ichar buf[MAXNMLEN];
ichar *o = buf;
while( HasClass(dtd, *in, CH_DIGIT) )
*o++ = *in++;
if ( o == buf )
return NULL; /* empty */
*o = '\0';
at->att_def.name = dtd_add_symbol(dtd, buf);
return iskip_layout(dtd, (const ichar *)in);
}
case NU_INTEGER:
{ ichar *end;
at->att_def.number = wcstol(in, &end, 10);
if ( end > in && errno != ERANGE )
return iskip_layout(dtd, end);
}
}
return NULL;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Get a quoted value. After successful return, *start points to the start
of the string in the input and *len to the length. The data is *not*
nul terminated.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Read a string delimited by a LIT or LITA quote character.  *start is
   pointed at the first character behind the opening quote; on success
   *len receives the number of characters before the matching closing
   quote and the position after that quote and trailing layout is
   returned.  Returns NULL if no quote is found or the string is not
   closed. */
static const ichar *
itake_string(dtd *dtd, const ichar *in, ichar **start, int *len)
{ ichar quote;

  in = iskip_layout(dtd, in);

  if ( !isee_func(dtd, in, CF_LIT) &&
       !isee_func(dtd, in, CF_LITA) )
    return NULL;

  quote = *in++;
  *start = (ichar *)in;

  while( *in && *in != quote )
    in++;
  if ( !*in )
    return NULL;			/* no closing quote */

  *len = (int)(in - (*start));

  return iskip_layout(dtd, in+1);
}
/* Read a quoted string with itake_string() and store a newly
   allocated copy of it in *out.  Returns the position after the
   string, or NULL (leaving *out untouched) if there is no string. */
static const ichar *
itake_dubbed_string(dtd *dtd, const ichar *in, ichar **out)
{ ichar *text;
  int textlen;
  const ichar *after = itake_string(dtd, in, &text, &textlen);

  if ( after )
    *out = istrndup(text, textlen);

  return after;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
itake_url() is used to get the argument of a SYSTEM or 2nd argument of a
PUBLIC reference. Once upon a time it tried to tag the argument as
file:<path>, but this job cannot be done before lookup in the catalogue. It
is now the same as itake_dubbed_string(), so we simply call this one.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Read the quoted system identifier (URL) at `in'; *out receives the
   copy made by itake_dubbed_string().  Returns the new position, or
   NULL if no quoted string is found. */
static const ichar *
itake_url(dtd *dtd, const ichar *in, ichar **out)
{ return itake_dubbed_string(dtd, in, out);
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_nmtoken_chars(dtd_parser *p, const ichar *in, ichar *out, int len)
{ dtd *dtd = p->dtd;
in = iskip_layout(dtd, in);
if ( !HasClass(dtd, *in, CH_NAME) )
return NULL;
while( HasClass(dtd, *in, CH_NAME) )
{ if ( --len <= 0 )
2010-05-06 10:59:09 +01:00
gripe(p, ERC_REPRESENTATION, L"Name token too long");
*out++ = (dtd->case_sensitive ? *in++ : (ichar)towlower(*in++));
}
*out++ = '\0';
return iskip_layout(dtd, in);
}
/* There used to be a function
itake_nonblank_chars(dtd, in, out, len) -> new end
which
- skipped layout,
- copied characters from in[] to out[] until layout or \0 was found,
- added a terminating \0 to out[],
- skipped any following layout, and
- returned the new position.
That function was only called by get_attribute_value(), which used
it to parse an unquoted attribute value. According to SGML, that's
not right: unquoted attribute values must look like NMTOKENs (but
have a different length bound). In particular, elements like
<foo a=bar>zoo</foo>
<foo a=ugh/zip/
are perfectly legal, so scanning an unquoted attribute value MUST
stop at a '/' or '>'. According to HTML practice, pretty much any
old junk will be accepted, and some HTML parsers will allow bare
slashes in such an attribute.
Typical HTML is *so* bad that it doesn't agree with *any* part of
the HTML specifications (e.g., <FONT> is commonly wrapped around
block-level elements, which has never been legal). It's not clear
that there is much point in trying to accommodate bad HTML; if you
really need to do that, use the free program HTML Tidy (from the
http://www.w3c.org/ site) to clean up, and parse its output instead.
However, in order to break as little as possible, the new (sgml-1.0.14)
function accepts anything except > / \0 and blanks.
JW: I decided to accept / as part of an unquoted in SGML-mode if
shorttag is disabled as well as in XML mode if it is not the
end of the begin-element
*/
static ichar const *
2010-05-06 10:59:09 +01:00
itake_unquoted(dtd_parser *p, ichar const *in, ichar *out, int len)
{ dtd *dtd = p->dtd;
ichar const end2 = dtd->charfunc->func[CF_ETAGO2]; /* / */
ichar c;
/* skip leading layout. Do NOT skip comments! --x-- is a value! */
while (c = *in, HasClass(dtd, c, CH_BLANK))
in++;
/* copy the attribute to out[] */
while ( !HasClass(dtd, c, CH_BLANK) &&
c != '\0' )
{ if ( c == end2 && (dtd->shorttag ||
(in[1] == '\0' && dtd->dialect != DL_SGML)) )
break;
if ( --len > 0 )
*out++ = c;
else if ( len == 0 )
2010-05-06 10:59:09 +01:00
gripe(p, ERC_REPRESENTATION, L"Attribute too long");
c = *++in;
}
*out = '\0';
/* skip trailing layout. While it is kind to skip comments here,
it is technically wrong to do so. Tags may not contain comments.
*/
return iskip_layout(dtd, in);
}
/*******************************
* DTD *
*******************************/
/* Create a DTD object with the SGML defaults: implicit, SGML dialect
   and space handling, case-sensitive entities, shorttag enabled and
   numbers handled as tokens.  `doctype' may be NULL; otherwise it is
   copied. */
dtd *
new_dtd(const ichar *doctype)
{ dtd *d = sgml_calloc(1, sizeof(*d));

  STAT(dtd_created++);

  d->magic    = SGML_DTD_MAGIC;
  d->implicit = TRUE;
  d->dialect  = DL_SGML;
  if ( doctype )
    d->doctype = istrdup(doctype);

  d->symbols   = new_symbol_table();
  d->charclass = new_charclass();
  d->charfunc  = new_charfunc();

  d->space_mode         = SP_SGML;
  d->ent_case_sensitive = TRUE;		/* case-sensitive entities */
  d->shorttag           = TRUE;		/* allow for <tag/value/ */
  d->number_mode        = NU_TOKEN;

  return d;
}
/* Drop one reference to `dtd'.  When the count reaches zero everything
   the DTD owns is released, the magic number is cleared and the DTD
   itself is freed. */
void
free_dtd(dtd *dtd)
{ if ( --dtd->references != 0 )
    return;				/* still referenced */

  STAT(dtd_freed++);

  if ( dtd->doctype )
    sgml_free(dtd->doctype);

  free_entity_list(dtd->entities);
  free_entity_list(dtd->pentities);
  free_notations(dtd->notations);
  free_shortrefs(dtd->shortrefs);
  free_elements(dtd->elements);
  free_symbol_table(dtd->symbols);
  sgml_free(dtd->charfunc);
  sgml_free(dtd->charclass);
  dtd->magic = 0;

  sgml_free(dtd);
}
/* Entity declarations for the predefined XML entities.  The list is
   NULL-terminated; set_dialect_dtd() passes each string to
   process_entity_declaration() when switching to an XML dialect. */
static const wchar_t *xml_entities[] =
{ L"lt CDATA \"&#60;\"", /* < */
L"gt CDATA \"&#62;\"", /* > */
L"amp CDATA \"&#38;\"", /* & */
L"apos CDATA \"&#39;\"", /* ' */
L"quot CDATA \"&#34;\"", /* " */
NULL
};
/* Switch the DTD to `dialect'.  Nothing happens if that dialect is
   already set.  SGML selects case-insensitive names, SGML space
   handling and shorttag.  XML and XMLNS select case-sensitive names,
   UTF-8 encoding, space preservation and no shorttag, and declare the
   predefined XML entities.  Always returns TRUE. */
int
set_dialect_dtd(dtd *dtd, dtd_dialect dialect)
{ if ( dtd->dialect == dialect )
    return TRUE;

  dtd->dialect = dialect;

  if ( dialect == DL_SGML )
  { dtd->case_sensitive = FALSE;
    dtd->space_mode = SP_SGML;
    dtd->shorttag = TRUE;
  } else if ( dialect == DL_XML || dialect == DL_XMLNS )
  { const ichar **decl;
    dtd_parser parser;			/* dummy parser carrying only the DTD */

    dtd->case_sensitive = TRUE;
    dtd->encoding = SGML_ENC_UTF8;
    dtd->space_mode = SP_PRESERVE;
    dtd->shorttag = FALSE;

    memset(&parser, 0, sizeof(parser));
    parser.dtd = dtd;
    for(decl = xml_entities; *decl; decl++)
      process_entity_declaration(&parser, *decl);
  }

  return TRUE;
}
/* Set DTD option `option' to `set'.  Only OPT_SHORTTAG is handled
   here; other options are ignored.  Always returns TRUE. */
int
set_option_dtd(dtd *dtd, dtd_option option, int set)
{ if ( option == OPT_SHORTTAG )
    dtd->shorttag = set;

  return TRUE;
}
/* Return the name of the file the parser is currently reading, or
   NULL when the input is not a file or has no name. */
static const ichar *
baseurl(dtd_parser *p)
{ if ( p->location.type != IN_FILE )
    return NULL;
  if ( !p->location.name.file )
    return NULL;

  return p->location.name.file;
}
/* Parse the value part of an entity declaration into `e'.

   - ET_SYSTEM:  a quoted URL; stored in e->exturl together with a copy
                 of the current base url.
   - ET_PUBLIC:  a quoted public identifier (stored in e->extid)
                 optionally followed by a quoted URL.
   - ET_LITERAL: a quoted value; stored in e->value / e->length.

   For PUBLIC and LITERAL, parameter entities in the quoted string are
   expanded first.  If that expansion fails, the buffer may be
   unterminated or only partly filled, so NULL is returned instead of
   duplicating its contents.

   Returns the position after the parsed value, or NULL on error. */
static const ichar *
process_entity_value_declaration(dtd_parser *p,
				 const ichar *decl, dtd_entity *e)
{ dtd *dtd = p->dtd;
  const ichar *s;

  if ( e->type == ET_SYSTEM )
  { if ( (s=itake_url(dtd, decl, &e->exturl)) )
    { e->baseurl = istrdup(baseurl(p));
      return s;
    }

    goto string_expected;
  } else
  { ichar *start; int len;
    ichar val[MAXSTRINGLEN];

    if ( !(s = itake_string(dtd, decl, &start, &len)) )
      goto string_expected;
    decl = s;

    /* val[] is only valid when the expansion succeeded */
    if ( !expand_pentities(p, start, len, val, sizeof(val)/sizeof(ichar)) )
      return NULL;

    switch ( e->type )
    { case ET_PUBLIC:
      { e->extid = istrdup(val);
	if ( isee_func(dtd, decl, CF_LIT) ||
	     isee_func(dtd, decl, CF_LITA) )
	{ if ( (s=itake_url(dtd, decl, &e->exturl)) )
	  { e->baseurl = istrdup(baseurl(p));
	    decl = s;
	  }
	}
	return decl;
      }
      case ET_LITERAL:
      { e->value = istrdup(val);
	e->length = (int)wcslen(e->value);
	return decl;
      }
      default:
	assert(0);
	return NULL;
    }
  }

string_expected:
  gripe(p, ERC_SYNTAX_ERROR, L"String expected", decl);
  return NULL;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
The sgml-standard tells us to accept the first definition of an entity,
silently suppressing any further attempt to redefine the entity.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Process the body of an <!ENTITY ...> declaration.

   `decl' holds the text after the ENTITY keyword.  A leading parameter
   entity reference open character makes it a parameter entity.  The
   name may be #DEFAULT, which defines the default entity.  A
   redefinition is reported and otherwise ignored (TRUE is returned).

   STARTTAG and ENDTAG literals are rewritten into the tag text they
   denote and then treated as EC_SGML content.

   Returns TRUE on success or ignored redefinition; on a syntax error
   the result of gripe() is returned.  An entity allocated here is
   released again before an error return, so nothing is leaked. */
static int
process_entity_declaration(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  const ichar *s;
  dtd_symbol *id;
  dtd_entity *e;
  int isparam;
  int isdef = FALSE;
  int rc;
					/* parameter entity */
  if ( (s=isee_func(dtd, decl, CF_PERO)) )
  { isparam = TRUE;
    decl = s;
  } else
    isparam = FALSE;

  if ( !(s = itake_entity_name(p, decl, &id)) )
  { if ( !(s = isee_identifier(dtd, decl, "#default")) )
      return gripe(p, ERC_SYNTAX_ERROR, L"Name expected", decl);
    /* symbol names are wide strings: use a wide literal */
    id = dtd_add_symbol(dtd, L"#DEFAULT");
    isdef = TRUE;
  }

  if ( isparam && find_pentity(dtd, id) )
  { gripe(p, ERC_REDEFINED, L"parameter entity", id);
    return TRUE;			/* already defined parameter entity */
  }
  if ( id->entity )
  { gripe(p, ERC_REDEFINED, L"entity", id);
    return TRUE;			/* already defined normal entity */
  }

  decl = iskip_layout(dtd, s);
  e = sgml_calloc(1, sizeof(*e));
  e->name = id;
  e->catalog_location = (isparam ? CAT_PENTITY : CAT_ENTITY);

  if ( (s = isee_identifier(dtd, decl, "system")) )
  { e->type = ET_SYSTEM;
    e->content = EC_SGML;
    decl = s;
  } else if ( (s = isee_identifier(dtd, decl, "public")) )
  { e->type = ET_PUBLIC;
    e->content = EC_SGML;
    decl = s;
  } else
  { e->type = ET_LITERAL;

    if ( !isparam )
    { if ( (s=isee_identifier(dtd, decl, "cdata")) )
      { decl = s;
	e->content = EC_CDATA;
      } else if ( (s=isee_identifier(dtd, decl, "sdata")) )
      { decl = s;
	e->content = EC_SDATA;
      } else if ( (s=isee_identifier(dtd, decl, "pi")) )
      { decl = s;
	e->content = EC_PI;
      } else if ( (s=isee_identifier(dtd, decl, "starttag")) )
      { decl = s;
	e->content = EC_STARTTAG;
      } else if ( (s=isee_identifier(dtd, decl, "endtag")) )
      { decl = s;
	e->content = EC_ENDTAG;
      } else
	e->content = EC_SGML;
    }
  }

  if ( (decl=process_entity_value_declaration(p, decl, e)) )
  { if ( e->type == ET_LITERAL )
    { switch(e->content)
      { case EC_STARTTAG:
	{ /* open + value + close + nul */
	  ichar *buf = sgml_malloc((e->length + 3)*sizeof(ichar));

	  buf[0] = dtd->charfunc->func[CF_STAGO];
	  istrcpy(&buf[1], e->value);
	  buf[++e->length] = dtd->charfunc->func[CF_STAGC];
	  buf[++e->length] = 0;

	  sgml_free(e->value);
	  e->value = buf;
	  e->content = EC_SGML;

	  break;
	}
	case EC_ENDTAG:
	{ /* two-char open + value + close + nul */
	  ichar *buf = sgml_malloc((e->length + 4)*sizeof(ichar));

	  buf[0] = dtd->charfunc->func[CF_ETAGO1];
	  buf[1] = dtd->charfunc->func[CF_ETAGO2];
	  istrcpy(&buf[2], e->value);
	  e->length++;
	  buf[++e->length] = dtd->charfunc->func[CF_STAGC];
	  buf[++e->length] = 0;

	  sgml_free(e->value);
	  e->value = buf;
	  e->content = EC_SGML;

	  break;
	}
	default:
	  break;
      }
    } else
    { if ( *decl )
      { dtd_symbol *nname;

	if ( (s=isee_identifier(dtd, decl, "cdata")) )
	{ decl = s;
	  e->content = EC_CDATA;
	} else if ( (s=isee_identifier(dtd, decl, "sdata")) )
	{ decl = s;
	  e->content = EC_SDATA;
	} else if ( (s=isee_identifier(dtd, decl, "ndata")) )
	{ decl = s;
	  e->content = EC_NDATA;
	} else
	{ rc = gripe(p, ERC_SYNTAX_ERROR, L"Bad datatype declaration", decl);
	  goto discard;
	}

	if ( (s=itake_name(p, decl, &nname)) ) /* what is this? */
	{ decl = s;
	} else
	{ rc = gripe(p, ERC_SYNTAX_ERROR, L"Bad notation declaration", decl);
	  goto discard;
	}
      }
    }

    if ( *decl )
    { rc = gripe(p, ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl);
      goto discard;
    }
  }

  if ( isparam )
  { e->next = dtd->pentities;
    dtd->pentities = e;
  } else
  { e->name->entity = e;
    e->next = dtd->entities;
    dtd->entities = e;
  }

  if ( isdef )
    dtd->default_entity = e;

  return TRUE;

discard:
  /* `e' is not linked into the DTD yet and e->next is NULL (calloc),
     so this frees exactly this entity and the strings it owns. */
  free_entity_list(e);
  return rc;
}
/*******************************
* NOTATIONS *
*******************************/
/* Return the notation whose name symbol is `name', or NULL if the DTD
   does not declare it. */
static dtd_notation *
find_notation(dtd *dtd, dtd_symbol *name)
{ dtd_notation *cur = dtd->notations;

  while ( cur && cur->name != name )
    cur = cur->next;

  return cur;
}
/* Append notation `not' to the end of the DTD's notation list. */
static void
add_notation(dtd *dtd, dtd_notation *not)
{ dtd_notation **tail = &dtd->notations;

  while ( *tail )
    tail = &(*tail)->next;

  *tail = not;
}
/* Process the body of a <!NOTATION ...> declaration: a name followed
   by SYSTEM [system-id] or PUBLIC public-id [system-id].

   A notation that already exists is reported and otherwise ignored
   (TRUE is returned).  On success a notation owning copies of the
   identifiers is appended to the DTD.  On a syntax error the result of
   gripe() is returned; identifier copies made before the error was
   detected are released again. */
static int
process_notation_declaration(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  dtd_symbol *nname;
  const ichar *s;
  ichar *system = NULL, *public = NULL;
  dtd_notation *not;

  if ( !(s=itake_name(p, decl, &nname)) )
    return gripe(p, ERC_SYNTAX_ERROR, L"Notation name expected", decl);
  decl = s;

  if ( find_notation(dtd, nname) )
  { gripe(p, ERC_REDEFINED, L"notation", nname);
    return TRUE;
  }

  if ( (s=isee_identifier(dtd, decl, "system")) )
  { ;
  } else if ( (s=isee_identifier(dtd, decl, "public")) )
  { decl = s;
    if ( !(s=itake_dubbed_string(dtd, decl, &public)) )
      return gripe(p, ERC_SYNTAX_ERROR, L"Public identifier expected", decl);
  } else
    return gripe(p, ERC_SYNTAX_ERROR, L"SYSTEM or PUBLIC expected", decl);

  decl = s;
  if ( (s=itake_dubbed_string(dtd, decl, &system)) )
    decl = s;

  if ( *decl )
  { int rc = gripe(p, ERC_SYNTAX_ERROR, L"Unexpected end of declaraction", decl);

    /* the copies are not owned by any notation yet */
    if ( system )
      sgml_free(system);
    if ( public )
      sgml_free(public);

    return rc;
  }

  not = sgml_calloc(1, sizeof(*not));
  not->name = nname;
  not->system = system;
  not->public = public;
  not->next = NULL;
  add_notation(dtd, not);

  return TRUE;
}
/* Release every notation in the list starting at `n`, together with the
   system and public identifier strings each one holds. */
static void
free_notations(dtd_notation *n)
{ while ( n )
  { dtd_notation *following = n->next;	/* fetch before n is released */

    sgml_free(n->system);
    sgml_free(n->public);
    sgml_free(n);

    n = following;
  }
}
/*******************************
* SHORTREF *
*******************************/
/* Release a list of shortref maps, including each map's `from` string
   when it is set. */
static void
free_maps(dtd_map *map)
{ while ( map )
  { dtd_map *following = map->next;	/* fetch before map is released */

    if ( map->from )
      sgml_free(map->from);
    sgml_free(map);

    map = following;
  }
}
/* Release a list of shortref declarations and the maps attached to each
   of them. */
static void
free_shortrefs(dtd_shortref *sr)
{ while ( sr )
  { dtd_shortref *following = sr->next;	/* fetch before sr is released */

    free_maps(sr->map);
    sgml_free(sr);

    sr = following;
  }
}
static const ichar *
2010-05-06 10:59:09 +01:00
shortref_add_map(dtd_parser *p, const ichar *decl, dtd_shortref *sr)
{ ichar *start; int len;
ichar from[MAXMAPLEN];
ichar *f = from;
dtd_symbol *to;
const ichar *s;
const ichar *end;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
dtd_map **prev;
dtd_map *m;
if ( !(s=itake_string(dtd, decl, &start, &len)) )
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_SYNTAX_ERROR, L"map-string expected", decl);
return NULL;
}
decl = s;
2010-05-06 10:59:09 +01:00
if ( !(s=itake_entity_name(p, decl, &to)) )
{ gripe(p, ERC_SYNTAX_ERROR, L"map-to name expected", decl);
return NULL;
}
end = s;
for(decl=start; len > 0;)
{ if ( *decl == 'B' ) /* blank */
{ if ( decl[1] == 'B' )
{ *f++ = CHR_DBLANK;
decl += 2;
len -= 2;
continue;
}
*f++ = CHR_BLANK;
decl++;
len--;
} else
{ *f++ = *decl++; /* any other character */
len--;
}
}
*f = 0;
2010-05-06 10:59:09 +01:00
for(prev=&sr->map; *prev; prev = &(*prev)->next)
;
2010-05-06 10:59:09 +01:00
m = sgml_calloc(1, sizeof(*m));
m->from = istrdup(from);
m->len = (int)istrlen(from);
m->to = to;
2010-05-06 10:59:09 +01:00
*prev = m;
return end;
}
/* Return the shortref declaration named `name` from dtd->shortrefs,
   creating a zero-initialised one at the end of the list when none exists
   yet.  A freshly created entry has only its name set; its `defined` flag
   stays clear until the caller sets it. */
static dtd_shortref *
def_shortref(dtd_parser *p, dtd_symbol *name)
{ dtd *dtd = p->dtd;
  dtd_shortref *sr, **pr;

  for(pr=&dtd->shortrefs; *pr; pr = &(*pr)->next)
  { dtd_shortref *r = *pr;

    if ( r->name == name )
      return r;
  }

  sr = sgml_calloc(1, sizeof(*sr));	/* pr now addresses the tail link */
  sr->name = name;
  *pr = sr;

  return sr;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Create an array with TRUE in any character that can be the last of the
shortref map.

For every map of `sr` the last character of the map-string is examined.
If it is CHR_BLANK or CHR_DBLANK, every character code below
SHORTMAP_SIZE that has class CH_BLANK is flagged in sr->ends[].  The last
character itself is flagged as well, provided its code is below
SHORTMAP_SIZE (the loop below treats sr->ends[] as having SHORTMAP_SIZE
entries).  Maps with an empty map-string have no last character and are
skipped.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static void
compile_map(dtd *dtd, dtd_shortref *sr)
{ dtd_map *map;

  for(map = sr->map; map; map = map->next)
  { ichar last;

    if ( map->len <= 0 )		/* "": from[len-1] would be from[-1] */
      continue;
    last = map->from[map->len-1];

    switch( last )
    { case CHR_BLANK:
      case CHR_DBLANK:
      { wint_t i;

        for( i=0; i< SHORTMAP_SIZE; i++)
        { if ( HasClass(dtd, i, CH_BLANK) )
            sr->ends[i] = TRUE;
        }
      }
      /*FALLTHROUGH*/			/* NOTE(review): kept from the original,
					   which also flags the marker code */
      default:
        if ( (unsigned long)last < (unsigned long)SHORTMAP_SIZE )
          sr->ends[last] = TRUE;	/* never index outside ends[] */
    }
  }
}
/* Process the body of a SHORTREF declaration: a map name followed by any
   number of `"map-string" entity-name` pairs.

   Parameter entities in `decl` are expanded first.  The named shortref is
   created if needed; if it was already defined, ERC_REDEFINED is reported
   and TRUE is returned without changing it.  Otherwise the pairs are added
   with shortref_add_map() and the end-character table is built with
   compile_map().  Text left over after the last successfully parsed pair
   yields a "Map expected" syntax error.
*/
static int
process_shortref_declaration(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  ichar buf[MAXDECL];
  dtd_shortref *sr;
  dtd_symbol *name;
  const ichar *s;

  if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) )
    return FALSE;
  decl = buf;

  if ( !(s=itake_name(p, decl, &name)) )
    return gripe(p, ERC_SYNTAX_ERROR, L"Name expected", decl);
  decl = s;

  sr = def_shortref(p, name);
  if ( sr->defined )
  { gripe(p, ERC_REDEFINED, L"shortref", name);
    return TRUE;
  }
  sr->defined = TRUE;

  /* stops at end of text or at the first pair that fails to parse */
  while( *(decl = iskip_layout(dtd, decl)) != '\0'
         && (s=shortref_add_map(p, decl, sr)) )
    decl = s;

  compile_map(dtd, sr);

  if ( *decl )
    return gripe(p, ERC_SYNTAX_ERROR, L"Map expected", decl);

  return TRUE;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Find the shortref map with the given name.  The name NULL stands for the
#empty map, which is created on first use.

Returns the map, or NULL when no map of that name exists or when it exists
but has not been defined yet.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static dtd_shortref *
find_map(dtd *dtd, dtd_symbol *name)
{ dtd_shortref *sr;

  if ( !name )
  { static dtd_shortref *empty;		/* NOTE(review): one instance shared by
					   all DTDs; its name symbol is added
					   to the DTD of the first caller */

    if ( !empty )
    { empty = sgml_calloc(1, sizeof(*empty));
      /* must be a wide literal: symbol names are ichar (wide) strings */
      empty->name = dtd_add_symbol(dtd, L"#EMPTY");
      empty->defined = TRUE;
    }

    return empty;
  }

  for( sr = dtd->shortrefs; sr; sr = sr->next )
  { if ( sr->name == name )
    { if ( !sr->defined )		/* declared by reference only */
        break;

      return sr;
    }
  }

  return NULL;
}
/* Callback for for_elements_in_model(): attach the shortref map passed as
   `closure` to element `e`. */
static void
set_map_element(dtd_element *e, void *closure)
{ void *map = closure;

  e->map = map;
}
/* Process the body of a USEMAP declaration: a map name (or #empty)
   optionally followed by an element name or a parenthesised model.

   Parameter entities in `decl` are expanded first.  If the map is not
   (yet) defined, an undefined placeholder is created with def_shortref().
   The map is then attached
     - to every element of the model, if a group follows;
     - to the named element, if a name follows;
     - to the current environment (and made the parser's active map), if
       nothing follows and an element is open.  In this case a map that is
       not defined is reported with ERC_EXISTENCE.
   Anything else is a syntax error.  Returns TRUE on success, FALSE or the
   result of gripe() otherwise.
*/
static int
process_usemap_declaration(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  ichar buf[MAXDECL];
  dtd_symbol *name;
  const ichar *s;
  dtd_symbol *ename;
  dtd_element *e;
  dtd_shortref *map;

  if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) )
    return FALSE;
  decl = buf;

  if ( !(s=itake_name(p, decl, &name)) )
  { if ( (s=isee_identifier(dtd, decl, "#empty")) )
      name = NULL;			/* NULL denotes the #empty map */
    else
      return gripe(p, ERC_SYNTAX_ERROR, L"map-name expected", decl);
  }
  decl = s;

  if ( !(map = find_map(dtd, name)) )
    map = def_shortref(p, name);	/* make undefined map */

  if ( isee_func(dtd, decl, CF_GRPO) )	/* ( */
  { dtd_model *model;

    if ( (model = make_model(p, decl, &s)) )
    { for_elements_in_model(model, set_map_element, map);
      free_model(model);
      decl = s;
    } else
      return FALSE;
  } else if ( (s=itake_name(p, decl, &ename)) )
  { e = find_element(dtd, ename);
    e->map = map;
    decl = s;
  } else if ( p->environments )
  { if ( !map->defined )
      gripe(p, ERC_EXISTENCE, L"map", name->name);

    p->environments->map = map;
    p->map = p->environments->map;
  } else
    return gripe(p, ERC_SYNTAX_ERROR, L"element-name expected", decl);

  if ( *decl )
    return gripe(p, ERC_SYNTAX_ERROR, L"Unparsed", decl);

  return TRUE;
}
/* Try to match `map` against the tail of the character buffer `buf`,
   comparing backwards from the last character of both.

   A CHR_BLANK in the map-string absorbs zero or more CH_WHITE characters;
   a CHR_DBLANK requires one such character and then absorbs any further
   ones.  White-space is never absorbed at the very first buffer position.

   Returns the number of buffer characters covered by the match, or 0 if
   the map does not match.
*/
static int
match_map(dtd *dtd, dtd_map *map, ocharbuf *buf)
{ wchar_t *data = buf->data.w;
  wchar_t *e = data+buf->size-1;	/* scans the buffer backwards */
  ichar *m = map->from+map->len-1;	/* scans the map-string backwards */

  for( ; m >= map->from; )
  { if ( e < data )			/* buffer exhausted first */
      return 0;

    if ( *m == *e )			/* literal match */
    { m--;
      e--;
    } else if ( *m == CHR_DBLANK || *m == CHR_BLANK )
    { if ( *m == CHR_DBLANK )		/* needs at least one blank */
      { if ( !(e>data && HasClass(dtd, *e, CH_WHITE)) )
          return 0;
        e--;
      }

      while( e>data && HasClass(dtd, *e, CH_WHITE) )
        e--;
      m--;
    } else
    { return 0;
    }
  }

  return (int)(data+buf->size-1-e);
}
/* Try each map of the parser's active shortref map (p->map) against the
   tail of the collected character data (p->cdata).  On the first map that
   matches, the matched characters are removed from p->cdata and the
   entity the map refers to is processed inside a WITH_CLASS(EV_SHORTREF)
   section.  Returns TRUE if a map matched, FALSE otherwise.
*/
static int
match_shortref(dtd_parser *p)
{ dtd_map *map;
  for(map = p->map->map; map; map = map->next)
  { int len;
    if ( (len=match_map(p->dtd, map, p->cdata)) )
    { p->cdata->size -= len;		/* drop the matched text */
      if ( p->cdata_must_be_empty )
      { /* recompute whether the remaining cdata is all white-space */
        int blank = TRUE;
        const wchar_t *s;
        int i;
        for(s = p->cdata->data.w, i=0; i++ < p->cdata->size; s++)
        { if ( !iswspace(*s) )
          { blank = FALSE;
            break;
          }
        }
        p->blank_cdata = blank;
      }
      WITH_CLASS(p, EV_SHORTREF,
      { sgml_cplocation(&p->startloc, &p->location);
        /* move the start location back over the matched characters */
        p->startloc.charpos -= len;
        p->startloc.linepos -= len;
        if ( p->startloc.linepos < 0 )
        { p->startloc.line--;
          p->startloc.linepos = 0; /* not correct: the real column on the
                                      previous line is not known here */
        }
        DEBUG(printf("%d-%d: Matched map '%s' --> %s, len = %d\n",
        p->startloc.charpos,
        p->location.charpos,
        map->from, map->to->name, len));
        process_entity(p, map->to->name);
      }) /* TBD: optimise */
      return TRUE;
    }
  }
  return FALSE;
}
/*******************************
* ELEMENTS *
*******************************/
/* Append `sub` to the end of the group (m->content.group) of model `m`. */
static void
add_submodel(dtd_model *m, dtd_model *sub)
{ dtd_model **tail = &m->content.group;

  while ( *tail )
    tail = &(*tail)->next;

  *tail = sub;
}
/* for_elements_in_model()
   Recursively visit model `m` and call f(element, closure) for every
   MT_ELEMENT node found.  Groups (MT_SEQ, MT_AND, MT_OR) are descended
   into; all other node types are ignored.
*/
static void
for_elements_in_model(dtd_model *m,
                      void (*f)(dtd_element *e, void *closure),
                      void *closure)
{ if ( m->type == MT_ELEMENT )
  { (*f)(m->content.element, closure);
    return;
  }

  if ( m->type == MT_SEQ || m->type == MT_AND || m->type == MT_OR )
  { dtd_model *child;

    for(child = m->content.group; child; child = child->next)
      for_elements_in_model(child, f, closure);
  }
}
/* Release model `m` and, for group nodes, all of its sub-models.

   Besides MT_SEQ, MT_AND and MT_OR, a node whose type is still MT_UNDEF is
   treated as a group: make_model() adds sub-models to a group before its
   connector type is known, and frees such a half-built group on syntax
   errors.  Models are zero-allocated, so content.group is NULL for an
   MT_UNDEF node that never received a sub-model.
*/
static void
free_model(dtd_model *m)
{ switch(m->type)
  { case MT_UNDEF:			/* group under construction */
    case MT_SEQ:
    case MT_AND:
    case MT_OR:
    { dtd_model *sub = m->content.group;
      dtd_model *next;

      for(; sub; sub = next)
      { next = sub->next;		/* fetch before sub is released */
        free_model(sub);
      }
      break;
    }
    default:
      break;
  }

  sgml_free(m);
}
static dtd_model *
2010-05-06 10:59:09 +01:00
make_model(dtd_parser *p, const ichar *decl, const ichar **end)
{ const ichar *s;
dtd_model *m = sgml_calloc(1, sizeof(*m));
dtd_symbol *id;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
decl = iskip_layout(dtd, decl);
if ( (s=isee_identifier(dtd, decl, "#pcdata")) )
{ m->type = MT_PCDATA;
m->cardinality = MC_ONE; /* actually don't care */
*end = s;
return m;
}
2010-05-06 10:59:09 +01:00
if ( (s=itake_name(p, decl, &id)) )
{ m->type = MT_ELEMENT;
m->content.element = find_element(dtd, id);
decl = s;
} else
{ if ( !(s=isee_func(dtd, decl, CF_GRPO)) )
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_SYNTAX_ERROR, L"Name group expected", decl);
free_model(m);
return NULL;
}
decl = s;
for(;;)
{ dtd_model *sub;
modeltype mt;
2010-05-06 10:59:09 +01:00
if ( !(sub = make_model(p, decl, &s)) )
{ free_model(sub);
return NULL;
}
decl = s;
add_submodel(m, sub);
2010-05-06 10:59:09 +01:00
if ( (s = isee_func(dtd, decl, CF_OR)) )
{ decl = s;
mt = MT_OR;
} else if ( (s = isee_func(dtd, decl, CF_SEQ)) )
{ decl = s;
mt = MT_SEQ;
} else if ( (s = isee_func(dtd, decl, CF_AND)) )
{ decl = s;
mt = MT_AND;
} else if ( (s = isee_func(dtd, decl, CF_GRPC)) )
{ decl = s;
break;
} else
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_SYNTAX_ERROR, L"Connector ('|', ',' or '&') expected", decl);
free_model(m);
return NULL;
}
decl = iskip_layout(dtd, decl);
if ( m->type != mt )
{ if ( !m->type )
m->type = mt;
else
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_SYNTAX_ERROR, L"Different connector types in model", decl);
free_model(m);
return NULL;
}
}
}
}
if ( (s = isee_func(dtd, decl, CF_OPT)) )
{ decl = s;
m->cardinality = MC_OPT;
} else if ( (s=isee_func(dtd, decl, CF_REP)) )
{ decl = s;
m->cardinality = MC_REP;
} else if ( (s=isee_func(dtd, decl, CF_PLUS)) )
{ /* ROK: watch out for (x) +(y) */
if ( isee_func(dtd, iskip_layout(dtd, s), CF_GRPO) == NULL )
{ decl = s;
m->cardinality = MC_PLUS;
}
} else
m->cardinality = MC_ONE;
2010-05-06 10:59:09 +01:00
if ( m->type == MT_UNDEF ) /* simplify (e+), etc. */
{ dtd_model *sub = m->content.group;
modelcard card;
assert(!sub->next);
if ( sub->cardinality == MC_ONE )
card = m->cardinality;
else if ( m->cardinality == MC_ONE )
card = sub->cardinality;
else
{ m->type = MT_OR;
goto out;
}
2010-05-06 10:59:09 +01:00
*m = *sub;
m->cardinality = card;
sgml_free(sub);
}
out:
*end = iskip_layout(dtd, decl);
return m;
}
static const ichar *
2010-05-06 10:59:09 +01:00
process_model(dtd_parser *p, dtd_edef *e, const ichar *decl)
{ const ichar *s;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
decl = iskip_layout(dtd, decl);
if ( (s = isee_identifier(dtd, decl, "empty")) )
{ e->type = C_EMPTY;
return s;
}
if ( (s = isee_identifier(dtd, decl, "cdata")) )
{ e->type = C_CDATA;
return s;
}
if ( (s = isee_identifier(dtd, decl, "rcdata")) )
{ e->type = C_RCDATA;
return s;
}
if ( (s = isee_identifier(dtd, decl, "any")) )
{ e->type = C_ANY;
return s;
}
2010-05-06 10:59:09 +01:00
e->type = C_PCDATA;
2010-05-06 10:59:09 +01:00
if ( !(e->content = make_model(p, decl, &decl)) )
return FALSE;
return decl;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
See a name-group separator at `decl`.

`*sep` is the separator expected so far.  While it is CF_NG (undecided),
any of CF_SEQ, CF_OR and CF_AND is accepted and the one found is stored in
`*sep`, so that all later separators of the group must be the same.

Returns the input position after the separator and following layout, or
NULL if no acceptable separator is present.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static const ichar *
isee_ngsep(dtd *dtd, const ichar *decl, charfunc *sep)
{ static const charfunc candidates[] = { CF_SEQ, CF_OR, CF_AND };
  const ichar *after = isee_func(dtd, decl, *sep);
  int k;

  if ( after )
    return iskip_layout(dtd, after);

  if ( *sep != CF_NG )			/* already decided: no alternatives */
    return NULL;

  for(k=0; k<3; k++)
  { after = isee_func(dtd, decl, candidates[k]);

    if ( after )
    { *sep = candidates[k];
      return iskip_layout(dtd, after);
    }
  }

  return NULL;
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_namegroup(dtd_parser *p, const ichar *decl,
dtd_symbol **names, int *n)
{ const ichar *s;
int en = 0;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
if ( (s=isee_func(dtd, decl, CF_GRPO)) )
{ charfunc ngs = CF_NG;
for(;;)
2010-05-06 10:59:09 +01:00
{ if ( !(decl=itake_name(p, s, &names[en++])) )
{ gripe(p, ERC_SYNTAX_ERROR, L"Name expected", s);
return NULL;
}
if ( (s=isee_ngsep(dtd, decl, &ngs)) )
{ decl = iskip_layout(dtd, s);
continue;
}
if ( (s=isee_func(dtd, decl, CF_GRPC)) )
{ *n = en;
decl = s;
return iskip_layout(dtd, decl);
}
2010-05-06 10:59:09 +01:00
gripe(p, ERC_SYNTAX_ERROR, L"Bad name-group", decl);
return NULL;
}
}
return NULL;
}
typedef struct
{ dtd_symbol **list;
int size;
} namelist;
static void
add_list_element(dtd_element *e, void *closure)
{ namelist *nl = closure;
nl->list[nl->size++] = e->name;
}
static const ichar *
2010-05-06 10:59:09 +01:00
itake_el_or_model_element_list(dtd_parser *p,
const ichar *decl, dtd_symbol **names, int *n)
{ const ichar *s;
2010-05-06 10:59:09 +01:00
dtd *dtd = p->dtd;
if ( isee_func(dtd, decl, CF_GRPO) )
{ dtd_model *model;
2010-05-06 10:59:09 +01:00
if ( (model = make_model(p, decl, &s)) )
{ namelist nl;
2010-05-06 10:59:09 +01:00
nl.list = names;
nl.size = 0;
for_elements_in_model(model, add_list_element, &nl);
free_model(model);
*n = nl.size;
return s;
} else
return NULL;
} else
2010-05-06 10:59:09 +01:00
{ if ( !(s = itake_name(p, decl, &names[0])) )
{ gripe(p, ERC_SYNTAX_ERROR, L"Name expected", decl);
return NULL;
}
*n = 1;
return s;
}
}
/* Append a new cell holding element `e` to the end of the element list
   whose head pointer is `*l`. */
static void
add_element_list(dtd_element_list **l, dtd_element *e)
{ dtd_element_list *cell = sgml_calloc(1, sizeof(*cell));

  cell->value = e;

  while ( *l )
    l = &(*l)->next;

  *l = cell;
}
/* Process the body of an ELEMENT declaration:

	(name | model-group) [omit-open omit-close] content [(+|-)(names)]

   Parameter entities are expanded first.  One new element definition is
   created and shared by all named elements (def->references counts them).
   The optional tag-omission flags are each `-` or `o`.  The content is
   parsed by process_model().  A trailing `-(...)` or `+(...)` name group
   is added to the definition's excluded or included element list.

   Returns TRUE on success (also when the name list is empty), FALSE or the
   result of gripe() on error.
*/
static int
process_element_declaraction(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  ichar buf[MAXDECL];
  const ichar *s;
  dtd_symbol *eid[MAXATTELEM];
  dtd_edef *def;
  int en;
  int i;
  int omit_seen = FALSE;		/* saw the first omission flag */

					/* expand parameter entities */
  if ( !expand_pentities(p, decl, ZERO_TERM_LEN,
                         buf, sizeof(buf)/sizeof(ichar)) )
    return FALSE;
  decl = buf;

  if ( !(s=itake_el_or_model_element_list(p, decl, eid, &en)) )
    return gripe(p, ERC_SYNTAX_ERROR, L"Name or name-group expected", decl);
  decl = s;
  if ( en == 0 )
    return TRUE;			/* 0 elements */

  STAT(edefs_decl++);
  def = new_element_definition(dtd);
  for(i=0; i<en; i++)
  { find_element(dtd, eid[i]);
    assert(eid[i]->element->structure == NULL);
    eid[i]->element->structure = def;
    eid[i]->element->undefined = FALSE;
  }
  def->references = en;			/* for GC */

					/* omitted tag declarations (opt) */
  if ( (s = isee_identifier(dtd, decl, "-")) )
  { def->omit_open = FALSE;		/* first flag is about the open tag */
    def->omit_close = FALSE;
    omit_seen = TRUE;
  } else if ( (s = isee_identifier(dtd, decl, "o")) )
  { def->omit_open = TRUE;
    omit_seen = TRUE;
  }

  if ( omit_seen )			/* second flag is then mandatory */
  { decl = s;
    if ( (s = isee_identifier(dtd, decl, "-")) )
      def->omit_close = FALSE;
    else if ( (s = isee_identifier(dtd, decl, "o")) )
      def->omit_close = TRUE;
    else
      return gripe(p, ERC_SYNTAX_ERROR, L"Bad omit-tag declaration", decl);
    decl = s;
  }

					/* content model */
  if ( !(decl=process_model(p, def, decl)) )
    return FALSE;

					/* in/excluded elements */
  if ( decl[0] == '-' || decl[0] == '+' )
  { dtd_symbol *ng[MAXNAMEGROUP];
    int ns;
    dtd_element_list **l;

    if ( decl[0] == '-' )
      l = &def->excluded;
    else
      l = &def->included;
    decl++;

    if ( (s=itake_namegroup(p, decl, ng, &ns)) )
    { decl = s;
      for(i=0; i<ns; i++)
        add_element_list(l, find_element(dtd, ng[i]));
    } else
    { return gripe(p, ERC_SYNTAX_ERROR, L"Name group expected", decl);
    }
  }

  if (*decl)
    return gripe(p, ERC_SYNTAX_ERROR, L"Unexpected end of declaration", decl);

  return TRUE;
}
/* Append a new cell holding symbol `s` to the end of the name list whose
   head pointer is `*nl`. */
static void
add_name_list(dtd_name_list **nl, dtd_symbol *s)
{ dtd_name_list *n = sgml_calloc(1, sizeof(*n));

  n->value = s;

  for( ; *nl; nl = &(*nl)->next )	/* find the tail link */
    ;

  *nl = n;
}
/* Derive element properties from a newly added attribute.  Currently only
   `xml:space` is handled: if the attribute has a fixed or default value,
   that value is converted with istr_to_space_mode() and stored as the
   element's space_mode.  Attributes of other names, other default kinds or
   other types leave the element untouched. */
static void
set_element_properties(dtd_element *e, dtd_attr *a)
{ if ( !istreq(a->name->name, L"xml:space") )
    return;

  if ( a->def != AT_FIXED && a->def != AT_DEFAULT )
    return;				/* no value in the declaration */

  if ( a->type == AT_NAMEOF ||
       a->type == AT_NAME ||
       a->type == AT_NMTOKEN )
  { e->space_mode = istr_to_space_mode(a->att_def.name->name);
  } else if ( a->type == AT_CDATA )
  { e->space_mode = istr_to_space_mode((ichar *)a->att_def.cdata);
  }
}
static void
2010-05-06 10:59:09 +01:00
add_attribute(dtd_parser *p, dtd_element *e, dtd_attr *a)
{ dtd_attr_list **l;
dtd_attr_list *n;
for(l = &e->attributes; *l; l = &(*l)->next)
{ if ( (*l)->attribute->name == a->name )
2010-05-06 10:59:09 +01:00
{ gripe(p, ERC_REDEFINED, L"attribute", a->name);
a->references++; /* attempt to redefine attribute: */
free_attribute(a); /* first wins according to standard */
return;
}
}
n = sgml_calloc(1, sizeof(*n));
n->attribute = a;
a->references++;
*l = n;
set_element_properties(e, a);
}
/* Process the body of an ATTLIST declaration:

	(element | model-group) { name type default }

   Parameter entities are expanded first.  For every attribute the name,
   the declared type (a name group, one of the type keywords, or NOTATION
   followed by a name group) and the default specification (#fixed,
   #required, #current, #conref, #implied or a literal default) are
   parsed.  A literal default (also required after #fixed) is either a
   quoted string or a bare name-token and is checked against the declared
   type.  Each completed attribute is added to all listed elements with
   add_attribute().

   On any error the attribute under construction is released before the
   error is reported.  Returns TRUE on success, FALSE or the result of
   gripe() on error.
*/
static int
process_attlist_declaraction(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  dtd_symbol *eid[MAXATTELEM];
  int i, en;
  ichar buf[MAXDECL];
  const ichar *s;

					/* expand parameter entities */
  if ( !expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) )
    return FALSE;
  decl = iskip_layout(dtd, buf);
  DEBUG(printf("Expanded to %s\n", decl));

  if ( !(decl=itake_el_or_model_element_list(p, decl, eid, &en)) )
    return FALSE;

					/* fetch attributes */
  while(*decl)
  { dtd_attr *at = sgml_calloc(1, sizeof(*at));

    at->references = REFS_VIRGIN;

					/* name of attribute */
    if ( !(s = itake_name(p, decl, &at->name)) )
    { free_attribute(at);
      return gripe(p, ERC_SYNTAX_ERROR, L"Name expected", decl);
    }
    decl = s;

					/* (name1|name2|...) type */
    if ( (s=isee_func(dtd, decl, CF_GRPO)) )
    { charfunc ngs = CF_NG;

      at->type = AT_NAMEOF;
      decl=s;

      for(;;)
      { dtd_symbol *nm;

        if ( !(s = itake_nmtoken(p, decl, &nm)) )
        { free_attribute(at);
          return gripe(p, ERC_SYNTAX_ERROR, L"Name expected", decl);
        }
        decl = s;
        add_name_list(&at->typeex.nameof, nm);
        if ( (s=isee_ngsep(dtd, decl, &ngs)) )
        { decl = s;
          continue;
        }
        if ( (s = isee_func(dtd, decl, CF_GRPC)) )
        { decl=s;
          decl = iskip_layout(dtd, decl);
          break;
        }
        free_attribute(at);
        return gripe(p, ERC_SYNTAX_ERROR, L"Illegal name-group", decl);
      }
    } else if ( (s=isee_identifier(dtd, decl, "cdata")) )
    { decl = s;
      at->type = AT_CDATA;
    } else if ( (s=isee_identifier(dtd, decl, "entity")) )
    { decl = s;
      at->type = AT_ENTITY;
    } else if ( (s=isee_identifier(dtd, decl, "entities")) )
    { decl = s;
      at->type = AT_ENTITIES;
      at->islist = TRUE;
    } else if ( (s=isee_identifier(dtd, decl, "id")) )
    { decl = s;
      at->type = AT_ID;
    } else if ( (s=isee_identifier(dtd, decl, "idref")) )
    { decl = s;
      at->type = AT_IDREF;
    } else if ( (s=isee_identifier(dtd, decl, "idrefs")) )
    { decl = s;
      at->type = AT_IDREFS;
      at->islist = TRUE;
    } else if ( (s=isee_identifier(dtd, decl, "name")) )
    { decl = s;
      at->type = AT_NAME;
    } else if ( (s=isee_identifier(dtd, decl, "names")) )
    { decl = s;
      at->type = AT_NAMES;
      at->islist = TRUE;
    } else if ( (s=isee_identifier(dtd, decl, "nmtoken")) )
    { decl = s;
      at->type = AT_NMTOKEN;
    } else if ( (s=isee_identifier(dtd, decl, "nmtokens")) )
    { decl = s;
      at->type = AT_NMTOKENS;
      at->islist = TRUE;
    } else if ( (s=isee_identifier(dtd, decl, "number")) )
    { decl = s;
      at->type = AT_NUMBER;
    } else if ( (s=isee_identifier(dtd, decl, "numbers")) )
    { decl = s;
      at->type = AT_NUMBERS;
      at->islist = TRUE;
    } else if ( (s=isee_identifier(dtd, decl, "nutoken")) )
    { decl = s;
      at->type = AT_NUTOKEN;
    } else if ( (s=isee_identifier(dtd, decl, "nutokens")) )
    { decl = s;
      at->type = AT_NUTOKENS;
      at->islist = TRUE;
    } else if ( (s=isee_identifier(dtd, decl, "notation")) )
    { dtd_symbol *ng[MAXNAMEGROUP];
      int ns;

      at->type = AT_NOTATION;
      decl=s;
      if ( (s=itake_namegroup(p, decl, ng, &ns)) )
      { decl = s;

        for(i=0; i<ns; i++)
          add_name_list(&at->typeex.nameof, ng[i]);
      } else
      { free_attribute(at);
        return gripe(p, ERC_SYNTAX_ERROR, L"name-group expected", decl);
      }
    } else
    { free_attribute(at);
      return gripe(p, ERC_SYNTAX_ERROR, L"Attribute-type expected", decl);
    }

					/* Attribute Defaults */
    if ( (s=isee_identifier(dtd, decl, "#fixed")) )
    { decl = s;
      at->def = AT_FIXED;
    } else if ( (s=isee_identifier(dtd, decl, "#required")) )
    { decl = s;
      at->def = AT_REQUIRED;
    } else if ( (s=isee_identifier(dtd, decl, "#current")) )
    { decl = s;
      at->def = AT_CURRENT;
    } else if ( (s=isee_identifier(dtd, decl, "#conref")) )
    { decl = s;
      at->def = AT_CONREF;
    } else if ( (s=isee_identifier(dtd, decl, "#implied")) )
    { decl = s;
      at->def = AT_IMPLIED;
    } else				/* real default */
      at->def = AT_DEFAULT;

    if ( at->def == AT_DEFAULT || at->def == AT_FIXED )
    { ichar defbuf[MAXSTRINGLEN];	/* holds an unquoted default */
      ichar *start; int len;
      const ichar *end;
      const wchar_t *domain = NULL;	/* set if default is ill-typed */

      if ( !(end=itake_string(dtd, decl, &start, &len)) )
      { end=itake_nmtoken_chars(p, decl, defbuf, sizeof(defbuf)/sizeof(ichar));
        if ( !end )			/* defbuf holds nothing usable */
        { free_attribute(at);
          return gripe(p, ERC_SYNTAX_ERROR, L"Bad attribute default", decl);
        }
        start = defbuf;
        len = (int)istrlen(defbuf);
      }

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Note: itake_name(), etc. work on nul-terminated strings.  The result of
itake_string() is a pointer in a nul-terminated string and these
functions will stop scanning at the quote anyway, so we can use the
length of the parsed data to verify we parsed all of it.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

      switch(at->type)
      { case AT_CDATA:
        { at->att_def.cdata = istrndup(start, len);
          break;
        }
        case AT_ENTITY:
        case AT_NOTATION:
        case AT_NAME:
        { if ( !(s=itake_name(p, start, &at->att_def.name)) ||
               (s-start) != len )
            domain = L"name";
          break;
        }
        case AT_NMTOKEN:
        case AT_NAMEOF:
        { if ( !(s=itake_nmtoken(p, start, &at->att_def.name)) ||
               (s-start) != len )
            domain = L"nmtoken";
          break;
        }
        case AT_NUTOKEN:
        { if ( !(s=itake_nutoken(p, start, &at->att_def.name)) ||
               (s-start) != len )
            domain = L"nutoken";
          break;
        }
        case AT_NUMBER:
        { if ( !(s=itake_number(p, start, at)) ||
               (s-start) != len )
            domain = L"number";
          break;
        }
        case AT_NAMES:
        case AT_ENTITIES:
        case AT_IDREFS:
        case AT_NMTOKENS:
        case AT_NUMBERS:
        case AT_NUTOKENS:
        { /* copy from `start`: for a quoted default the text is not in
             defbuf at all */
          at->att_def.list = istrndup(start, len);
          break;
        }
        default:
        { free_attribute(at);
          return gripe(p, ERC_REPRESENTATION, L"No default for type");
        }
      }

      if ( domain )			/* default does not fit the type */
      { free_attribute(at);
        return gripe(p, ERC_DOMAIN, domain, decl);
      }

      decl = end;
    }

					/* add to list */
    at->references = 0;
    for(i=0; i<en; i++)
    { dtd_element *e = def_element(dtd, eid[i]);

      add_attribute(p, e, at);
    }
  }

  return TRUE;
}
/*******************************
* GENERIC TAG PROCESSING *
*******************************/
/* Result of in_or_excluded(): how an element relates to the inclusion and
   exclusion lists of the currently open elements. */
typedef enum
{ IE_NORMAL,				/* on neither list */
  IE_INCLUDED,				/* is included */
  IE_EXCLUDED				/* is excluded */
} includetype;
/* Determine whether element `e` is on an exclusion or inclusion list of
   `env` or any of its ancestors.  Environments are inspected from the
   innermost outwards and, within one environment, exclusions are checked
   before inclusions; the first hit decides.  Returns IE_NORMAL when `e` is
   on none of the lists. */
static includetype
in_or_excluded(sgml_environment *env, dtd_element *e)
{ while ( env )
  { dtd_edef *def = env->element->structure;

    if ( def )
    { dtd_element_list *cell;

      for(cell = def->excluded; cell; cell = cell->next)
      { if ( cell->value == e )
          return IE_EXCLUDED;
      }
      for(cell = def->included; cell; cell = cell->next)
      { if ( cell->value == e )
          return IE_INCLUDED;
      }
    }

    env = env->parent;
  }

  return IE_NORMAL;
}
/* TRUE if the content of the element of `env` may end here.  Elements
   without a definition, elements flagged undefined and elements of type
   C_ANY are always complete; otherwise the current state of the
   environment must be the same as the final state of the definition. */
static int
complete(sgml_environment *env)
{ dtd_edef *def = env->element->structure;

  if ( !def || env->element->undefined || def->type == C_ANY )
    return TRUE;

  if ( same_state(def->final_state, env->state) )
    return TRUE;

  return FALSE;
}
static void
2010-05-06 10:59:09 +01:00
validate_completeness(dtd_parser *p, sgml_environment *env)
{ if ( !complete(env) )
{ wchar_t buf[MAXNMLEN+50];
swprintf(buf, MAXNMLEN+50, L"Incomplete element: <%s>",
env->element->name->name);
2010-05-06 10:59:09 +01:00
gripe(p, ERC_VALIDATE, buf); /* TBD: expected */
}
}
/* Open element `e` by pushing a new environment on p->environments.

   Nothing is pushed when `e` is CDATA_ELEMENT.  Otherwise pending
   character data is flushed with emit_cdata(), a new environment is
   created with a fresh state engine for `e`, and the space mode, shorttag
   (NET) bookkeeping and active shortref map are set up.  If `callback` is
   true and an on_begin_element handler is installed, it is called with
   the default attributes of `e` (unless SGML_PARSER_NODEFS is set, in
   which case it receives no attributes).  Finally, for elements declared
   CDATA or RCDATA the parser switches to the matching state and remembers
   the end-tag name to look for.

   Returns the environment that is on top of the stack afterwards.
*/
static sgml_environment *
push_element(dtd_parser *p, dtd_element *e, int callback)
{ if ( e != CDATA_ELEMENT )
  { sgml_environment *env = sgml_calloc(1, sizeof(*env));
    emit_cdata(p, FALSE);
    env->element = e;
    env->state = make_state_engine(e);
    /* inherit the space mode from the parent, or from the DTD at top level */
    env->space_mode = (p->environments ? p->environments->space_mode
    : p->dtd->space_mode);
    env->parent = p->environments;
    p->environments = env;
    if ( p->dtd->shorttag )
    { /* remember the old value so it can be restored when popping */
      env->saved_waiting_for_net = p->waiting_for_net;
      if ( p->event_class == EV_SHORTTAG )
      { p->waiting_for_net = TRUE;
        env->wants_net = TRUE;
      } else
      { env->wants_net = FALSE;
        if ( e->structure && e->structure->omit_close == FALSE )
          p->waiting_for_net = FALSE;
      }
    }
    /* the element's own map wins; otherwise inherit the parent's */
    if ( e->map )
      p->map = env->map = e->map;
    else if ( env->parent )
      p->map = env->map = env->parent->map;
    p->first = TRUE;
    if ( callback && p->on_begin_element )
    { sgml_attribute atts[MAXATTRIBUTES];
      int natts = 0;
      if ( !(p->flags & SGML_PARSER_NODEFS) )
        natts = add_default_attributes(p, e, natts, atts);
      (*p->on_begin_element)(p, e, natts, atts);
    }
    if ( e->structure )
    { if ( e->structure->type == C_CDATA ||
      e->structure->type == C_RCDATA )
      { p->state = (e->structure->type == C_CDATA ? S_CDATA : S_RCDATA);
        p->cdata_state = p->state;
        /* end tag that terminates the (R)CDATA content */
        p->etag = e->name->name;
        p->etaglen = (int)istrlen(p->etag);
        sgml_cplocation(&p->startcdata, &p->location);
      } else
        p->cdata_state = S_PCDATA;
    }
  }
  return p->environments;
}
/* Release environment `env`.  When built with XMLNS, the namespace
   declarations attached to the environment are released as well. */
static void
free_environment(sgml_environment *env)
{
#ifdef XMLNS
  if ( env->xmlns )
    xmlns_free(env->xmlns);
#endif

  sgml_free(env);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Pop the stack, closing all environments up to (but not including) `to'.
The close was initiated by pushing the element `e0'.

For every environment closed this way the content is validated for
completeness, ERC_OMITTED_CLOSE is reported if the element's definition
does not allow omitting the close tag, pending character data is emitted
(unless e0 is CDATA_ELEMENT) and the on_end_element handler is called
inside a WITH_CLASS(EV_OMITTED) section.  Afterwards `to' is the current
environment and its map the active shortref map.  Always returns TRUE.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
static int
pop_to(dtd_parser *p, sgml_environment *to, dtd_element *e0)
{ sgml_environment *env, *parent;

  for(env = p->environments; env != to; env=parent)
  { dtd_element *e = env->element;

    validate_completeness(p, env);
    parent = env->parent;		/* fetch before env is released */

    if ( e->structure && !e->structure->omit_close )
      gripe(p, ERC_OMITTED_CLOSE, e->name->name);

    if ( e0 != CDATA_ELEMENT )
      emit_cdata(p, TRUE);

    p->first = FALSE;
    p->environments = env;		/* current env during the callback */
    if ( p->dtd->shorttag )
      p->waiting_for_net = env->saved_waiting_for_net;

    WITH_CLASS(p, EV_OMITTED,
               if ( p->on_end_element )
                 (*p->on_end_element)(p, e));
    free_environment(env);
  }
  p->environments = to;
  p->map = to->map;

  return TRUE;
}
/* Extend the content model of element `in` so that it accepts `e`
   (CDATA_ELEMENT stands for #PCDATA).

   A definition of type C_EMPTY is first turned into C_PCDATA with an
   empty, repeatable MT_OR group.  The model must be an MT_OR group; if it
   does not already contain an alternative for `e`, one is appended.
*/
static void
allow_for(dtd_element *in, dtd_element *e)
{ dtd_edef *def = in->structure;
  dtd_model *alt;
  dtd_model *fresh;

  if ( def->type == C_EMPTY )
  { def->type = C_PCDATA;
    def->content = sgml_calloc(1, sizeof(*def->content));
    def->content->type = MT_OR;
    def->content->cardinality = MC_REP;
  }
  assert(def->content->type == MT_OR);

					/* already an alternative? */
  for(alt = def->content->content.group; alt; alt = alt->next)
  { if ( e == CDATA_ELEMENT )
    { if ( alt->type == MT_PCDATA )
        return;
    } else if ( alt->type == MT_ELEMENT && alt->content.element == e )
    { return;
    }
  }

  fresh = sgml_calloc(1, sizeof(*fresh));
  if ( e == CDATA_ELEMENT )
  { fresh->type = MT_PCDATA;
    fresh->cardinality = MC_ONE;	/* ignored */
  } else
  { fresh->type = MT_ELEMENT;
    fresh->cardinality = MC_ONE;	/* ignored */
    fresh->content.element = e;
  }
  add_submodel(def->content, fresh);
}
/* Open element `e` (CDATA_ELEMENT for character data) in the current
   context, opening and closing other elements as tag omission requires.

   1. With no element open and an enforced outer element that differs
      from `e`, that outer element is opened first as an omitted tag.
   2. With no element open and no doctype known, the catalogue is searched
      for a DTD named after `e`; if one is found it is loaded.
   3. With an element open:
      - if the current element is undefined, its model is extended to
        allow `e` and `e` is pushed;
      - if the current element has content ANY, `e` is pushed (an
        undefined `e` is reported);
      - if `e` is an included element, it is pushed;
      - otherwise the environments are searched from the innermost
        outwards for one whose state accepts `e`, either directly or after
        opening a path of omitted elements.  Environments on the way are
        closed with pop_to().  The search stops at the first environment
        whose close tag may not be omitted.
   4. If nothing applies, `e` is pushed anyway when `warn` is true (after
      reporting why it is not allowed, if an element is open); otherwise
      nothing is done.

   Returns TRUE if `e` was opened, FALSE otherwise (only when !warn).
*/
static int
open_element(dtd_parser *p, dtd_element *e, int warn)
{ if ( !p->environments && p->enforce_outer_element )
  { dtd_element *f = p->enforce_outer_element->element;

    if ( f && f != e )
    { if ( !f->structure ||
           !f->structure->omit_open )
        gripe(p, ERC_OMITTED_OPEN, f->name->name);

      WITH_CLASS(p, EV_OMITTED,
                 { open_element(p, f, TRUE);
                   if ( p->on_begin_element )
                   { sgml_attribute atts[MAXATTRIBUTES];
                     int natts = 0;

                     if ( !(p->flags & SGML_PARSER_NODEFS) )
                       natts = add_default_attributes(p, f, natts, atts);

                     (*p->on_begin_element)(p, f, natts, atts);
                   }
                 });
    }
  }

					/* no DTD available yet */
  if ( !p->environments && !p->dtd->doctype && e != CDATA_ELEMENT )
  { const ichar *file;

    file = find_in_catalogue(CAT_DOCTYPE, e->name->name, NULL, NULL,
                             p->dtd->dialect != DL_SGML);
    if ( file )
    { dtd_parser *clone = clone_dtd_parser(p);

      gripe(p, ERC_NO_DOCTYPE, e->name->name, file);

      if ( load_dtd_from_file(clone, file) )
        p->dtd->doctype = istrdup(e->name->name);
      else
        gripe(p, ERC_EXISTENCE, L"file", file);

      free_dtd_parser(clone);
    }
  }

  if ( p->environments )
  { sgml_environment *env = p->environments;

    if ( env->element->undefined )
    { allow_for(env->element, e);	/* <!ELEMENT x - - (model) +(y)> */
      push_element(p, e, FALSE);
      return TRUE;
    }

    if ( env->element->structure &&
         env->element->structure->type == C_ANY )
    { if ( e != CDATA_ELEMENT && e->undefined )
        gripe(p, ERC_EXISTENCE, L"Element", e->name->name);
      push_element(p, e, FALSE);
      return TRUE;
    }

    switch(in_or_excluded(env, e))
    { case IE_INCLUDED:
        push_element(p, e, FALSE);
        return TRUE;
      case IE_EXCLUDED:
        if ( warn )
          gripe(p, ERC_NOT_ALLOWED, e->name->name);
        /*FALLTHROUGH*/
      case IE_NORMAL:
        for(; env; env=env->parent)
        { dtd_state *new;

          if ( (new = make_dtd_transition(env->state, e)) )
          { env->state = new;		/* env accepts e directly */
            pop_to(p, env, e);
            push_element(p, e, FALSE);
            return TRUE;
          } else
          { dtd_element *oe[MAXOMITTED]; /* omitted open */
            int olen;
            int i;

            if ( (olen=find_omitted_path(env->state, e, oe)) > 0 )
            { pop_to(p, env, e);
              /* open the omitted elements leading to e, one by one */
              WITH_CLASS(p, EV_OMITTED,
                         for(i=0; i<olen; i++)
                         { env->state = make_dtd_transition(env->state, oe[i]);
                           env = push_element(p, oe[i], TRUE);
                         })
              env->state = make_dtd_transition(env->state, e);
              push_element(p, e, FALSE);
              return TRUE;
            }
          }

          if ( !env->element->structure ||
               !env->element->structure->omit_close )
            break;			/* may not close env implicitly */
        }
    }

    if ( warn )
    { if ( e == CDATA_ELEMENT )
        gripe(p, ERC_VALIDATE, L"#PCDATA not allowed here");
      else if ( e->undefined )
        gripe(p, ERC_EXISTENCE, L"Element", e->name->name);
      else
        gripe(p, ERC_NOT_ALLOWED, e->name->name);
    }
  }

  if ( warn )
  { push_element(p, e, FALSE);
    return TRUE;
  } else
    return FALSE;
}
/* Close element `e`.

   If `e` is open, all environments from the innermost one down to and
   including the one for `e` are closed: each is validated for completeness
   (except the innermost one when `conref` is true), the on_end_element
   handler is called and the environment is released.  Elements closed on
   the way whose definition does not allow omitting the close tag are
   reported with ERC_OMITTED_CLOSE.  After closing `e` the shortref map of
   its parent (or NULL) becomes active and TRUE is returned.

   If `e` is not open, ERC_NOT_OPEN is reported and the result of gripe()
   is returned.
*/
static int
close_element(dtd_parser *p, dtd_element *e, int conref)
{ sgml_environment *env;

  for(env = p->environments; env; env=env->parent)
  { if ( env->element == e )		/* element is open */
    { sgml_environment *parent;

      for(env = p->environments; ; env=parent)
      { dtd_element *ce = env->element;

        if ( !(conref && env == p->environments) )
          validate_completeness(p, env);
        parent = env->parent;		/* fetch before env is released */

        p->first = FALSE;
        if ( p->on_end_element )
          (*p->on_end_element)(p, env->element);
        free_environment(env);
        p->environments = parent;

        if ( ce == e )			/* closing current element */
        { p->map = (parent ? parent->map : NULL);
          return TRUE;
        } else				/* omitted close */
        { if ( ce->structure && !ce->structure->omit_close )
            gripe(p, ERC_OMITTED_CLOSE, ce->name->name);
        }
      }
    }
  }

  return gripe(p, ERC_NOT_OPEN, e->name->name);
}
/* Close the innermost open element after emitting pending character data.
   Reports a syntax error and returns the result of gripe() when no
   element is open. */
static int
close_current_element(dtd_parser *p)
{ if ( p->environments )
  { dtd_element *e = p->environments->element;

    emit_cdata(p, TRUE);
    return close_element(p, e, FALSE);
  }

  /* the "found" argument is a wide string like in all other calls */
  return gripe(p, ERC_SYNTAX_ERROR, L"No element to close", L"");
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
get_attribute_value()
Get the value for an attribute. Once I thought this was simple, but
Richard O'Keefe pointed to the complex handling of white-space in SGML
attributes. Basically, if the attribute is quoted, we need:
* If CDATA, map all blank to space characters, then expand
entities
* If !CDATA expand all entities, canonise white space by
deleting leading and trailing space and squishing multiple
space characters to a single (lower for us) case.
This almost, but not completely matches the XML definition. This however
is so complex we will ignore it for now.
[Rewritten by Richard O'Keefe with these additional comments]
Reads a value, the attribute name and value indicator having been
processed already. It calls itake_string() to read quoted values, and
itake_unquoted() to read unquoted values.
itake_string(dtd, in, buf, size)
- skips layout INCLUDING comments,
- returns NULL if the next character is not ' or ",
- copies characters from in to buf until a matching ' or " is found,
- adds a terminating \0,
- skips more layout INCLUDING comments, and
- returns the new input position.
It is quite wrong to skip leading comments here. In the tag
<foo bar = --ugh-- zoo>
the characters "--ugh--" *are the value*. They are not a comment.
Comments are not in fact allowed inside tags, unfortunately.
This tag is equivalent to
<foo bar="--ugh--" something="zoo">
where something is an attribute that has zoo as one of its enumerals.
Because itake_string() is called in many other places, this bug has
not yet been fixed.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Parse one attribute value at `decl` and store the result in `att`.

   p	parser; supplies the DTD and is the target of diagnostics
   decl	input, positioned just after the value indicator
   att	attribute slot; att->definition must already be set.  On
	return att->value.textW/number hold the value and att->flags
	is cleared.

   Returns the input position after the value, or NULL if no value
   could be read (or the attribute type is unknown).  Type mismatches
   are reported through gripe() as warnings and do not make the
   function fail.
*/
static ichar const *
get_attribute_value(dtd_parser *p, ichar const *decl, sgml_attribute *att)
{ ichar tmp[MAXSTRINGLEN];
  ichar *buf = tmp;
  ichar const *s;
  ichar c;
  dtd *dtd = p->dtd;
  ichar const *end;
  ichar *start;
  int len;
  ocharbuf out;				/* function scope: `buf' may point */
					/* into it until we return */
  enum
  { DIG_FIRST = 8,			/* any token start with digit? */
    NAM_FIRST = 4,			/* any token start with non-digit name char? */
    NAM_LATER = 2,			/* any token have non-digit name char later? */
    ANY_OTHER = 1,			/* any token have illegal character? */
    YET_EMPTY = 0
  } token = YET_EMPTY;

  att->value.textW  = NULL;		/* UCS text */
  att->value.number = 0;
  att->flags        = 0;

  end = itake_string(dtd, decl, &start, &len);
  if ( end != NULL )			/* quoted value */
  { init_ocharbuf(&out);
    expand_entities(p, start, len, &out);

    if ( att->definition->type == AT_CDATA )
    { malloc_ocharbuf(&out);
      att->value.number = out.size;
      att->value.textW  = out.data.w;

      return end;
    } else
    { ichar *d;

      /* NOTE(review): `out' is never released on this path; confirm */
      /* against the ocharbuf API whether long values leak here. */
      buf = out.data.w;

					/* canonicalise blanks */
      s = buf;
      while ((c = *s++) != '\0' && HasClass(dtd, c, CH_BLANK))
	;				/* skip leading blanks */
      d = buf;				/* rewrite the buffer in place */
      while ( c != '\0' )
      { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST
	       : HasClass(dtd, c, CH_NAME) ? NAM_FIRST : /* oops! */ ANY_OTHER;
	if ( d != buf )			/* single blank between tokens */
	  *d++ = ' ';
	if ( dtd->case_sensitive )
	{ *d++ = c;
	  while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK))
	  { token |= HasClass(dtd, c, CH_DIGIT) ? 0
		   : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER;
	    *d++ = c;
	  }
	} else
	{ *d++ = towlower(c);
	  while ((c = *s++) != '\0' && !HasClass(dtd, c, CH_BLANK))
	  { token |= HasClass(dtd, c, CH_DIGIT) ? 0
		   : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER;
	    *d++ = towlower(c);
	  }
	}
	while (c != '\0' && HasClass(dtd, c, CH_BLANK))
	  c = *s++;			/* skip blanks after the token */
      }
      *d = '\0';
    }
  } else				/* unquoted value */
  { end = itake_unquoted(p, decl, tmp, sizeof(tmp)/sizeof(ichar));
    if (end == NULL)
      return NULL;

    s = buf;
    c = *s++;
    if (c != '\0')
    { token |= HasClass(dtd, c, CH_DIGIT) ? DIG_FIRST
	     : HasClass(dtd, c, CH_NAME) ? NAM_FIRST : /* oops! */ ANY_OTHER;
      while ((c = *s++) != 0)
      { token |= HasClass(dtd, c, CH_DIGIT) ? 0
	       : HasClass(dtd, c, CH_NAME) ? NAM_LATER : /* oops! */ ANY_OTHER;
      }
    }
    if ( token == YET_EMPTY || (token & ANY_OTHER) != 0)
      gripe(p, ERC_SYNTAX_WARNING, L"Attribute value requires quotes", buf);

    if (!dtd->case_sensitive && att->definition->type != AT_CDATA)
      istrlower(buf);
  }

  /* `buf' now holds the canonical text and `token' summarises the */
  /* character classes seen; validate against the declared type. */
  switch (att->definition->type)
  { case AT_NUMBER:			/* number */
      if (token != DIG_FIRST)
      { gripe(p, ERC_SYNTAX_WARNING, L"NUMBER expected", decl);
      } else if (dtd->number_mode == NU_INTEGER)
      { (void) istrtol(buf, &att->value.number);
      } else
      { att->value.textW  = istrdup(buf);
	att->value.number = (long)istrlen(buf);
      }
      return end;
    case AT_CDATA:			/* CDATA attribute */
      att->value.textW  = istrdup(buf);
      att->value.number = (long)istrlen(buf);
      return end;
    case AT_ID:				/* identifier */
    case AT_IDREF:			/* identifier reference */
    case AT_NAME:			/* name token */
    case AT_NOTATION:			/* notation-name */
      if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"NAME expected", decl);
      break;
    case AT_NAMEOF:			/* one of these names */
    case AT_NMTOKEN:			/* name-token */
      if (token == YET_EMPTY || (token & ANY_OTHER) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"NMTOKEN expected", decl);
      if ( att->definition->type == AT_NAMEOF )
      { dtd_name_list *nl;

	for(nl=att->definition->typeex.nameof; nl; nl = nl->next)
	{ if ( istreq(nl->value->name, buf) )
	    goto passed;
	}
	gripe(p, ERC_SYNTAX_WARNING, L"unexpected value", decl);
      }
      break;
    case AT_NUTOKEN:			/* number token */
      if ((token & (NAM_FIRST | ANY_OTHER)) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"NUTOKEN expected", decl);
      break;
    case AT_ENTITY:			/* entity-name */
      if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"entity NAME expected", decl);
      break;
    case AT_NAMES:			/* list of names */
    case AT_IDREFS:			/* list of identifier references */
      if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"NAMES expected", decl);
      break;
    case AT_ENTITIES:			/* entity-name list */
      if (token == YET_EMPTY || (token & (DIG_FIRST | ANY_OTHER)) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"entity NAMES expected", decl);
      break;
    case AT_NMTOKENS:			/* name-token list */
      if (token == YET_EMPTY || (token & ANY_OTHER) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"NMTOKENS expected", decl);
      break;
    case AT_NUMBERS:			/* number list */
      if (token != DIG_FIRST)
	gripe(p, ERC_SYNTAX_WARNING, L"NUMBERS expected", decl);
      break;
    case AT_NUTOKENS:
      if ((token & (NAM_FIRST | ANY_OTHER)) != 0)
	gripe(p, ERC_SYNTAX_WARNING, L"NUTOKENS expected", decl);
      break;
    default:
      assert(0);
      return NULL;
  }

passed:
  att->value.textW  = istrdup(buf);	/* TBD: more validation */
  att->value.number = (long)istrlen(buf);
  return end;
}
/* Parse the attribute specifications of a start-tag.

   p	parser
   e	element the tag opens; unknown attributes are added to it as
	implied CDATA attributes
   decl	input, positioned after the element name
   atts	output array; at most MAXATTRIBUTES slots are written
   argc	receives the number of slots filled (set on every return path)

   Returns the position where attribute parsing stopped.  This is NULL
   after a malformed attribute or when a value could not be read.
*/
static const ichar *
process_attributes(dtd_parser *p, dtd_element *e, const ichar *decl,
		   sgml_attribute *atts, int *argc)
{ int attn = 0;
  dtd *dtd = p->dtd;

  decl = iskip_layout(dtd, decl);
  while(decl && *decl)
  { dtd_symbol *nm;
    const ichar *s;

    if ( (s=itake_nmtoken(p, decl, &nm)) )
    { decl = s;

      if ( (s=isee_func(dtd, decl, CF_VI)) ) /* name= */
      { dtd_attr *a;

	if ( !HasClass(dtd, nm->name[0], CH_NMSTART) )
	  gripe(p, ERC_SYNTAX_WARNING,
		L"Illegal start of attribute-name", decl);

	decl = s;
	if ( attn >= MAXATTRIBUTES )	/* no room left in atts[] */
	{ gripe(p, ERC_SYNTAX_ERROR, L"Too many attributes", decl);
	  goto out;
	}

	if ( !(a=find_attribute(e, nm)) )
	{ a = sgml_calloc(1, sizeof(*a));

	  a->name = nm;
	  a->type = AT_CDATA;
	  a->def  = AT_IMPLIED;
	  add_attribute(p, e, a);

					/* xmlns declarations are expected */
					/* outside SGML mode: stay silent */
	  if ( !e->undefined &&
	       !(dtd->dialect != DL_SGML &&
		 (istreq(L"xmlns", nm->name) ||
		  istrprefix(L"xmlns:", nm->name))) )
	    gripe(p, ERC_NO_ATTRIBUTE, e->name->name, nm->name);
	}
	atts[attn].definition = a;
	if ( (decl=get_attribute_value(p, decl, atts+attn)) )
	{ attn++;
	  continue;
	}
      } else if ( e->structure )
      { dtd_attr_list *al;		/* value shorthand */

	for(al=e->attributes; al; al=al->next)
	{ dtd_attr *a = al->attribute;

	  if ( a->type == AT_NAMEOF || a->type == AT_NOTATION )
	  { dtd_name_list *nl;

	    for(nl=a->typeex.nameof; nl; nl = nl->next)
	    { if ( nl->value == nm )
	      { if ( dtd->dialect != DL_SGML )
		  gripe(p, ERC_SYNTAX_WARNING,
			L"Value short-hand in XML mode", decl);
		if ( attn >= MAXATTRIBUTES )
		{ gripe(p, ERC_SYNTAX_ERROR, L"Too many attributes", decl);
		  goto out;
		}
		atts[attn].flags        = 0;
		atts[attn].definition   = a;
		atts[attn].value.textW  = istrdup(nm->name);
		atts[attn].value.number = (long)istrlen(nm->name);
		attn++;
		goto next;
	      }
	    }
	  }
	}
	gripe(p, ERC_NO_ATTRIBUTE_VALUE, e->name->name, nm->name);
	decl = s;			/* s is NULL here: ends the loop */
      } else
      { gripe(p, ERC_SYNTAX_ERROR, L"Bad attribute", decl);
	decl = s;			/* s is NULL here: ends the loop */
      }
    } else				/* not a name token: end of attributes */
    { *argc = attn;
      return decl;
    }

  next:
    ;
  }

out:
  *argc = attn;
  return decl;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
add_default_attributes()
This function adds attributes for omitted default and fixed attributes.
These attributes are added to the end of the attribute list. This
function returns the new number of attributes. The `atts' array is
assumed to be MAXATTRIBUTES long, normally passed from
process_begin_element.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Append attributes declared #FIXED or with a default value that are
   not already present among the first `natts' entries of `atts'.

   The appended entries carry SGML_AT_DEFAULT and their text points at
   data owned by the DTD.  `atts' is assumed to be MAXATTRIBUTES long,
   so nothing is added once that many slots are in use.

   Returns the new number of attributes.
*/
static int
add_default_attributes(dtd_parser *p, dtd_element *e,
		       int natts, sgml_attribute *atts)
{ dtd_attr_list *al;

  if ( e == CDATA_ELEMENT )
    return natts;

  for(al=e->attributes; al; al=al->next)
  { dtd_attr *a = al->attribute;

    switch(a->def)
    { case AT_REQUIRED:			/* TBD: check if present */
      case AT_CURRENT:			/* TBD: register in DTD and reuse */
      case AT_CONREF:
      case AT_IMPLIED:
	goto next;
      case AT_FIXED:
      case AT_DEFAULT:
      { int i;
	sgml_attribute *ap;

	for(i=0, ap=atts; i<natts; i++, ap++)
	{ if ( ap->definition == a )	/* given explicitly */
	    goto next;
	}

	if ( natts >= MAXATTRIBUTES )	/* atts[] is full */
	{ gripe(p, ERC_SYNTAX_ERROR, L"Too many attributes", e->name->name);
	  return natts;
	}

					/* ap == atts+natts: first free slot */
	ap->definition   = a;
	ap->value.textW  = NULL;
	ap->value.number = 0;
	ap->flags        = SGML_AT_DEFAULT;

	switch(a->type)
	{ case AT_CDATA:
	    ap->value.textW  = a->att_def.cdata;
	    ap->value.number = (long)istrlen(ap->value.textW);
	    break;
	  case AT_NUMBER:
	    if ( p->dtd->number_mode == NU_TOKEN )
	    { ap->value.textW  = (ichar*)a->att_def.name->name;
	      ap->value.number = (long)istrlen(ap->value.textW);
	    } else
	    { ap->value.number = a->att_def.number;
	    }
	    break;
	  default:
	    if ( a->islist )
	    { ap->value.textW = a->att_def.list;
	    } else
	    { ap->value.textW = (ichar*)a->att_def.name->name;
	    }
	    ap->value.number = (long)istrlen(ap->value.textW);
	}

	natts++;
      }
    }
  next:;
  }

  return natts;
}
/* Release the text of the first `argc' attribute values in `argv'.
   Values flagged SGML_AT_DEFAULT are left alone because their text is
   shared with the DTD.
*/
static void
free_attribute_values(int argc, sgml_attribute *argv)
{ for( ; argc > 0; argc--, argv++ )
  { if ( !(argv->flags & SGML_AT_DEFAULT) && argv->value.textW )
      sgml_free(argv->value.textW);
  }
}
/* Handle a start-tag whose text (without delimiters) is in `decl'.

   Opens the element, parses its attributes, adds defaulted attributes
   unless SGML_PARSER_NODEFS is set, calls the on_begin_element
   callback and closes the element again right away if it is empty
   (XML <tag/>, a CONREF attribute, or declared EMPTY in SGML mode).

   Returns the callback's result (TRUE when there is no callback), or
   the result of gripe() if no element name could be read.
*/
static int
process_begin_element(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  dtd_symbol *id;
  const ichar *s;

  if ( (s=itake_name(p, decl, &id)) )
  { sgml_attribute atts[MAXATTRIBUTES];
    int natts;
    dtd_element *e = find_element(dtd, id);
    int empty = FALSE;
    int conref = FALSE;
    int rc = TRUE;

    if ( !e->structure )		/* not declared: define implicitly */
    { dtd_edef *def;

      e->undefined = TRUE;
      STAT(edefs_implicit++);
      def_element(dtd, id);
      def = e->structure;
      def->type = C_EMPTY;
    }

    open_element(p, e, TRUE);

    decl=s;
    if ( (s=process_attributes(p, e, decl, atts, &natts)) )
      decl=s;

    if ( dtd->dialect != DL_SGML )
    { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) )
      { empty = TRUE;			/* XML <tag/> */
	decl = s;
      }
#ifdef XMLNS
      if ( dtd->dialect == DL_XMLNS )
	update_xmlns(p, e, natts, atts);
#endif
      if ( dtd->dialect != DL_SGML )
	update_space_mode(p, e, natts, atts);
    } else
    { int i;

      for(i=0; i<natts; i++)
      { if ( atts[i].definition->def == AT_CONREF )
	{ empty = TRUE;
	  conref = TRUE;
	}
      }
    }

    if ( *decl )			/* unparsed remainder */
      gripe(p, ERC_SYNTAX_ERROR, L"Bad attribute list", decl);

    if ( !(p->flags & SGML_PARSER_NODEFS) )
      natts = add_default_attributes(p, e, natts, atts);

    if ( empty ||
	 (dtd->dialect == DL_SGML &&
	  e->structure &&
	  e->structure->type == C_EMPTY &&
	  !e->undefined) )
      p->empty_element = e;
    else
      p->empty_element = NULL;

    if ( p->on_begin_element )
      rc = (*p->on_begin_element)(p, e, natts, atts);

    free_attribute_values(natts, atts);

    if ( p->empty_element )
    { p->empty_element = NULL;
      close_element(p, e, conref);
      if ( conref )			/* might be S_CDATA due to declared content */
	p->cdata_state = p->state = S_PCDATA;
    }

    return rc;
  }

  return gripe(p, ERC_SYNTAX_ERROR, L"Bad open-element tag", decl);
}
/* Handle an end-tag; `decl' is the text following "</".

   Pending character data is flushed first.  A name that is followed
   by nothing else closes that element.  An empty `decl' closes the
   current element if the DTD allows short tags.  Anything else is
   reported as a syntax error.
*/
static int
process_end_element(dtd_parser *p, const ichar *decl)
{ dtd *dtd = p->dtd;
  dtd_symbol *id;
  const ichar *s;

  emit_cdata(p, TRUE);

  if ( (s=itake_name(p, decl, &id)) && *s == '\0' )
    return close_element(p, find_element(dtd, id), FALSE);

  if ( p->dtd->shorttag && *decl == '\0' ) /* </>: close current element */
    return close_current_element(p);

  return gripe(p, ERC_SYNTAX_ERROR, L"Bad close-element tag", decl);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
process_net(dtd_parser *p)
We've seen a / of a shorttag element. Close this one.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Close the innermost environment that is waiting for a null end-tag.

   Environments nested inside it are closed through pop_to().  The
   on_end_element callback is invoked with event class EV_SHORTTAG.
   Returns TRUE if such an environment was found and closed, FALSE if
   no open environment wants a null end-tag.
*/
static int
process_net(dtd_parser *p)
{ sgml_environment *env;

  prepare_cdata(p);
  for(env = p->environments; env; env=env->parent)
  { if ( env->wants_net )
    { sgml_environment *parent;

      pop_to(p, env, NULL);		/* close parents */
      validate_completeness(p, env);
      parent = env->parent;		/* fetch before env is freed */

      emit_cdata(p, TRUE);
      p->first = FALSE;

      if ( p->on_end_element )
      { WITH_CLASS(p, EV_SHORTTAG,
		   (*p->on_end_element)(p, env->element));
      }

      free_environment(env);
      p->environments = parent;
      p->map = (parent ? parent->map : NULL);

      return TRUE;
    }
  }

  return FALSE;
}
/* Handle <!DOCTYPE name [SYSTEM|PUBLIC ...] [ internal-subset ]>.

   decl	 text following the DOCTYPE keyword
   decl0 start of the declaration; only used to reconstruct the source
	 location of the internal subset

   If the DTD has no doctype yet, the external DTD is located (through
   the external identifier or the catalogue) and loaded with a cloned
   parser.  An internal subset is then fed through this parser in
   DM_DTD mode.  Finally the doctype name becomes the enforced outer
   element.

   Returns TRUE, the result of gripe() if no name is found, or FALSE
   if the external identifier cannot be parsed.
*/
static int
process_doctype(dtd_parser *p, const ichar *decl, const ichar *decl0)
{ dtd *dtd = p->dtd;
  dtd_symbol *id;
  const ichar *s;
  dtd_entity *et = NULL;

  if ( !(s=itake_name(p, decl, &id)) )
    return gripe(p, ERC_SYNTAX_ERROR, L"Name expected", decl);
  decl = s;

  if ( (s=isee_identifier(dtd, decl, "system")) )
  { et = sgml_calloc(1, sizeof(*et));
    et->type = ET_SYSTEM;
    decl = s;
  } else if ( (s=isee_identifier(dtd, decl, "public")) )
  { et = sgml_calloc(1, sizeof(*et));
    et->type = ET_PUBLIC;
    decl = s;
  } else if ( isee_func(dtd, decl, CF_DSO) )
    goto local;

  if ( et )
  { et->name = id;
    et->catalog_location = CAT_DOCTYPE;
    if ( !(s=process_entity_value_declaration(p, decl, et)) )
    { free_entity_list(et);		/* do not leak the temporary entity */
      return FALSE;
    }
    decl = s;
  }

  if ( !dtd->doctype )			/* i.e. anonymous DTD */
  { ichar *file;
    dtd_parser *clone;

    dtd->doctype = istrdup(id->name);	/* Fill it */
    if ( et )
      file = entity_file(dtd, et);
    else
      file = istrdup(find_in_catalogue(CAT_DOCTYPE,
				       dtd->doctype, NULL, NULL,
				       dtd->dialect != DL_SGML));

    if ( !file )
    { gripe(p, ERC_EXISTENCE, L"DTD", dtd->doctype);
    } else
    { clone = clone_dtd_parser(p);
      if ( !load_dtd_from_file(clone, file) )
	gripe(p, ERC_EXISTENCE, L"file", file);
      free_dtd_parser(clone);
      sgml_free(file);
    }
  }

  if ( et )
    free_entity_list(et);

local:
  if ( (s=isee_func(dtd, decl, CF_DSO)) ) /* [...] */
  { int grouplevel = 1;
    data_mode oldmode  = p->dmode;
    dtdstate  oldstate = p->state;
    locbuf oldloc;
    const ichar *q;
    icharbuf *saved_ibuf = p->buffer;

    push_location(p, &oldloc);
					/* try to find start-location. */
					/* fails if there is comment before */
					/* the []! */
    sgml_cplocation(&p->location, &p->startloc);
    inc_location(&p->location, '<');
    for(q=decl0; q < s; q++)
      inc_location(&p->location, *q);
    p->dmode = DM_DTD;
    p->state = S_PCDATA;
    p->buffer = new_icharbuf();

    for( ; *s; s++ )
    { if ( isee_func(dtd, s, CF_LIT) ||	/* skip quoted strings */
	   isee_func(dtd, s, CF_LITA) )
      { ichar quote = *s;

	putchar_dtd_parser(p, *s++);	/* pass open quote */
	for( ; *s && *s != quote; s++ )
	  putchar_dtd_parser(p, *s);
	if ( *s != quote )		/* unterminated: s is at the NUL, */
	  break;			/* do not let s++ step past it */
	putchar_dtd_parser(p, *s);	/* pass closing quote */
	continue;
      }
      if ( isee_func(dtd, s, CF_DSO) )
	grouplevel++;
      else if ( isee_func(dtd, s, CF_DSC) && --grouplevel == 0 )
	break;
      putchar_dtd_parser(p, *s);
    }
    p->dtd->implicit = FALSE;

    p->state    = oldstate;
    p->dmode    = oldmode;
    free_icharbuf(p->buffer);
    p->buffer = saved_ibuf;
    pop_location(p, &oldloc);
  }

  p->enforce_outer_element = id;	/* make this the outer element */

  return TRUE;
}
/* Bring p->utf8_decode in line with the DTD encoding: decoding is on
   only when the DTD encoding is SGML_ENC_UTF8 and the input is flagged
   as encoded.  Compiles to an empty function without UTF8.
*/
static void
init_decoding(dtd_parser *p)
{
#ifdef UTF8
  int want_utf8 = FALSE;

  if ( p->dtd->encoding == SGML_ENC_UTF8 && p->encoded == TRUE )
    want_utf8 = TRUE;

  if ( p->utf8_decode == want_utf8 )
    return;				/* already in the right mode */

  DEBUG(fprintf(stderr, "%s UTF-8 decoding on %p\n",
		want_utf8 ? "Enable" : "Disable",
		p));
  p->utf8_decode = want_utf8;
#endif
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
xml_set_encoding() is the public interface to set the encoding for the
parser.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* strcasecmp() with C locale: only A-Z are folded to lower case.

   Bytes are compared as unsigned values after folding, so the result
   is negative, zero or positive consistently, including when one
   string is a prefix of the other and the next byte is >= 0x80.
*/
static int
posix_strcasecmp(const char *s1, const char *s2)
{ for(;; s1++, s2++)
  { int c1 = *s1&0xff;
    int c2 = *s2&0xff;

    if ( c1 >= 'A' && c1 <= 'Z' ) c1 += 'a'-'A';
    if ( c2 >= 'A' && c2 <= 'Z' ) c2 += 'a'-'A';

    if ( c1 != c2 || c1 == 0 )		/* mismatch, or both ended */
      return c1-c2;
  }
}
/* Select the input encoding by name (compared case-insensitively in
   the C locale).  Recognised names are "iso-8859-1", "us-ascii" and
   "utf-8".  Returns TRUE after updating the decoder state, or FALSE,
   changing nothing, for any other name.
*/
int
xml_set_encoding(dtd_parser *p, const char *enc)
{ dtd *dtd = p->dtd;
  int latin1 = ( posix_strcasecmp(enc, "iso-8859-1") == 0 ||
		 posix_strcasecmp(enc, "us-ascii") == 0 );

  if ( latin1 )				/* us-ascii: doesn't make a difference */
    dtd->encoding = SGML_ENC_ISO_LATIN1;
  else if ( posix_strcasecmp(enc, "utf-8") == 0 )
    dtd->encoding = SGML_ENC_UTF8;
  else
    return FALSE;

  init_decoding(p);
  return TRUE;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
set_encoding() sets the encoding from the encoding="..." field of the
XML header.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Set the encoding from the wide string `enc'.  The name is narrowed
   to a char buffer first.  A name that contains a character >= 128,
   does not fit in the buffer, or is not accepted by xml_set_encoding()
   is reported through gripe().
*/
static void
set_encoding(dtd_parser *p, const ichar *enc)
{ char buf[32];
  char *e = buf+sizeof(buf)-1;		/* keep room for the terminator */
  char *o = buf;
  const ichar *i;
  int ok = TRUE;

  for(i=enc; *i; i++)
  { if ( *i >= 128 || o >= e )
    { ok = FALSE;
      break;
    }
    *o++ = (char)*i;
  }
  *o = '\0';

  if ( !ok || !xml_set_encoding(p, buf) )
    gripe(p, ERC_EXISTENCE, L"character encoding", enc);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Process <? ... ?>
Should deal with character encoding for XML documents.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Handle a processing instruction whose text is `decl'.

   For <?xml name=value ...?> an SGML-dialect DTD is switched to XML
   and the name=value pairs are scanned; only `encoding' is acted on.
   Returns TRUE in that case.  Any other processing instruction is
   passed to the on_pi callback and FALSE is returned.
*/
static int
process_pi(dtd_parser *p, const ichar *decl)
{ const ichar *s;
  dtd *dtd = p->dtd;

  if ( (s=isee_identifier(dtd, decl, "xml")) ) /* <?xml version="1.0"?> */
  { decl = s;

    switch(dtd->dialect)
    { case DL_SGML:
	set_dialect_dtd(dtd, DL_XML);
	break;
      case DL_XML:
      case DL_XMLNS:
	break;
    }

    while(*decl)
    { dtd_symbol *nm;

      if ( (s=itake_name(p, decl, &nm)) &&
	   (s=isee_func(dtd, s, CF_VI)) ) /* = */
      { ichar *start = NULL;
	int len = 0;
	ichar buf[MAXSTRINGLEN];
	const ichar *end;

	if ( !(end=itake_string(dtd, s, &start, &len)) )
	{ end = itake_nmtoken_chars(p, s, buf, sizeof(buf)/sizeof(ichar));
	  if ( end )			/* only read buf once it is filled */
	  { start = buf;
	    len = (int)istrlen(buf);
	  }
	}

	if ( end )
	{ decl = end;

	  if ( istrcaseeq(nm->name, L"encoding") )
	  { ichar tmp[32];

	    if ( len < (int)(sizeof(tmp)/sizeof(ichar)-1) )
	    { istrncpy(tmp, start, len);
	      tmp[len] = 0;

	      set_encoding(p, tmp);
	    } else
	    { gripe(p, ERC_SYNTAX_ERROR, L"Unterminated encoding?", decl);
	    }
	  }

	  /* fprintf(stderr, "XML %s = %s\n", nm->name, buf); */

	  continue;
	}
      }

      gripe(p, ERC_SYNTAX_ERROR, L"Illegal XML parameter", decl);
      break;
    }

    return TRUE;
  }

  if ( p->on_pi )
    (*p->on_pi)(p, decl);

  return FALSE;				/* Warn? */
}
/* <!SGML ...> declarations are not interpreted: `decl' is unused and a
   warning is issued.  Returns the result of gripe().
*/
static int
process_sgml_declaration(dtd_parser *p, const ichar *decl)
{ return gripe(p, ERC_SYNTAX_WARNING, L"Ignored <!SGML ...> declaration", NULL);
}
/* Dispatch one markup declaration or tag whose text is `decl'.

   Outside DTD mode, end-tags and start-tags are handed to
   process_end_element() and process_begin_element().  Text starting
   with the MDO2 delimiter is passed to the on_decl callback and then
   to the handler for its keyword; <!DOCTYPE is ignored in DTD mode.

   Returns the tag handler's result, TRUE for a declaration, or the
   result of gripe() for unrecognised input.
*/
static int
process_declaration(dtd_parser *p, const ichar *decl)
{ const ichar *s;
  dtd *dtd = p->dtd;

  if ( p->dmode != DM_DTD )
  { if ( (s=isee_func(dtd, decl, CF_ETAGO2)) ) /* </ ... > */
    { return process_end_element(p, s);
    } else if ( HasClass(dtd, *decl, CH_NAME) ) /* <letter */
    { return process_begin_element(p, decl);
    }
  }

  if ( (s=isee_func(dtd, decl, CF_MDO2)) ) /* <! ... >*/
  { decl = s;

    if ( p->on_decl )
      (*p->on_decl)(p, decl);

    if ( (s = isee_identifier(dtd, decl, "entity")) )
      process_entity_declaration(p, s);
    else if ( (s = isee_identifier(dtd, decl, "element")) )
      process_element_declaraction(p, s);
    else if ( (s = isee_identifier(dtd, decl, "attlist")) )
      process_attlist_declaraction(p, s);
    else if ( (s = isee_identifier(dtd, decl, "notation")) )
      process_notation_declaration(p, s);
    else if ( (s = isee_identifier(dtd, decl, "shortref")) )
      process_shortref_declaration(p, s);
    else if ( (s = isee_identifier(dtd, decl, "usemap")) )
      process_usemap_declaration(p, s);
    else if ( (s = isee_identifier(dtd, decl, "sgml")) )
      process_sgml_declaration(p, s);
    else if ( (s = isee_identifier(dtd, decl, "doctype")) )
    { if ( p->dmode != DM_DTD )
	process_doctype(p, s, decl-1);
    } else
    { s = iskip_layout(dtd, decl);

      if ( *s )				/* only layout is accepted */
	gripe(p, ERC_SYNTAX_ERROR, L"Invalid declaration", s);
    }

    return TRUE;
  }

  return gripe(p, ERC_SYNTAX_ERROR, L"Invalid declaration", decl);
}
/*******************************
* STREAM BINDING *
*******************************/
/* Reset the parser's current location to the start (line 1, position
   0) of an input of the given type, recording `name' as its file name.
   The pointer is stored, not copied.
*/
void
set_file_dtd_parser(dtd_parser *p, input_type type, const ichar *name)
{ dtd_srcloc *loc = &p->location;

  loc->type      = type;
  loc->name.file = name;
  loc->line      = 1;
  loc->linepos   = 0;
  loc->charpos   = 0;
}
/* Reset the parser's current location to the start (line 1, position
   0) of an input of the given type, recording `name' as its entity
   name.  The pointer is stored, not copied.
*/
static void
set_src_dtd_parser(dtd_parser *p, input_type type, const ichar *name)
{ dtd_srcloc *loc = &p->location;

  loc->type        = type;
  loc->name.entity = name;
  loc->line        = 1;
  loc->linepos     = 0;
  loc->charpos     = 0;
}
/* Switch the parser to data mode `m' (DM_DTD or DM_DATA), restart it
   in state S_PCDATA and mark the collected character data as blank.
*/
void
set_mode_dtd_parser(dtd_parser *p, data_mode m)
{ p->blank_cdata = TRUE;
  p->state       = S_PCDATA;
  p->dmode       = m;			/* DM_DTD or DM_DATA */
}
/* Create a parser for `dtd'.  If `dtd' is NULL a new DTD is created.
   The DTD's reference count is incremented.  The parser starts in DTD
   mode, state S_PCDATA, with no input source bound.
*/
dtd_parser *
new_dtd_parser(dtd *dtd)
{ dtd_parser *p = sgml_calloc(1, sizeof(*p));

  if ( !dtd )
    dtd = new_dtd(NULL);
  dtd->references++;

  p->magic       = SGML_PARSER_MAGIC;
  p->dtd         = dtd;
  p->state       = S_PCDATA;
  p->mark_state  = MS_INCLUDE;
  p->dmode       = DM_DTD;
  p->encoded     = TRUE;		/* encoded octet stream */
  p->buffer      = new_icharbuf();
  p->cdata       = new_ocharbuf();
  p->event_class = EV_EXPLICIT;
  set_src_dtd_parser(p, IN_NONE, NULL);

  return p;
}
/* Create a copy of parser `p' that shares its DTD (whose reference
   count is incremented).  The copy starts in DTD mode with fresh
   buffers and empty environment and marked-section stacks.  All other
   fields are copied bitwise from `p'.
*/
static dtd_parser *
clone_dtd_parser(dtd_parser *p)
{ dtd_parser *clone = sgml_calloc(1, sizeof(*p));

  *clone = *p;
  clone->dtd->references++;
  clone->environments = NULL;
  clone->marked       = NULL;
  clone->etag         = NULL;
  clone->grouplevel   = 0;
  clone->state        = S_PCDATA;
  clone->mark_state   = MS_INCLUDE;
  clone->dmode        = DM_DTD;
  clone->buffer       = new_icharbuf();
  clone->cdata        = new_ocharbuf();

  return clone;
}
/* Destroy parser `p': its input and cdata buffers, its namespace data
   (when compiled with XMLNS), its reference to the DTD (through
   free_dtd()) and the parser structure itself.
*/
void
free_dtd_parser(dtd_parser *p)
{ free_icharbuf(p->buffer);
  free_ocharbuf(p->cdata);
#ifdef XMLNS
  xmlns_free(p->xmlns);
#endif
  free_dtd(p->dtd);

  sgml_free(p);
}
/* Feed the nul-terminated text `s' through the parser as if it were
   read from source `name' of type `in'.  The parser's location is
   saved and restored around the call.  Always returns TRUE.
*/
static int
process_chars(dtd_parser *p, input_type in, const ichar *name, const ichar *s)
{ locbuf old;

  push_location(p, &old);
  set_src_dtd_parser(p, in, name);
  empty_icharbuf(p->buffer);		/* dubious */
  for(; *s; s++)
    putchar_dtd_parser(p, *s);
  pop_location(p, &old);

  return TRUE;
}
/* Include the parameter entity named `entity_name'.  If the entity
   maps to a file, that file is processed as a sub-document.  Otherwise
   its value is fed through the parser.  Returns the result of the
   processing, or the result of gripe() if the entity does not exist
   or has no value.
*/
static int
process_include(dtd_parser *p, const ichar *entity_name)
{ dtd_symbol *id;
  dtd_entity *pe;
  dtd *dtd = p->dtd;

  if ( (id=dtd_find_entity_symbol(dtd, entity_name)) &&
       (pe=find_pentity(p->dtd, id)) )
  { ichar *file;

    if ( (file = entity_file(dtd, pe)) )
    { int rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT);

      sgml_free(file);
      return rc;
    } else
    { const ichar *text = entity_value(p, pe, NULL);

      if ( !text )
	return gripe(p, ERC_NO_VALUE, pe->name->name);

      return process_chars(p, IN_ENTITY, entity_name, text);
    }
  }

  return gripe(p, ERC_EXISTENCE, L"parameter entity", entity_name);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Process <![ KEYWORD [
Switches ->mark_state according to KEYWORD. Processes the rest in normal
S_PCDATA style, which pops the mark-stack on seeing ]]>
For the purpose of <!DOCTYPE spec [additions]> we switch to S_GROUP if
the declaration opens with MDO2 (`!') that is not followed by DSO (`[').
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Called with the start of a possible marked section in p->buffer.

   If the buffer holds `![ KEYWORD [' (after expanding parameter
   entities), a dtd_marked record is pushed, the buffer is emptied and
   the parser state and mark_state are updated according to KEYWORD.
   Unknown keywords are treated as INCLUDE.  If the buffer holds `!'
   that is not followed by `[', the parser enters S_GROUP at group
   level 1.
*/
static void
process_marked_section(dtd_parser *p)
{ ichar buf[MAXDECL];
  dtd *dtd = p->dtd;
  const ichar *decl = p->buffer->data;
  const ichar *s;

  if ( (decl=isee_func(dtd, decl, CF_MDO2)) && /* ! */
       (decl=isee_func(dtd, decl, CF_DSO)) && /* [ */
       expand_pentities(p, decl, ZERO_TERM_LEN, buf, sizeof(buf)/sizeof(ichar)) )
  { dtd_symbol *kwd;

    decl = buf;
    if ( (s=itake_name(p, decl, &kwd)) &&
	 isee_func(dtd, s, CF_DSO) )	/* [ */
    { dtd_marked *m = sgml_calloc(1, sizeof(*m));

      m->keyword = kwd;			/* push on the stack */
      m->parent = p->marked;
      p->marked = m;

      if ( istrcaseeq(kwd->name, L"IGNORE") )
	m->type = MS_IGNORE;
      else if ( istrcaseeq(kwd->name, L"INCLUDE") )
	m->type = MS_INCLUDE;
      else if ( istrcaseeq(kwd->name, L"TEMP") )
	m->type = MS_INCLUDE;
      else if ( istrcaseeq(kwd->name, L"CDATA") )
	m->type = MS_CDATA;
      else if ( istrcaseeq(kwd->name, L"RCDATA") )
	m->type = MS_RCDATA;
      else
	m->type = MS_INCLUDE;		/* default */

      empty_icharbuf(p->buffer);
      if ( m->type == MS_CDATA )
	p->state = S_MSCDATA;
      else
	p->state = S_PCDATA;
      if ( p->mark_state != MS_IGNORE )	/* IGNORE is inherited by nested sections */
	p->mark_state = m->type;
    }
  } else
  { decl = p->buffer->data;

    if ( (decl=isee_func(dtd, decl, CF_MDO2)) && /* ! */
	 !isee_func(dtd, decl, CF_DSO) ) /* [ */
    { p->state = S_GROUP;
      p->grouplevel = 1;
    }
  }
}
/* Pop the innermost marked section, if any, and restore mark_state to
   the type of the enclosing section, or MS_INCLUDE if none is left.
*/
static void
pop_marked_section(dtd_parser *p)
{ dtd_marked *m = p->marked;

  if ( m )
  { p->marked = m->parent;
    sgml_free(m);
    p->mark_state = (p->marked ? p->marked->type : MS_INCLUDE);
  }
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Update the space-mode for the current element. The space mode defines
how spaces are handled in the CDATA output.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Translate the text of a space-mode value into a dtd_space_mode.
   Returns SP_INHERIT, which callers treat as an error, when `val' is
   not one of "default", "preserve", "sgml" or "remove".
*/
static dtd_space_mode
istr_to_space_mode(const ichar *val)
{ static const struct
  { const ichar   *name;
    dtd_space_mode mode;
  } modes[] =
  { { L"default",  SP_DEFAULT },
    { L"preserve", SP_PRESERVE },
    { L"sgml",     SP_SGML },
    { L"remove",   SP_REMOVE }
  };
  size_t i;

  for(i=0; i<sizeof(modes)/sizeof(modes[0]); i++)
  { if ( istreq(val, modes[i].name) )
      return modes[i].mode;
  }

  return SP_INHERIT;			/* interpret as error */
}
/* Set the space mode of the current environment for element `e'.

   The first CDATA attribute named xml:space that has a text value
   decides: a recognised value sets the mode, an unrecognised one is
   reported and leaves the mode unchanged.  Without such an attribute
   the element's own space mode is used unless it is SP_INHERIT.
*/
static void
update_space_mode(dtd_parser *p, dtd_element *e,
		  int natts, sgml_attribute *atts)
{ for( ; natts-- > 0; atts++ )
  { const ichar *name = atts->definition->name->name;

    if ( istreq(name, L"xml:space") &&
	 atts->definition->type == AT_CDATA &&
	 atts->value.textW )
    { dtd_space_mode m = istr_to_space_mode(atts->value.textW);

      if ( m != SP_INHERIT )
	p->environments->space_mode = m;
      else
	gripe(p, ERC_EXISTENCE, L"xml:space-mode", atts->value.textW);

      return;
    }
  }

  if ( e->space_mode != SP_INHERIT )
    p->environments->space_mode = e->space_mode;
}
/* In data mode, discard the collected character data and reset the
   blank/must-be-empty bookkeeping.  Does nothing in any other mode.
*/
static void
empty_cdata(dtd_parser *p)
{ if ( p->dmode != DM_DATA )
    return;

  empty_ocharbuf(p->cdata);
  p->blank_cdata         = TRUE;
  p->cdata_must_be_empty = FALSE;
}
/* Pass `size' characters of `buf', starting at `offset', to the
   on_data callback as EC_CDATA.  Does nothing without a callback.
*/
static void
cb_cdata(dtd_parser *p, ocharbuf *buf, int offset, int size)
{ if ( !p->on_data )
    return;

  (*p->on_data)(p, EC_CDATA, size, buf->data.w+offset);
}
/* Emit the character data collected in p->cdata, if any.

   p	parser
   last	TRUE if this data is the last of its element, which enables
	stripping of a trailing record start/end

   The data is first normalised according to the space mode of the
   current environment, with the parser's start location adjusted to
   match.  It is passed to cb_cdata() when it is non-blank, or when it
   is blank but the content model accepts CDATA at this point (or the
   element is undefined and the space mode is SP_PRESERVE).  The buffer
   is emptied afterwards and the saved location restored.

   Returns TRUE, or FALSE only for the impossible SP_INHERIT mode.
*/
static int
emit_cdata(dtd_parser *p, int last)
{ dtd *dtd = p->dtd;
  locbuf locsafe;
  ocharbuf *cdata = p->cdata;
  int offset = 0;
  int size = cdata->size;

  if ( size == 0 )
    return TRUE;			/* empty or done */

  push_location(p, &locsafe);
  sgml_cplocation(&p->location, &p->startloc);   /* start of markup */
  sgml_cplocation(&p->startloc, &p->startcdata); /* real start of CDATA */

  if ( p->environments )
  { switch(p->environments->space_mode)
    { case SP_SGML:
      case SP_DEFAULT:
	if ( p->first )			/* strip one leading RE and/or RS */
	{ wint_t c = fetch_ocharbuf(cdata, offset);

	  if ( HasClass(dtd, c, CH_RE) )
	  { inc_location(&p->startloc, c);
	    offset++;
	    size--;
	    c = fetch_ocharbuf(cdata, offset);
	  }

	  if ( HasClass(dtd, c, CH_RS) )
	  { inc_location(&p->startloc, c);
	    offset++;
	    size--;
	  }
	}
	if ( last && size > 0 )		/* strip one trailing RS and/or RE */
	{ wint_t c = fetch_ocharbuf(cdata, offset+size-1);

	  if ( HasClass(dtd, c, CH_RS) )
	  { dec_location(&p->location, c);
	    size--;
	    poke_ocharbuf(cdata, offset+size, '\0');
	    if ( size > 0 )
	      c = fetch_ocharbuf(cdata, offset+size-1);
	    else
	      c = 0;			/* HasClass(CH_RE) must fail */
	  }
	  if ( HasClass(dtd, c, CH_RE) )
	  { dec_location(&p->location, c);
	    size--;
	    poke_ocharbuf(cdata, offset+size, '\0');
	  }
	}
	if ( p->environments->space_mode == SP_DEFAULT )
	{ int o = 0;			/* collapse blank runs to one space */
	  int i;

	  for(i=0; i<size; i++)
	  { wint_t c = fetch_ocharbuf(cdata, offset+i);

	    if ( HasClass(dtd, c, CH_BLANK) )
	    { for(i++; i<size; i++)
	      { wint_t c2 = fetch_ocharbuf(cdata, offset+i);

		if ( !HasClass(dtd, c2, CH_BLANK) )
		  break;
	      }
	      i--;
	      poke_ocharbuf(cdata, o++, ' ');
	      continue;
	    }
	    poke_ocharbuf(cdata, o++, c);
	  }
	  poke_ocharbuf(cdata, o, '\0');
	  offset = 0;			/* wrote new output from offset=0 */
	  size = o;
	}
	break;
      case SP_REMOVE:
      { int o = 0;
	int i;
	int end = 0;			/* end of the last non-blank output */

	for(i=0; i<size; i++)		/* skip leading blanks */
	{ wint_t c = fetch_ocharbuf(cdata, offset+i);

	  if ( HasClass(dtd, c, CH_BLANK) )
	    inc_location(&p->startloc, c);
	  else
	    break;
	}

	if ( i<size )
	{ for(; i<size; i++)
	  { wint_t c = fetch_ocharbuf(cdata, offset+i);

	    if ( HasClass(dtd, c, CH_BLANK) )
	    { i++;
	      while(i<size && HasClass(dtd,
				       (wint_t)fetch_ocharbuf(cdata, offset+i),
				       CH_BLANK))
		i++;
	      i--;
	      poke_ocharbuf(cdata, o++, ' ');
	      continue;
	    }
	    poke_ocharbuf(cdata, o++, c);
	    end = o;			/* drops a trailing blank */
	  }
	}
					/* TBD: adjust end */
	poke_ocharbuf(cdata, end, '\0');
	size = end;
	break;
      }
      case SP_PRESERVE:
	break;
      case SP_INHERIT:
	assert(0);
	pop_location(p, &locsafe);	/* balance push_location() above */
	return FALSE;
    }
  }

  if ( size == 0 )
  { pop_location(p, &locsafe);
    empty_cdata(p);
    return TRUE;
  }

  assert(size > 0);
  if ( !p->blank_cdata )
  { if ( p->cdata_must_be_empty )
    { gripe(p, ERC_NOT_ALLOWED_PCDATA, p->cdata); /* TBD: now passes buffer! */
    }
    cb_cdata(p, cdata, offset, size);
  } else if ( p->environments )
  { sgml_environment *env = p->environments;
    dtd_state *new;

					/* If an element is not in the DTD we must */
					/* assume mixed content and emit spaces */

    if ( (new=make_dtd_transition(env->state, CDATA_ELEMENT)) )
    { env->state = new;
      cb_cdata(p, cdata, offset, size);
    } else if ( env->element->undefined &&
		p->environments->space_mode == SP_PRESERVE )
    { cb_cdata(p, cdata, offset, size);
    }
  }

  pop_location(p, &locsafe);

  empty_cdata(p);

  return TRUE;
}
/* Prepare the collected character data for emission.

   The buffer is terminated.  While marked sections are included, an
   open element that is declared EMPTY is closed first.  If the data
   was blank so far it is rescanned, and on the first non-blank
   character either an error is reported (DTD mode) or the CDATA
   pseudo-element is opened.  Always returns TRUE.
*/
static int
prepare_cdata(dtd_parser *p)
{ if ( p->cdata->size == 0 )
    return TRUE;

  terminate_ocharbuf(p->cdata);

  if ( p->mark_state == MS_INCLUDE )
  { dtd *dtd = p->dtd;

    if ( p->environments )		/* needed for <img> <img> */
    { dtd_element *e = p->environments->element;

      if ( e->structure && e->structure->type == C_EMPTY && !e->undefined )
	close_element(p, e, FALSE);
    }

    if ( p->blank_cdata == TRUE )
    { int blank = TRUE;
      int i;

      for(i=0; i<p->cdata->size; i++)
      { wint_t c = fetch_ocharbuf(p->cdata, i);

	if ( !HasClass(dtd, c, CH_BLANK) )
	{ blank = FALSE;
	  break;
	}
      }

      p->blank_cdata = blank;
      if ( !blank )
      { if ( p->dmode == DM_DTD )	/* pass the text pointer, not the */
					/* `data' aggregate, to variadic gripe() */
	  gripe(p, ERC_SYNTAX_ERROR, L"CDATA in DTD", p->cdata->data.w);
	else
	  open_element(p, CDATA_ELEMENT, TRUE);
      }
    }
  }

  return TRUE;
}
/* Prepare and emit the collected character data.  `last' is passed on
   to emit_cdata(), whose result is returned.
*/
static int
process_cdata(dtd_parser *p, int last)
{ prepare_cdata(p);

  return emit_cdata(p, last);
}
/* Process a reference to the entity `name' found in document content.

   A name starting with '#' is a character reference whose code is
   appended to the cdata buffer.  Otherwise the entity is looked up,
   falling back to the DTD's default entity, and handled by content
   type:
     - an SGML entity without a value that maps to a file is parsed
       as a sub-document
     - SGML text is fed back through the parser
     - CDATA text is appended to the cdata buffer
     - SDATA and NDATA go to the on_data callback
     - PI goes to the on_pi callback
     - STARTTAG and ENDTAG entities are ignored (handling is disabled)

   Returns TRUE, the sub-document result, or the result of gripe() on
   error.
*/
static int
process_entity(dtd_parser *p, const ichar *name)
{ if ( name[0] == '#' )			/* #charcode: character entity */
  { int v = char_entity_value(name);

    if ( v <= 0 )
      return gripe(p, ERC_SYNTAX_ERROR, L"Bad character entity", name);

    add_ocharbuf(p->cdata, v);
  } else
  { dtd_symbol *id;
    dtd_entity *e;
    dtd *dtd = p->dtd;
    int len;
    const ichar *text;
    const ichar *s;
    int   chr;
    ichar *file;

    if ( !(id=dtd_find_entity_symbol(dtd, name)) ||
	 !(e=id->entity) )
    { if ( dtd->default_entity )
	e = dtd->default_entity;
      else
	return gripe(p, ERC_EXISTENCE, L"entity", name);
    }

    if ( !e->value &&
	 e->content == EC_SGML &&
	 (file=entity_file(p->dtd, e)) )
    { int rc;

      empty_icharbuf(p->buffer);		/* dubious */
      rc = sgml_process_file(p, file, SGML_SUB_DOCUMENT);
      sgml_free(file);
      return rc;
    }

    if ( !(text = entity_value(p, e, &len)) )
      return gripe(p, ERC_NO_VALUE, e->name->name);

    switch ( e->content )
    { case EC_SGML:
      case EC_CDATA:
					/* value is a single character reference */
	if ( (s=isee_character_entity(dtd, text, &chr)) && *s == '\0' )
	{ if ( chr == 0 )
	    return gripe(p, ERC_SYNTAX_ERROR, L"Illegal character entity", text);

	  if ( p->blank_cdata == TRUE &&
	       !HasClass(dtd, (wint_t)chr, CH_BLANK) )
	  { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE);
	    p->blank_cdata = FALSE;
	  }

	  add_ocharbuf(p->cdata, chr);
	  return TRUE;
	}
	if ( e->content == EC_SGML )
	{ locbuf oldloc;
	  int decode = p->utf8_decode;

	  push_location(p, &oldloc);
	  p->utf8_decode = FALSE;	/* text is already decoded */
	  set_src_dtd_parser(p, IN_ENTITY, e->name->name);
	  empty_icharbuf(p->buffer);	/* dubious */
	  for(s=text; *s; s++)
	    putchar_dtd_parser(p, *s);
	  p->utf8_decode = decode;
	  pop_location(p, &oldloc);
	} else if ( *text )
	{ const ichar *o;

	  if ( p->blank_cdata == TRUE )
	  { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE);
	    p->blank_cdata = FALSE;
	  }

	  for(o=text; *o; o++)
	    add_ocharbuf(p->cdata, *o);
	}
	break;
      case EC_SDATA:
      case EC_NDATA:
	process_cdata(p, FALSE);
	if ( p->on_data )
	  (*p->on_data)(p, e->content, len, text);
	break;
      case EC_PI:
	process_cdata(p, FALSE);
	if ( p->on_pi )
	  (*p->on_pi)(p, text);
	break;				/* was an accidental fall-through */
      case EC_STARTTAG:
#if 0
	prepare_cdata(p);
	process_begin_element(p, text);
#endif
	break;
      case EC_ENDTAG:
#if 0
	prepare_cdata(p);
	process_end_element(p, text);
#endif
	break;
    }

    return TRUE;
  }

  return TRUE;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Deal with end of input. We should give a proper error message depending
on the state and the start-location of the error.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2010-05-06 10:59:09 +01:00
int
end_document_dtd_parser(dtd_parser *p)
{ int rval;
switch(p->state)
{ case S_RCDATA:
case S_CDATA:
case S_PCDATA:
rval = TRUE;
break;
case S_CMT:
case S_CMT1:
case S_CMTE0:
case S_CMTE1:
case S_DECLCMT0:
case S_DECLCMT:
case S_DECLCMTE0:
2010-05-06 10:59:09 +01:00
rval = gripe(p, ERC_SYNTAX_ERROR,
L"Unexpected end-of-file in comment", L"");
break;
case S_ECDATA1:
case S_ECDATA2:
case S_EMSC1:
case S_EMSC2:
case S_DECL0:
case S_DECL:
case S_MDECL0:
case S_STRING:
case S_CMTO:
case S_GROUP:
case S_PENT:
case S_ENT:
case S_ENT0:
2010-05-06 10:59:09 +01:00
rval = gripe(p, ERC_SYNTAX_ERROR,
L"Unexpected end-of-file", L"");
break;
#ifdef UTF8
case S_UTF8:
2010-05-06 10:59:09 +01:00
rval = gripe(p, ERC_SYNTAX_ERROR,
L"Unexpected end-of-file in UTF-8 sequence", L"");
break;
#endif
case S_MSCDATA:
case S_EMSCDATA1:
case S_EMSCDATA2:
2010-05-06 10:59:09 +01:00
rval = gripe(p, ERC_SYNTAX_ERROR,
L"Unexpected end-of-file in CDATA marked section", L"");
break;
case S_PI:
case S_PI2:
2010-05-06 10:59:09 +01:00
rval = gripe(p, ERC_SYNTAX_ERROR,
L"Unexpected end-of-file in processing instruction", L"");
break;
default:
2010-05-06 10:59:09 +01:00
rval = gripe(p, ERC_SYNTAX_ERROR,
L"Unexpected end-of-file in ???");
break;
}
if ( p->dmode == DM_DATA )
{ sgml_environment *env;
if ( p->cdata->size > 0 &&
fetch_ocharbuf(p->cdata, p->cdata->size-1) == CR )
del_ocharbuf(p->cdata);
process_cdata(p, TRUE);
if ( (env=p->environments) )
{ dtd_element *e;
while(env->parent)
env = env->parent;
pop_to(p, env, CDATA_ELEMENT);
e = env->element;
if ( e->structure && !e->structure->omit_close )
2010-05-06 10:59:09 +01:00
gripe(p, ERC_OMITTED_CLOSE, e->name->name);
close_element(p, e, FALSE);
}
}
return rval;
}
/* begin_document_dtd_parser() prepares `p' for a new document.  All it
   does is call init_decoding() on the parser; it always returns TRUE.
*/

int
begin_document_dtd_parser(dtd_parser *p)
{ init_decoding(p);

  return TRUE;
}
/* reset_document_dtd_parser() brings `p' back to its initial document
   state.  It frees the complete chain of environments, pops every marked
   section, empties the input buffer and the cdata buffer, resets the
   state fields to their initial values and finally calls
   begin_document_dtd_parser().
*/

void
reset_document_dtd_parser(dtd_parser *p)
{ sgml_environment *env = p->environments;

  while ( env )                         /* free from innermost outwards */
  { sgml_environment *up = env->parent;

    free_environment(env);
    env = up;
  }
  p->environments = NULL;

  while ( p->marked )
    pop_marked_section(p);

  empty_icharbuf(p->buffer);
  empty_ocharbuf(p->cdata);

  p->mark_state  = MS_INCLUDE;
  p->state       = S_PCDATA;
  p->grouplevel  = 0;
  p->blank_cdata = TRUE;
  p->event_class = EV_EXPLICIT;
  p->dmode       = DM_DATA;

  begin_document_dtd_parser(p);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
process_utf8() sets the UTF-8 state after `chr', the first byte of a
multi-byte sequence, has been read.

The bits of `chr' are scanned downwards from 0x20.  Each bit that is set
stands for one more byte to come, on top of the one byte that always
follows.  The bits below the first clear one are the payload of the first
byte and initialise utf8_char.  The current state is saved in
utf8_saved_state and the parser switches to S_UTF8, where
putchar_dtd_parser() collects the utf8_left remaining bytes.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#ifdef UTF8
static void
process_utf8(dtd_parser *p, int chr)
{ int bytes = 1;
  int mask  = 0x20;

  while ( chr & mask )
  { bytes++;
    mask >>= 1;
  }
  mask--;                               /* 0x20 --> 0x1f */

  p->utf8_saved_state = p->state;       /* state to return to */
  p->state            = S_UTF8;
  p->utf8_char        = chr & mask;
  p->utf8_left        = bytes;
}
#endif
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
add_cdata() adds a character to the output data. It also maps \r\n onto
a single \n for Windows newline conventions.  Nothing is added unless the
parser is in an included section (mark_state == MS_INCLUDE).

There is a problem here in shortref handling. We open the CDATA_ELEMENT
as soon as we find a non-blank character as this may open other elements
through omitted tags and thus install a new shortref map.

If, at a later stage, all CDATA read so far turns out to be a shortref we
have incorrectly opened the CDATA_ELEMENT. As `undoing' the
open_element() is not an option (it may already have caused `events' on
omitted tags) we are in trouble.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

static void
add_cdata(dtd_parser *p, int chr)
{ if ( p->mark_state == MS_INCLUDE )
  { ocharbuf *buf = p->cdata;

    if ( p->blank_cdata == TRUE &&
         !HasClass(p->dtd, (wint_t)chr, CH_BLANK) )
    { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE);
      p->blank_cdata = FALSE;
    }

    if ( chr == '\n' )                  /* insert missing CR */
    { int sz;

      if ( (sz=buf->size) == 0 ||
           fetch_ocharbuf(buf, sz-1) != CR )
        add_cdata(p, CR);
    }

    add_ocharbuf(buf, chr);

    /* `chr' is a plain int and is used as an index into map->ends[], so
       both bounds are checked before indexing.
    */
    if ( p->map &&
         chr >= 0 && chr <= 0xff && p->map->ends[chr] &&
         match_shortref(p) )
      return;

    if ( chr == '\n' )                  /* dubious.  Should we do that */
    { int sz;                           /* here or in space-handling? */

      if ( (sz=buf->size) > 1 &&
           fetch_ocharbuf(buf, sz-1) == LF &&
           fetch_ocharbuf(buf, sz-2) == CR )
      { poke_ocharbuf(buf, sz-2, LF);   /* replace CR LF by a single LF */
        buf->size--;
      }
    }
  }
}
/* add_verbatim_cdata() adds `chr' to the output data without the CR
   insertion and shortref matching that add_cdata() performs.  The only
   newline handling is that a '\r' at the end of the buffer is dropped
   when '\n' is added, so \r\n becomes \n.  Nothing is added while the
   parser is in an ignored marked section (mark_state == MS_IGNORE).  As
   in add_cdata(), the first non-blank character opens the CDATA_ELEMENT.
*/

static void
add_verbatim_cdata(dtd_parser *p, int chr)
{ if ( p->mark_state != MS_IGNORE )
  { ocharbuf *buf = p->cdata;

    if ( p->blank_cdata == TRUE &&
         !HasClass(p->dtd, (wint_t)chr, CH_BLANK) )
    { p->cdata_must_be_empty = !open_element(p, CDATA_ELEMENT, FALSE);
      p->blank_cdata = FALSE;
    }

    if ( chr == '\n' && buf->size > 0 &&
         fetch_ocharbuf(buf, buf->size-1) == '\r' )
      buf->size--;                      /* \r\n --> \n */

    add_ocharbuf(buf, chr);
  }
}
/* We discovered illegal markup and now process it as normal CDATA: the
   saved character (p->saved) followed by everything collected in the
   input buffer is handed to add_cdata(), after which the parser is back
   in S_PCDATA.
*/

static void
recover_parser(dtd_parser *p)
{ const ichar *in;

  terminate_icharbuf(p->buffer);
  add_cdata(p, p->saved);

  in = p->buffer->data;
  while ( *in )
  { add_cdata(p, *in);
    in++;
  }

  p->state = S_PCDATA;
}
/* setlocation() fills `d' with the position of the character that was
   just read.  The type and name are copied from `loc'; the line and line
   position are passed in by the caller, and the character position is
   one less than that of `loc'.
*/

static inline void
setlocation(dtd_srcloc *d, dtd_srcloc *loc, int line, int lpos)
{ d->type    = loc->type;
  d->name    = loc->name;
  d->charpos = loc->charpos - 1;
  d->line    = line;
  d->linepos = lpos;
}
2010-05-06 10:59:09 +01:00
int
putchar_dtd_parser(dtd_parser *p, int chr)
{ dtd *dtd = p->dtd;
const ichar *f = dtd->charfunc->func;
int line = p->location.line;
int lpos = p->location.linepos;
2010-05-06 10:59:09 +01:00
p->location.charpos++; /* TBD: actually `bytepos' */
#ifdef UTF8
if ( p->state == S_UTF8 )
{ if ( (chr & 0xc0) != 0x80 ) /* TBD: recover */
2010-05-06 10:59:09 +01:00
gripe(p, ERC_SYNTAX_ERROR, L"Bad UTF-8 sequence", L"");
p->utf8_char <<= 6;
p->utf8_char |= (chr & ~0xc0);
if ( --p->utf8_left == 0 )
{ chr = p->utf8_char;
p->state = p->utf8_saved_state;
} else
2010-05-06 10:59:09 +01:00
{ return TRUE;
}
} else if ( ISUTF8_MB(chr) && p->utf8_decode )
{ process_utf8(p, chr);
2010-05-06 10:59:09 +01:00
return TRUE;
}
#endif
if ( f[CF_RS] == chr )
{ p->location.line++;
p->location.linepos = 0;
} else
{ if ( f[CF_RE] == chr )
p->location.linepos = 0;
else
p->location.linepos++;
}
reprocess:
switch(p->state)
{ case S_PCDATA:
{ if ( f[CF_MDO1] == chr ) /* < */
{ setlocation(&p->startloc, &p->location, line, lpos);
p->state = S_DECL0;
empty_icharbuf(p->buffer);
2010-05-06 10:59:09 +01:00
return TRUE;
}
if ( p->dmode == DM_DTD )
{ if ( f[CF_PERO] == chr ) /* % */
{ setlocation(&p->startloc, &p->location, line, lpos);
p->state = S_PENT;
2010-05-06 10:59:09 +01:00
return TRUE;
}
} else
{ if ( f[CF_ERO] == chr ) /* & */
{ setlocation(&p->startloc, &p->location, line, lpos);
p->state = S_ENT0;
2010-05-06 10:59:09 +01:00
return TRUE;
}
}
2010-05-06 10:59:09 +01:00
if ( p->marked && f[CF_DSC] == chr ) /* ] in marked section */
{ empty_icharbuf(p->buffer);
p->state = S_EMSC1;
p->saved = chr; /* for recovery */
2010-05-06 10:59:09 +01:00
return TRUE;
}
if ( p->waiting_for_net && f[CF_ETAGO2] == chr ) /* shorttag */
{ setlocation(&p->startloc, &p->location, line, lpos);
2010-05-06 10:59:09 +01:00
process_net(p);
return TRUE;
}
/* Real character data */
if ( p->cdata->size == 0 )
setlocation(&p->startcdata, &p->location, line, lpos);
add_cdata(p, chr);
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_ECDATA2: /* Seen </ in CDATA/RCDATA */
{ if ( f[CF_MDC] == chr &&
p->etaglen == p->buffer->size &&
istrncaseeq(p->buffer->data, p->etag, p->etaglen) )
{ p->cdata->size -= p->etaglen+2; /* 2 for </ */
terminate_ocharbuf(p->cdata);
terminate_icharbuf(p->buffer);
if ( p->mark_state == MS_INCLUDE )
2010-05-06 10:59:09 +01:00
{ process_cdata(p, TRUE);
process_end_element(p, p->buffer->data);
empty_cdata(p);
}
empty_icharbuf(p->buffer);
p->cdata_state = p->state = S_PCDATA;
} else
{ add_verbatim_cdata(p, chr);
if ( p->etaglen < p->buffer->size ||
!HasClass(dtd, (wint_t)chr, CH_NAME))
{ empty_icharbuf(p->buffer); /* mismatch */
p->state = p->cdata_state;
} else
add_icharbuf(p->buffer, chr);
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_ECDATA1: /* seen < in CDATA */
{ add_verbatim_cdata(p, chr);
if ( f[CF_ETAGO2] == chr ) /* / */
{ empty_icharbuf(p->buffer);
p->state = S_ECDATA2;
} else if ( f[CF_ETAGO1] != chr ) /* <: do not change state */
p->state = p->cdata_state;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_RCDATA:
{ if ( f[CF_ERO] == chr ) /* & */
{ setlocation(&p->startloc, &p->location, line, lpos);
p->state = S_ENT0;
2010-05-06 10:59:09 +01:00
return TRUE;
}
/*FALLTHROUGH*/
}
case S_CDATA:
{ add_verbatim_cdata(p, chr);
if ( f[CF_MDO1] == chr ) /* < */
{ setlocation(&p->startloc, &p->location, line, lpos);
p->state = S_ECDATA1;
}
/* / in CDATA shorttag element */
if ( p->waiting_for_net && f[CF_ETAGO2] == chr )
{ setlocation(&p->startloc, &p->location, line, lpos);
p->cdata->size--;
terminate_ocharbuf(p->cdata);
terminate_icharbuf(p->buffer);
if ( p->mark_state == MS_INCLUDE )
2010-05-06 10:59:09 +01:00
{ process_cdata(p, TRUE);
process_net(p);
empty_cdata(p);
}
empty_icharbuf(p->buffer);
p->cdata_state = p->state = S_PCDATA;
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_MSCDATA:
{ add_verbatim_cdata(p, chr);
if ( f[CF_DSC] == chr ) /* ] */
p->state = S_EMSCDATA1;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_EMSCDATA1:
{ add_verbatim_cdata(p, chr);
if ( f[CF_DSC] == chr ) /* ]] */
p->state = S_EMSCDATA2;
else
p->state = S_MSCDATA;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_EMSCDATA2:
{ add_verbatim_cdata(p, chr);
if ( f[CF_MDC] == chr ) /* ]]> */
{ p->cdata->size -= 3; /* Delete chars for ]] */
pop_marked_section(p);
p->state = S_PCDATA;
} else if ( f[CF_DSC] != chr ) /* if ]]], stay in this state */
p->state = S_MSCDATA;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_EMSC1:
{ if ( f[CF_DSC] == chr ) /* ]] in marked section */
{ p->state = S_EMSC2;
2010-05-06 10:59:09 +01:00
return TRUE;
} else
{ add_icharbuf(p->buffer, chr);
recover_parser(p);
2010-05-06 10:59:09 +01:00
return TRUE;
}
}
case S_EMSC2:
{ if ( f[CF_MDC] == chr ) /* ]]> in marked section */
{ pop_marked_section(p);
p->state = S_PCDATA;
2010-05-06 10:59:09 +01:00
return TRUE;
} else
{ add_icharbuf(p->buffer, chr);
recover_parser(p);
2010-05-06 10:59:09 +01:00
return TRUE;
}
}
case S_PENT: /* %parameter entity; */
{ if ( f[CF_ERC] == chr )
{ p->state = S_PCDATA;
terminate_icharbuf(p->buffer);
if ( p->mark_state == MS_INCLUDE )
2010-05-06 10:59:09 +01:00
{ process_include(p, p->buffer->data);
}
empty_icharbuf(p->buffer);
2010-05-06 10:59:09 +01:00
return TRUE;
}
if ( HasClass(dtd, (wint_t)chr, CH_NAME) )
{ add_icharbuf(p->buffer, chr);
2010-05-06 10:59:09 +01:00
return TRUE;
}
terminate_icharbuf(p->buffer);
2010-05-06 10:59:09 +01:00
return gripe(p, ERC_SYNTAX_ERROR,
L"Illegal parameter entity", p->buffer->data);
}
case S_ENT0: /* Seen & */
{ if ( chr == '#' || HasClass(dtd, (wint_t)chr, CH_NAME) )
{ empty_icharbuf(p->buffer);
add_icharbuf(p->buffer, chr);
p->state = S_ENT;
} else
{ if ( dtd->dialect != DL_SGML )
{ wchar_t buf[3];
buf[0] = '&';
buf[1] = chr;
buf[2] = '\0';
2010-05-06 10:59:09 +01:00
gripe(p, ERC_SYNTAX_ERROR, L"Illegal entity", buf);
}
add_cdata(p, f[CF_ERO]);
p->state = p->cdata_state;
goto reprocess;
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_ENT: /* &entity; */
{ if ( HasClass(dtd, (wint_t)chr, CH_NAME) )
{ add_icharbuf(p->buffer, chr);
2010-05-06 10:59:09 +01:00
return TRUE;
}
terminate_icharbuf(p->buffer);
p->state = p->cdata_state;
if ( p->mark_state == MS_INCLUDE )
2010-05-06 10:59:09 +01:00
{ process_entity(p, p->buffer->data);
}
empty_icharbuf(p->buffer);
2010-05-06 10:59:09 +01:00
if ( chr == CR )
p->state = S_ENTCR;
else if ( f[CF_ERC] != chr && chr != '\n' )
goto reprocess;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_ENTCR: /* seen &entCR, eat the LF */
{ p->state = p->cdata_state;
if ( chr != LF )
goto reprocess;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_DECL0: /* Seen < */
{ if ( f[CF_ETAGO2] == chr ) /* </ */
{ add_icharbuf(p->buffer, chr);
p->state = S_DECL;
} else if ( HasClass(dtd, (wint_t)chr, CH_NAME) ) /* <letter */
{ add_icharbuf(p->buffer, chr);
p->state = S_DECL;
} else if ( f[CF_MDO2] == chr ) /* <! */
{ p->state = S_MDECL0;
} else if ( f[CF_PRO2] == chr ) /* <? */
{ p->state = S_PI;
} else /* recover */
{ add_cdata(p, f[CF_MDO1]);
add_cdata(p, chr);
p->state = S_PCDATA;
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_MDECL0: /* Seen <! */
{ if ( f[CF_CMT] == chr ) /* <!- */
{ p->state = S_CMTO;
2010-05-06 10:59:09 +01:00
return TRUE;
}
add_icharbuf(p->buffer, f[CF_MDO2]);
add_icharbuf(p->buffer, chr);
p->state = S_DECL;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_DECL: /* <...> */
{ if ( f[CF_MDC] == chr ) /* > */
{ prepare_cdata(p);
p->state = S_PCDATA;
terminate_icharbuf(p->buffer);
if ( p->mark_state == MS_INCLUDE )
2010-05-06 10:59:09 +01:00
{ process_declaration(p, p->buffer->data);
}
empty_icharbuf(p->buffer);
2010-05-06 10:59:09 +01:00
return TRUE;
}
if ( dtd->shorttag && f[CF_ETAGO2] == chr && p->buffer->size > 0 )
{ prepare_cdata(p);
p->state = S_PCDATA;
terminate_icharbuf(p->buffer);
if ( p->mark_state == MS_INCLUDE )
{ WITH_CLASS(p, EV_SHORTTAG,
2010-05-06 10:59:09 +01:00
process_declaration(p, p->buffer->data));
}
empty_icharbuf(p->buffer);
p->waiting_for_net = TRUE;
2010-05-06 10:59:09 +01:00
return TRUE;
}
add_icharbuf(p->buffer, chr);
if ( f[CF_LIT] == chr ) /* " */
{ p->state = S_STRING;
p->saved = chr;
p->lit_saved_state = S_DECL;
} else if ( f[CF_LITA] == chr ) /* ' */
{ p->state = S_STRING;
p->saved = chr;
p->lit_saved_state = S_DECL;
2010-05-06 10:59:09 +01:00
return TRUE;
} else if ( f[CF_CMT] == chr && /* - */
p->buffer->data[0] == f[CF_MDO2] ) /* Started <! */
{ p->state = S_DECLCMT0;
} else if ( f[CF_DSO] == chr ) /* [: marked section */
{ terminate_icharbuf(p->buffer);
process_marked_section(p);
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_DECLCMT0: /* <...- */
{ if ( f[CF_CMT] == chr )
{ p->buffer->size--;
p->state = S_DECLCMT;
} else
{ add_icharbuf(p->buffer, chr);
p->state = S_DECL;
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_DECLCMT: /* <...--.. */
{ if ( f[CF_CMT] == chr )
p->state = S_DECLCMTE0;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_DECLCMTE0: /* <...--..- */
{ if ( f[CF_CMT] == chr )
p->state = S_DECL;
else
p->state = S_DECLCMT;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_PI:
{ add_icharbuf(p->buffer, chr);
if ( f[CF_PRO2] == chr ) /* <? ... ? */
p->state = S_PI2;
if ( f[CF_PRC] == chr ) /* no ? is ok too (XML/SGML) */
goto pi;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_PI2:
{ if ( f[CF_PRC] == chr )
{ pi:
process_cdata(p, FALSE);
p->state = S_PCDATA;
p->buffer->size--;
terminate_icharbuf(p->buffer);
if ( p->mark_state == MS_INCLUDE )
2010-05-06 10:59:09 +01:00
{ process_pi(p, p->buffer->data);
}
empty_icharbuf(p->buffer);
2010-05-06 10:59:09 +01:00
return TRUE;
}
add_icharbuf(p->buffer, chr);
p->state = S_PI;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_STRING:
{ add_icharbuf(p->buffer, chr);
if ( chr == p->saved )
p->state = p->lit_saved_state;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_CMTO: /* Seen <!- */
{ if ( f[CF_CMT] == chr ) /* - */
{ p->state = S_CMT1;
2010-05-06 10:59:09 +01:00
return TRUE;
} else
{ add_cdata(p, f[CF_MDO1]);
add_cdata(p, f[CF_MDO2]);
add_cdata(p, f[CF_CMT]);
add_cdata(p, chr);
p->state = S_PCDATA;
2010-05-06 10:59:09 +01:00
return TRUE;
}
}
case S_CMT1: /* <!-- */
{ if ( f[CF_CMT] == chr ) /* <!--- */
{ if ( dtd->dialect != DL_SGML )
2010-05-06 10:59:09 +01:00
gripe(p, ERC_SYNTAX_ERROR, L"Illegal comment", L"<!---");
}
p->state = S_CMT;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_CMT:
{ if ( f[CF_CMT] == chr )
p->state = S_CMTE0; /* <!--...- */
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_CMTE0: /* <!--... -- */
{ if ( f[CF_CMT] == chr )
p->state = S_CMTE1;
else
p->state = S_CMT;
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_CMTE1: /* <!--...-- seen */
{ if ( f[CF_MDC] == chr ) /* > */
{ if ( p->on_decl )
(*p->on_decl)(p, (ichar*)"");
p->state = S_PCDATA;
} else
{ if ( dtd->dialect != DL_SGML )
2010-05-06 10:59:09 +01:00
gripe(p, ERC_SYNTAX_ERROR, L"Illegal comment", L"");
if ( f[CF_CMT] != chr )
p->state = S_CMT;
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
case S_GROUP: /* [...] in declaration */
{ add_icharbuf(p->buffer, chr);
if ( f[CF_DSO] == chr )
{ p->grouplevel++;
} else if ( f[CF_DSC] == chr )
{ if ( --p->grouplevel == 0 )
p->state = S_DECL;
} else if ( f[CF_LIT] == chr ) /* " */
{ p->state = S_STRING;
p->saved = chr;
p->lit_saved_state = S_GROUP;
} else if ( f[CF_LITA] == chr ) /* ' */
{ p->state = S_STRING;
p->saved = chr;
p->lit_saved_state = S_GROUP;
2010-05-06 10:59:09 +01:00
return TRUE;
}
2010-05-06 10:59:09 +01:00
return TRUE;
}
#ifdef UTF8
case S_UTF8:
#endif
2010-05-06 10:59:09 +01:00
default:
assert(0);
return FALSE;
}
}
/*******************************
* TOPLEVEL *
*******************************/
/* load_dtd_from_file() reads `file' and feeds it byte by byte to the
   parser in DTD mode (DM_DTD), starting in S_PCDATA.  The location, mode
   and state of the parser are saved first and restored afterwards.

   Returns TRUE if the file could be opened, in which case the implicit
   flag of the DTD is cleared, and FALSE otherwise.  The values returned
   by putchar_dtd_parser() are not inspected.
*/

int
load_dtd_from_file(dtd_parser *p, const ichar *file)
{ data_mode saved_mode  = p->dmode;
  dtdstate  saved_state = p->state;
  locbuf    saved_loc;
  int       rval = FALSE;
  FILE     *in;

  push_location(p, &saved_loc);
  p->dmode = DM_DTD;
  p->state = S_PCDATA;
  empty_icharbuf(p->buffer);            /* dubious */
  set_file_dtd_parser(p, IN_FILE, file);

  in = wfopen(file, "rb");
  if ( in )
  { for(;;)
    { int c = getc(in);

      if ( c == EOF )
        break;
      putchar_dtd_parser(p, c);
    }

    fclose(in);
    p->dtd->implicit = FALSE;
    rval = TRUE;
  }

  pop_location(p, &saved_loc);
  p->dmode = saved_mode;
  p->state = saved_state;

  return rval;
}
/* file_to_dtd() creates a new DTD for `doctype', sets its dialect and
   loads it from `file' using a temporary parser.  On success the
   reference count of the DTD is incremented before the parser is freed,
   so the DTD is not deleted with it, and the DTD is returned.  If
   loading fails the parser is freed and NULL is returned.
*/

dtd *
file_to_dtd(const ichar *file, const ichar *doctype, dtd_dialect dialect)
{ dtd_parser *p = new_dtd_parser(new_dtd(doctype));
  dtd *result = NULL;

  set_dialect_dtd(p->dtd, dialect);

  if ( load_dtd_from_file(p, file) )
  { result = p->dtd;
    result->references++;               /* avoid deletion */
  }

  free_dtd_parser(p);

  return result;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
SGML sees a file as

        [<LF>]Line 1<CR>
         <LF> Line 2<CR>

I.e. the newline appearing just before the end-of-file should be
ignored. In addition, Unix-style files are mapped to CR-LF. Thanks to
Richard O'Keefe.

sgml_process_stream() therefore reads two characters ahead (p0, p1) of
what it hands to putchar_dtd_parser().  If the last character is LF it is
dropped, and a CR is emitted instead unless the character before it
already is a CR.

On end of input end_document_dtd_parser() is called and its result is
returned, unless `flags' contains SGML_SUB_DOCUMENT, in which case the
enclosing document continues and TRUE is returned.  An empty stream
returns TRUE without touching the parser.  The values returned by
putchar_dtd_parser() are not inspected.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

int
sgml_process_stream(dtd_parser *p, FILE *fd, unsigned flags)
{ int p0, p1;

  if ( (p0 = getc(fd)) == EOF )
    return TRUE;

  if ( (p1 = getc(fd)) == EOF )         /* input of a single character */
  { putchar_dtd_parser(p, p0);

    if ( flags & SGML_SUB_DOCUMENT )    /* as for longer input, below */
      return TRUE;
    else
      return end_document_dtd_parser(p);
  }

  for(;;)
  { int p2 = getc(fd);

    if ( p2 == EOF )
    { putchar_dtd_parser(p, p0);
      if ( p1 != LF )
        putchar_dtd_parser(p, p1);
      else if ( p0 != CR )
        putchar_dtd_parser(p, CR);

      if ( flags & SGML_SUB_DOCUMENT )
        return TRUE;
      else
        return end_document_dtd_parser(p);
    }

    putchar_dtd_parser(p, p0);
    p0 = p1;
    p1 = p2;
  }
}
/* sgml_process_file() opens `file' and processes its content through
   sgml_process_stream().  The current location is saved and restored
   around the call.  Unless `flags' contains SGML_SUB_DOCUMENT the parser
   is first put in DM_DATA mode.

   Returns the result of sgml_process_stream(), or FALSE if the file
   cannot be opened.
*/

int
sgml_process_file(dtd_parser *p, const ichar *file, unsigned flags)
{ locbuf saved_loc;
  int    rval = FALSE;
  FILE  *in;

  push_location(p, &saved_loc);
  set_file_dtd_parser(p, IN_FILE, file);
  if ( (flags & SGML_SUB_DOCUMENT) == 0 )
    set_mode_dtd_parser(p, DM_DATA);

  in = wfopen(file, "rb");
  if ( in )
  { rval = sgml_process_stream(p, in, flags);
    fclose(in);
  }

  pop_location(p, &saved_loc);

  return rval;
}
/*******************************
* ERRORS *
*******************************/
/* format_location() writes a description of source location `l' at `s',
   which has room for `len' wide characters including the terminating
   nul.  The innermost location comes first; each parent location follows
   as " (from <location>)".  The description ends in ": ".

   Returns a pointer to the terminating nul of what was written.  If `l'
   is NULL or of type IN_NONE, or if there is no room at all, nothing is
   written and `s' is returned unchanged.

   The remaining room is tracked while writing, so output that does not
   fit is truncated rather than written beyond the buffer.
*/

static wchar_t *
format_location(wchar_t *s, size_t len, dtd_srcloc *l)
{ int first = TRUE;
  size_t left = len;                    /* room at s, nul included */
  size_t n;

  if ( !l || l->type == IN_NONE )
    return s;
  if ( left == 0 )
    return s;

  for( ; l && l->type != IN_NONE;
       l = l->parent, first = FALSE )
  { if ( !first )
    { s[0] = L'\0';
      swprintf(s, left, L" (from ");
      s[left-1] = L'\0';                /* swprintf() may have truncated */
      n = wcslen(s);
      s += n;
      left -= n;
    }

    s[0] = L'\0';
    switch(l->type)
    { case IN_NONE:                     /* excluded by the loop condition */
        assert(0);
        break;
      case IN_FILE:
        swprintf(s, left, L"%ls:%d:%d", l->name.file, l->line, l->linepos);
        break;
      case IN_ENTITY:
        swprintf(s, left, L"&%ls;%d:%d", l->name.entity, l->line, l->linepos);
        break;
    }
    s[left-1] = L'\0';                  /* swprintf() may have truncated */
    n = wcslen(s);
    s += n;
    left -= n;

    if ( !first && left > 1 )
    { *s++ = L')';
      left--;
    }
  }

  if ( left > 2 )
  { *s++ = L':';
    *s++ = L' ';
  }
  *s = L'\0';

  return s;
}
/* format_message() builds the textual message for error `e' from its
   severity, location, id and arguments.  The text has the form

        [Error: |Warning: ][<location>: ]<message>

   It is stored through str2ring() in e->message; e->plain_message
   points into the same string, just behind the severity and location
   prefix.  Text that does not fit in the local buffer is truncated.
*/

static void
format_message(dtd_error *e)
{ wchar_t buf[1024];
  const size_t size = sizeof(buf)/sizeof(buf[0]);
  wchar_t *s;
  int prefix_len;
  int left;

  switch(e->severity)
  { case ERS_ERROR:
      wcscpy(buf, L"Error: ");
      break;
    case ERS_WARNING:
      wcscpy(buf, L"Warning: ");
      break;
    default:
      buf[0] = L'\0';
      break;
  }

  s = buf+wcslen(buf);
  s = format_location(s, size-(size_t)(s-buf), e->location);
  if ( s > buf+size-1 )                 /* keep room for the nul */
    s = buf+size-1;
  *s = L'\0';                           /* in case no message is added */

  prefix_len = (int)(s-buf);
  left = (int)size-prefix_len;

  switch(e->id)
  { case ERC_REPRESENTATION:
      swprintf(s, left, L"Cannot represent due to %ls", e->argv[0]);
      break;
    case ERC_RESOURCE:
      swprintf(s, left, L"Insufficient %ls resources", e->argv[0]);
      break;
    case ERC_LIMIT:
      swprintf(s, left, L"%ls limit exceeded", e->argv[0]);
      break;
    case ERC_VALIDATE:
      swprintf(s, left, L"%ls", e->argv[0]);
      break;
    case ERC_SYNTAX_ERROR:
      swprintf(s, left, L"Syntax error: %ls", e->argv[0]);
      break;
    case ERC_EXISTENCE:
      swprintf(s, left, L"%ls \"%ls\" does not exist", e->argv[0], e->argv[1]);
      break;
    case ERC_REDEFINED:
      swprintf(s, left, L"Redefined %ls \"%ls\"", e->argv[0], e->argv[1]);
      break;
    default:
      ;
  }
  buf[size-1] = L'\0';                  /* swprintf() may have truncated */

  e->message = str2ring(buf);
  e->plain_message = e->message + prefix_len;
}
int
2010-05-06 10:59:09 +01:00
gripe(dtd_parser *p, dtd_error_id e, ...)
{ va_list args;
wchar_t buf[1024];
dtd_error error;
int dtdmode = FALSE;
void *freeme = NULL;
va_start(args, e);
memset(&error, 0, sizeof(error));
error.minor = e; /* detailed error code */
2010-05-06 10:59:09 +01:00
if ( p )
{ error.location = &p->location;
if ( p->dmode == DM_DTD )
dtdmode = TRUE;
} else
{ error.location = NULL;
}
switch(e)
{ case ERC_REPRESENTATION:
case ERC_RESOURCE:
error.severity = ERS_ERROR;
error.argv[0] = va_arg(args, wchar_t *);
break;
case ERC_LIMIT:
error.severity = ERS_WARNING;
error.argv[0] = va_arg(args, wchar_t *);
break;
case ERC_SYNTAX_ERROR:
case ERC_SYNTAX_WARNING:
{ wchar_t *m = va_arg(args, wchar_t *);
const wchar_t *s = va_arg(args, const wchar_t *);
if ( s && *s )
{ swprintf(buf, 1024, L"%ls, found \"%ls\"", m, str_summary(s, 25));
error.argv[0] = buf;
} else
error.argv[0] = m;
2010-05-06 10:59:09 +01:00
error.severity = (e == ERC_SYNTAX_WARNING ? ERS_WARNING : ERS_ERROR);
e = ERC_SYNTAX_ERROR;
break;
}
case ERC_DOMAIN:
{ const wchar_t *expected = va_arg(args, const wchar_t *);
const wchar_t *found = str_summary(va_arg(args, const wchar_t *), 25);
swprintf(buf, 1024, L"Expected type %ls, found \"%ls\"", expected, found);
error.argv[0] = buf;
error.severity = ERS_ERROR;
e = (dtdmode ? ERC_SYNTAX_ERROR : ERC_VALIDATE);
break;
}
case ERC_REDEFINED:
{ dtd_symbol *name;
error.argv[0] = va_arg(args, wchar_t *); /* type */
name = va_arg(args, dtd_symbol *); /* name */
error.argv[1] = (ichar*)name->name;
error.severity = ERS_STYLE;
break;
}
case ERC_EXISTENCE:
{ error.argv[0] = va_arg(args, wchar_t *); /* type */
error.argv[1] = va_arg(args, wchar_t *); /* name */
error.severity = ERS_ERROR;
break;
}
case ERC_VALIDATE:
{ error.argv[0] = va_arg(args, wchar_t *); /* message */
error.severity = ERS_WARNING;
break;
}
case ERC_OMITTED_CLOSE:
2010-05-06 10:59:09 +01:00
{ const wchar_t *element = va_arg(args, const wchar_t *);
swprintf(buf, 1024, L"Inserted omitted end-tag for \"%ls\"", element);
error.argv[0] = buf;
error.severity = ERS_WARNING;
e = ERC_VALIDATE;
break;
}
case ERC_OMITTED_OPEN:
2010-05-06 10:59:09 +01:00
{ const wchar_t *element = va_arg(args, const wchar_t *);
swprintf(buf, 1024, L"Inserted omitted start-tag for \"%ls\"", element);
error.argv[0] = buf;
error.severity = ERS_WARNING;
e = ERC_VALIDATE;
break;
}
case ERC_NOT_OPEN:
2010-05-06 10:59:09 +01:00
{ const wchar_t *element = va_arg(args, const wchar_t *);
swprintf(buf, 1024, L"Ignored end-tag for \"%ls\" which is not open",
element);
error.argv[0] = buf;
error.severity = ERS_WARNING;
e = ERC_VALIDATE;
break;
}
case ERC_NOT_ALLOWED:
2010-05-06 10:59:09 +01:00
{ const wchar_t *element = va_arg(args, const wchar_t *);
swprintf(buf, 1024, L"Element \"%ls\" not allowed here", element);
error.argv[0] = buf;
error.severity = ERS_WARNING;
e = ERC_VALIDATE;
break;
}
case ERC_NOT_ALLOWED_PCDATA:
2010-05-06 10:59:09 +01:00
{ const ocharbuf *cdata = va_arg(args, const ocharbuf *);
2010-05-06 10:59:09 +01:00
swprintf(buf, 1024, L"#PCDATA (\"%ls\") not allowed here",
str_summary(cdata->data.w, 25));
error.argv[0] = buf;
error.severity = ERS_WARNING;
e = ERC_VALIDATE;
2010-05-06 10:59:09 +01:00
break;
}
case ERC_NO_ATTRIBUTE:
{ const wchar_t *elem = va_arg(args, wchar_t *); /* element */
const wchar_t *attr = va_arg(args, wchar_t *); /* attribute */
swprintf(buf, 1024, L"Element \"%ls\" has no attribute \"%ls\"",
elem, attr);
error.argv[0] = buf;
error.severity = ERS_WARNING;
e = ERC_VALIDATE;
break;
}
case ERC_NO_ATTRIBUTE_VALUE:
{ const wchar_t *elem = va_arg(args, wchar_t *); /* element */
const wchar_t *value = va_arg(args, wchar_t *); /* attribute value */
swprintf(buf, 1024, L"Element \"%ls\" has no attribute with value \"%ls\"",
elem, value);
error.argv[0] = buf;
error.severity = ERS_WARNING;
e = ERC_VALIDATE;
break;
}
case ERC_NO_VALUE:
{ error.argv[0] = L"entity value";
error.argv[1] = va_arg(args, wchar_t *); /* entity */
error.severity = ERS_ERROR;
e = ERC_EXISTENCE;
break;
}
case ERC_NO_DOCTYPE:
{ const wchar_t *doctype = va_arg(args, wchar_t *); /* element */
const wchar_t *file = va_arg(args, wchar_t *); /* DTD file */
swprintf(buf, 1024, L"No <!DOCTYPE ...>, assuming \"%ls\" from DTD file \"%s\"",
doctype, file);
error.argv[0] = buf;
error.severity = ERS_WARNING;
2010-05-06 10:59:09 +01:00
e = ERC_VALIDATE;
break;
}
case ERC_NO_CATALOGUE:
{ char *file = va_arg(args, char *); /* catalogue file */
error.argv[0] = L"catalogue file";
freeme = error.argv[1] = utf8towcs(file);
error.severity = ERS_WARNING;
e = ERC_EXISTENCE;
break;
}
2010-05-06 10:59:09 +01:00
}
error.id = e;
format_message(&error);
2010-05-06 10:59:09 +01:00
if ( p && p->on_error )
(*p->on_error)(p, &error);
else
fwprintf(stderr, L"SGML: %ls\n", error.message);
if ( freeme )
sgml_free(freeme);
va_end(args);
return FALSE;
}