update sgml package.
This commit is contained in:
parent
0fabe2b9c6
commit
261b5163c7
@ -155,7 +155,7 @@ localpath(const ichar *ref, const ichar *name)
|
||||
}
|
||||
|
||||
|
||||
static int
|
||||
int
|
||||
register_catalog_file_unlocked(const ichar *file, catalog_location where)
|
||||
{ catalog_file **f = &catalog;
|
||||
catalog_file *cf;
|
||||
@ -205,7 +205,7 @@ wgetenv(const char *name)
|
||||
|
||||
|
||||
static void
|
||||
init_catalog(void)
|
||||
init_catalog()
|
||||
{ static int done = FALSE;
|
||||
|
||||
LOCK();
|
||||
@ -241,7 +241,7 @@ init_catalog(void)
|
||||
int
|
||||
register_catalog_file(const ichar *file, catalog_location where)
|
||||
{ int rc;
|
||||
|
||||
|
||||
init_catalog();
|
||||
|
||||
LOCK();
|
||||
@ -310,7 +310,7 @@ cs_streql(ichar const *a, ichar const *b)
|
||||
|
||||
static int
|
||||
scan_overflow(size_t buflen)
|
||||
{ gripe(ERC_REPRESENTATION, L"token length");
|
||||
{ gripe(NULL, ERC_REPRESENTATION, L"token length");
|
||||
|
||||
return EOF;
|
||||
}
|
||||
@ -439,7 +439,7 @@ load_one_catalogue(catalog_file * file)
|
||||
int override = 0;
|
||||
|
||||
if ( !src )
|
||||
{ gripe(ERC_NO_CATALOGUE, file->file);
|
||||
{ gripe(NULL, ERC_NO_CATALOGUE, file->file);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -514,7 +514,7 @@ load_one_catalogue(catalog_file * file)
|
||||
|
||||
To look up a parameter entity:
|
||||
f = find_in_catalogue(CAT_PENTITY, name, pubid, sysid, ci);
|
||||
The name may begin with a % but need not; if it doesn't
|
||||
The name may begin with a % but need not; if it doesn't
|
||||
a % will be prefixed for the search.
|
||||
If it cannot otherwise be found ${name}.pen will be returned.
|
||||
|
||||
@ -635,7 +635,7 @@ find_in_catalogue(int kind,
|
||||
return 0;
|
||||
|
||||
if ( istrlen(name)+4+1 > penlen )
|
||||
{ gripe(ERC_REPRESENTATION, L"entity name");
|
||||
{ gripe(NULL, ERC_REPRESENTATION, L"entity name");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -44,7 +44,7 @@ new_charclass()
|
||||
char_range(map, 'a', 'z', CH_LCLETTER);
|
||||
char_range(map, 'A', 'Z', CH_LCLETTER);
|
||||
char_range(map, '0', '9', CH_DIGIT);
|
||||
|
||||
|
||||
ca['.'] |= CH_CNM;
|
||||
ca['-'] |= CH_CNM;
|
||||
ca[183] |= CH_CNM; /* XML */
|
||||
|
@ -466,7 +466,7 @@ dtd * new_dtd(const ichar *doctype);
|
||||
int set_dialect_dtd(dtd *dtd, dtd_dialect dialect);
|
||||
int set_option_dtd(dtd *dtd, dtd_option option, int set);
|
||||
|
||||
void putchar_dtd_parser(dtd_parser *p, int chr);
|
||||
int putchar_dtd_parser(dtd_parser *p, int chr);
|
||||
int begin_document_dtd_parser(dtd_parser *p);
|
||||
int end_document_dtd_parser(dtd_parser *p);
|
||||
void reset_document_dtd_parser(dtd_parser *p);
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
#include "dtd.h"
|
||||
#include "util.h"
|
||||
#include "prolog.h"
|
||||
|
||||
#define streq(s,q) strcmp((s), (q)) == 0
|
||||
@ -42,10 +43,12 @@ int
|
||||
main(int argc, char **argv)
|
||||
{ dtd_dialect dialect = DL_SGML;
|
||||
|
||||
init_ring();
|
||||
|
||||
program = argv[0];
|
||||
argv++;
|
||||
argc--;
|
||||
|
||||
|
||||
while(argc > 0 && argv[0][0] == '-')
|
||||
{ if ( streq(argv[0], "-xml") )
|
||||
{ dialect = DL_XML;
|
||||
@ -63,7 +66,7 @@ main(int argc, char **argv)
|
||||
|
||||
if ( argc == 1 )
|
||||
{ int wl = mbstowcs(NULL, argv[0], 0);
|
||||
|
||||
|
||||
if ( wl > 0 )
|
||||
{ wchar_t *ws = malloc((wl+1)*sizeof(wchar_t));
|
||||
dtd *dtd;
|
||||
|
@ -3,9 +3,9 @@
|
||||
Part of SWI-Prolog
|
||||
|
||||
Author: Jan Wielemaker
|
||||
E-mail: jan@swi.psy.uva.nl
|
||||
E-mail: J.Wielemaker@cs.vu.nl
|
||||
WWW: http://www.swi-prolog.org
|
||||
Copyright (C): 1985-2002, University of Amsterdam
|
||||
Copyright (C): 1985-2009, University of Amsterdam
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
@ -32,48 +32,52 @@
|
||||
|
||||
int
|
||||
sgml2pl_error(plerrorid id, ...)
|
||||
{ term_t except = PL_new_term_ref();
|
||||
term_t formal = PL_new_term_ref();
|
||||
term_t swi = PL_new_term_ref();
|
||||
{ int rc;
|
||||
term_t except, formal, swi;
|
||||
va_list args;
|
||||
char msgbuf[1024];
|
||||
char *msg = NULL;
|
||||
|
||||
if ( !(except = PL_new_term_ref()) ||
|
||||
!(formal = PL_new_term_ref()) ||
|
||||
!(swi = PL_new_term_ref()) )
|
||||
return FALSE;
|
||||
|
||||
va_start(args, id);
|
||||
switch(id)
|
||||
{ case ERR_ERRNO:
|
||||
{ int err = va_arg(args, int);
|
||||
|
||||
|
||||
msg = strerror(err);
|
||||
|
||||
switch(err)
|
||||
{ case ENOMEM:
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "resource_error", 1,
|
||||
PL_CHARS, "no_memory");
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "resource_error", 1,
|
||||
PL_CHARS, "no_memory");
|
||||
break;
|
||||
case EACCES:
|
||||
{ const char *file = va_arg(args, const char *);
|
||||
const char *action = va_arg(args, const char *);
|
||||
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "permission_error", 3,
|
||||
PL_CHARS, action,
|
||||
PL_CHARS, "file",
|
||||
PL_CHARS, file);
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "permission_error", 3,
|
||||
PL_CHARS, action,
|
||||
PL_CHARS, "file",
|
||||
PL_CHARS, file);
|
||||
break;
|
||||
}
|
||||
case ENOENT:
|
||||
{ const char *file = va_arg(args, const char *);
|
||||
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "existence_error", 2,
|
||||
PL_CHARS, "file",
|
||||
PL_CHARS, file);
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "existence_error", 2,
|
||||
PL_CHARS, "file",
|
||||
PL_CHARS, file);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
PL_unify_atom_chars(formal, "system_error");
|
||||
rc = PL_unify_atom_chars(formal, "system_error");
|
||||
break;
|
||||
}
|
||||
break;
|
||||
@ -84,12 +88,12 @@ sgml2pl_error(plerrorid id, ...)
|
||||
|
||||
if ( PL_is_variable(actual) &&
|
||||
strcmp(expected, "variable") != 0 )
|
||||
PL_unify_atom_chars(formal, "instantiation_error");
|
||||
rc = PL_unify_atom_chars(formal, "instantiation_error");
|
||||
else
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "type_error", 2,
|
||||
PL_CHARS, expected,
|
||||
PL_TERM, actual);
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "type_error", 2,
|
||||
PL_CHARS, expected,
|
||||
PL_TERM, actual);
|
||||
break;
|
||||
}
|
||||
case ERR_DOMAIN:
|
||||
@ -97,31 +101,31 @@ sgml2pl_error(plerrorid id, ...)
|
||||
term_t actual = va_arg(args, term_t);
|
||||
|
||||
if ( PL_is_variable(actual) )
|
||||
PL_unify_atom_chars(formal, "instantiation_error");
|
||||
rc = PL_unify_atom_chars(formal, "instantiation_error");
|
||||
else
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "domain_error", 2,
|
||||
PL_CHARS, expected,
|
||||
PL_TERM, actual);
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "domain_error", 2,
|
||||
PL_CHARS, expected,
|
||||
PL_TERM, actual);
|
||||
break;
|
||||
}
|
||||
case ERR_EXISTENCE:
|
||||
{ const char *type = va_arg(args, const char *);
|
||||
term_t obj = va_arg(args, term_t);
|
||||
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "existence_error", 2,
|
||||
PL_CHARS, type,
|
||||
PL_TERM, obj);
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "existence_error", 2,
|
||||
PL_CHARS, type,
|
||||
PL_TERM, obj);
|
||||
|
||||
break;
|
||||
}
|
||||
case ERR_FAIL:
|
||||
{ term_t goal = va_arg(args, term_t);
|
||||
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "goal_failed", 1,
|
||||
PL_TERM, goal);
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "goal_failed", 1,
|
||||
PL_TERM, goal);
|
||||
|
||||
break;
|
||||
}
|
||||
@ -129,10 +133,10 @@ sgml2pl_error(plerrorid id, ...)
|
||||
{ const char *limit = va_arg(args, const char *);
|
||||
long maxval = va_arg(args, long);
|
||||
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "limit_exceeded", 2,
|
||||
PL_CHARS, limit,
|
||||
PL_LONG, maxval);
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "limit_exceeded", 2,
|
||||
PL_CHARS, limit,
|
||||
PL_LONG, maxval);
|
||||
|
||||
break;
|
||||
}
|
||||
@ -142,10 +146,10 @@ sgml2pl_error(plerrorid id, ...)
|
||||
|
||||
vsprintf(msgbuf, fmt, args);
|
||||
msg = msgbuf;
|
||||
|
||||
PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "miscellaneous", 1,
|
||||
PL_CHARS, id);
|
||||
|
||||
rc = PL_unify_term(formal,
|
||||
PL_FUNCTOR_CHARS, "miscellaneous", 1,
|
||||
PL_CHARS, id);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
@ -153,26 +157,29 @@ sgml2pl_error(plerrorid id, ...)
|
||||
}
|
||||
va_end(args);
|
||||
|
||||
if ( msg )
|
||||
if ( rc && msg )
|
||||
{ term_t predterm = PL_new_term_ref();
|
||||
term_t msgterm = PL_new_term_ref();
|
||||
|
||||
if ( msg )
|
||||
{ PL_put_atom_chars(msgterm, msg);
|
||||
}
|
||||
|
||||
PL_unify_term(swi,
|
||||
PL_FUNCTOR_CHARS, "context", 2,
|
||||
PL_TERM, predterm,
|
||||
PL_TERM, msgterm);
|
||||
if ( !(predterm = PL_new_term_ref()) ||
|
||||
!(msgterm = PL_new_term_ref()) ||
|
||||
!PL_put_atom_chars(msgterm, msg) ||
|
||||
!PL_unify_term(swi,
|
||||
PL_FUNCTOR_CHARS, "context", 2,
|
||||
PL_TERM, predterm,
|
||||
PL_TERM, msgterm) )
|
||||
rc = FALSE;
|
||||
}
|
||||
|
||||
PL_unify_term(except,
|
||||
PL_FUNCTOR_CHARS, "error", 2,
|
||||
PL_TERM, formal,
|
||||
PL_TERM, swi);
|
||||
if ( rc )
|
||||
rc = PL_unify_term(except,
|
||||
PL_FUNCTOR_CHARS, "error", 2,
|
||||
PL_TERM, formal,
|
||||
PL_TERM, swi);
|
||||
|
||||
if ( rc )
|
||||
return PL_raise_exception(except);
|
||||
|
||||
return PL_raise_exception(except);
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
@ -44,4 +44,3 @@ typedef enum
|
||||
int sgml2pl_error(plerrorid, ...);
|
||||
|
||||
#endif /*H_ERROR_INCLUDED*/
|
||||
|
||||
|
@ -107,7 +107,7 @@ visit(dtd_state *state, visited *visited)
|
||||
{ if ( visited->states[i] == state )
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
if ( visited->size >= MAX_VISITED )
|
||||
{ fprintf(stderr, "Reached MAX_VISITED!\n");
|
||||
return FALSE;
|
||||
@ -262,7 +262,7 @@ do_find_omitted_path(dtd_state *state, dtd_element *e,
|
||||
}
|
||||
|
||||
|
||||
int
|
||||
int
|
||||
find_omitted_path(dtd_state *state, dtd_element *e, dtd_element **path)
|
||||
{ int pl = 0;
|
||||
visited visited;
|
||||
@ -314,13 +314,13 @@ static transition *
|
||||
state_transitions(dtd_state *state)
|
||||
{ if ( !state->transitions && state->expander )
|
||||
{ expander *ex = state->expander;
|
||||
|
||||
|
||||
switch(ex->type)
|
||||
{ case EX_AND:
|
||||
{ dtd_model_list *left = ex->kind.and.set;
|
||||
|
||||
if ( !left ) /* empty AND (should not happen) */
|
||||
{ link(state, ex->target, NULL);
|
||||
{ link(state, ex->target, NULL);
|
||||
} else if ( !left->next ) /* only one left */
|
||||
{ translate_model(left->model, state, ex->target);
|
||||
} else
|
||||
@ -378,7 +378,7 @@ translate_one(dtd_model *m, dtd_state *from, dtd_state *to)
|
||||
|
||||
ex->target = to;
|
||||
ex->type = EX_AND;
|
||||
|
||||
|
||||
for( sub = m->content.group; sub; sub = sub->next )
|
||||
add_model_list(&ex->kind.and.set, sub);
|
||||
|
||||
@ -436,7 +436,7 @@ make_state_engine(dtd_element *e)
|
||||
{ if ( def->content )
|
||||
{ def->initial_state = new_dtd_state();
|
||||
def->final_state = new_dtd_state();
|
||||
|
||||
|
||||
translate_model(def->content, def->initial_state, def->final_state);
|
||||
} else if ( def->type == C_CDATA || def->type == C_RCDATA )
|
||||
{ def->initial_state = new_dtd_state();
|
||||
@ -450,7 +450,7 @@ make_state_engine(dtd_element *e)
|
||||
|
||||
return def->initial_state;
|
||||
}
|
||||
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -492,7 +492,7 @@ free_expander(expander *e, visited *visited)
|
||||
static void
|
||||
do_free_state_engine(dtd_state *state, visited *visited)
|
||||
{ transition *t, *next;
|
||||
|
||||
|
||||
for(t=state->transitions; t; t=next)
|
||||
{ next = t->next;
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -145,6 +145,12 @@ typedef enum
|
||||
DM_DATA /* Environment has only elements */
|
||||
} data_mode;
|
||||
|
||||
#ifdef XMLNS
|
||||
typedef enum
|
||||
{ NONS_ERROR = 0,
|
||||
NONS_QUIET
|
||||
} xmlnons;
|
||||
#endif
|
||||
|
||||
typedef struct _sgml_environment
|
||||
{ dtd_element *element; /* element that opened the env */
|
||||
@ -201,6 +207,10 @@ typedef struct _dtd_parser
|
||||
dtd_srcloc startcdata; /* Start of last cdata */
|
||||
dtd_symbol *enforce_outer_element; /* Outer element to look for */
|
||||
sgml_event_class event_class; /* EV_* */
|
||||
xmlnons xml_no_ns; /* What if namespace does not exist? */
|
||||
#ifdef XMLNS
|
||||
struct _xmlns *xmlns; /* Outer xmlns declaration */
|
||||
#endif
|
||||
|
||||
void *closure; /* client handle */
|
||||
sgml_begin_element_f on_begin_element; /* start an element */
|
||||
@ -221,7 +231,7 @@ typedef struct _dtd_parser
|
||||
#include "xmlns.h"
|
||||
#endif
|
||||
|
||||
extern int gripe(dtd_error_id e, ...);
|
||||
extern int gripe(dtd_parser *p, dtd_error_id e, ...);
|
||||
|
||||
#define SGML_SUB_DOCUMENT 0x1
|
||||
|
||||
|
@ -342,7 +342,7 @@ prolog_print_attribute(dtd_element *e, dtd_attr *at)
|
||||
printf("list(nutoken)");
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
printf(", "); /* print default */
|
||||
switch(at->def)
|
||||
{ case AT_REQUIRED:
|
||||
@ -427,7 +427,7 @@ prolog_print_element(dtd_element *e, unsigned int flags)
|
||||
|
||||
if ( def->excluded )
|
||||
{ dtd_element_list *el;
|
||||
|
||||
|
||||
for(el = def->excluded; el; el=el->next)
|
||||
wprintf(L"exclude(%ls, %ls).\n",
|
||||
atom(e->name->name),
|
||||
@ -435,7 +435,7 @@ prolog_print_element(dtd_element *e, unsigned int flags)
|
||||
}
|
||||
if ( def->included )
|
||||
{ dtd_element_list *el;
|
||||
|
||||
|
||||
for(el = def->included; el; el=el->next)
|
||||
wprintf(L"include(%ls, %ls).\n",
|
||||
atom(e->name->name),
|
||||
|
@ -26,14 +26,18 @@
|
||||
#include <SWI-Prolog.h>
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_MALLOC_H
|
||||
#include <malloc.h>
|
||||
#include HAVE_MALLOC_H
|
||||
#endif
|
||||
#include "error.h"
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <wctype.h>
|
||||
#include "xml_unicode.h"
|
||||
#include "dtd.h"
|
||||
#ifdef __WINDOWS__
|
||||
#define inline __inline
|
||||
#endif
|
||||
|
||||
static atom_t ATOM_iso_latin_1;
|
||||
static atom_t ATOM_utf8;
|
||||
@ -86,7 +90,7 @@ room_buf(charbuf *b, size_t room)
|
||||
b->end = b->bufp + used;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
@ -178,11 +182,11 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
|
||||
if ( inA )
|
||||
{ for(s = (unsigned char*)inA ; len-- > 0; s++ )
|
||||
{ int c = *s;
|
||||
|
||||
|
||||
if ( map[c] )
|
||||
{ if ( !add_str_buf(&buffer, map[c]) )
|
||||
return FALSE;
|
||||
|
||||
|
||||
changes++;
|
||||
} else if ( c > maxchr )
|
||||
{ char buf[10];
|
||||
@ -190,7 +194,7 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
|
||||
sprintf(buf, "&#%d;", c);
|
||||
if ( !add_str_buf(&buffer, buf) )
|
||||
return FALSE;
|
||||
|
||||
|
||||
changes++;
|
||||
} else
|
||||
{ add_char_buf(&buffer, c);
|
||||
@ -204,11 +208,11 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
|
||||
} else
|
||||
{ for( ; len-- > 0; inW++ )
|
||||
{ int c = *inW;
|
||||
|
||||
|
||||
if ( c <= 0xff && map[c] )
|
||||
{ if ( !add_str_bufW(&buffer, map[c]) )
|
||||
return FALSE;
|
||||
|
||||
|
||||
changes++;
|
||||
} else if ( c > maxchr )
|
||||
{ char buf[10];
|
||||
@ -216,13 +220,13 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
|
||||
sprintf(buf, "&#%d;", c);
|
||||
if ( !add_str_bufW(&buffer, buf) )
|
||||
return FALSE;
|
||||
|
||||
|
||||
changes++;
|
||||
}else
|
||||
{ add_char_bufW(&buffer, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if ( changes > 0 )
|
||||
rc = PL_unify_wchars(quoted, PL_ATOM,
|
||||
used_buf(&buffer)/sizeof(wchar_t),
|
||||
@ -230,7 +234,7 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
|
||||
else
|
||||
rc = PL_unify(in, quoted);
|
||||
}
|
||||
|
||||
|
||||
free_buf(&buffer);
|
||||
|
||||
return rc;
|
||||
@ -321,12 +325,39 @@ xml_quote_cdata(term_t in, term_t out, term_t encoding)
|
||||
}
|
||||
|
||||
|
||||
static inline int
|
||||
is_xml_nmstart(dtd_charclass *map, int c)
|
||||
{ if ( c <= 0xff )
|
||||
{ return (map->class[c] & CH_NMSTART);
|
||||
} else
|
||||
{ return ( xml_basechar(c) ||
|
||||
xml_ideographic(c)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static inline int
|
||||
is_xml_chname(dtd_charclass *map, int c)
|
||||
{ if ( c <= 0xff )
|
||||
{ return (map->class[c] & CH_NAME);
|
||||
} else
|
||||
{ return ( xml_basechar(c) ||
|
||||
xml_digit(c) ||
|
||||
xml_ideographic(c) ||
|
||||
xml_combining_char(c) ||
|
||||
xml_extender(c)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
static dtd_charclass *map;
|
||||
|
||||
static foreign_t
|
||||
xml_name(term_t in, term_t encoding)
|
||||
{ char *ins;
|
||||
wchar_t *inW;
|
||||
size_t len;
|
||||
static dtd_charclass *map;
|
||||
unsigned int i;
|
||||
int maxchr;
|
||||
|
||||
@ -345,7 +376,7 @@ xml_name(term_t in, term_t encoding)
|
||||
c = ins[0] & 0xff;
|
||||
if ( c > maxchr )
|
||||
return FALSE;
|
||||
|
||||
|
||||
if ( !(map->class[c] & CH_NMSTART) )
|
||||
return FALSE;
|
||||
for(i=1; i<len; i++)
|
||||
@ -360,22 +391,16 @@ xml_name(term_t in, term_t encoding)
|
||||
if ( PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
|
||||
{ if ( len == 0 )
|
||||
return FALSE;
|
||||
|
||||
if ( inW[0] > maxchr )
|
||||
|
||||
if ( inW[0] > maxchr ||
|
||||
!is_xml_nmstart(map, inW[0]) )
|
||||
return FALSE;
|
||||
|
||||
if ( inW[0] <= 0xff &&
|
||||
!(map->class[inW[0]] & CH_NMSTART) )
|
||||
return FALSE;
|
||||
if ( inW[0] > 0xff && !iswalpha(inW[0]) )
|
||||
return FALSE;
|
||||
|
||||
for(i=1; i<len; i++)
|
||||
{ int c = inW[i];
|
||||
|
||||
if ( c <= 0xff && !(map->class[c] & CH_NAME) )
|
||||
return FALSE;
|
||||
if ( c > 0xff && !iswalnum((wint_t)c) )
|
||||
if ( c > maxchr ||
|
||||
!is_xml_chname(map, c) )
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -386,6 +411,57 @@ xml_name(term_t in, term_t encoding)
|
||||
}
|
||||
|
||||
|
||||
static foreign_t
|
||||
iri_xml_namespace(term_t iri, term_t namespace, term_t localname)
|
||||
{ char *s;
|
||||
pl_wchar_t *w;
|
||||
size_t len;
|
||||
|
||||
if ( !map )
|
||||
map = new_charclass();
|
||||
|
||||
if ( PL_get_nchars(iri, &len, &s, CVT_ATOM|CVT_STRING) )
|
||||
{ const char *e = &s[len];
|
||||
const char *p = e;
|
||||
|
||||
while(p>s && (map->class[p[-1]&0xff] & CH_NAME))
|
||||
p--;
|
||||
while(p<e && !(map->class[p[0]&0xff] & CH_NMSTART))
|
||||
p++;
|
||||
|
||||
if ( !PL_unify_atom_nchars(namespace, p-s, s) )
|
||||
return FALSE;
|
||||
if ( localname &&
|
||||
!PL_unify_atom_nchars(localname, e-p, p) )
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
} else if ( PL_get_wchars(iri, &len, &w, CVT_ATOM|CVT_STRING|CVT_EXCEPTION) )
|
||||
{ const pl_wchar_t *e = &w[len];
|
||||
const pl_wchar_t *p = e;
|
||||
|
||||
while(p>w && is_xml_chname(map, p[-1]) )
|
||||
p--;
|
||||
while(p<e && !is_xml_nmstart(map, p[0]) )
|
||||
p++;
|
||||
|
||||
if ( !PL_unify_wchars(namespace, PL_ATOM, p-w, w) )
|
||||
return FALSE;
|
||||
if ( localname &&
|
||||
!PL_unify_wchars(localname, PL_ATOM, e-p, p) )
|
||||
return FALSE;
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
static foreign_t
|
||||
iri_xml_namespace2(term_t iri, term_t namespace)
|
||||
{ return iri_xml_namespace(iri, namespace, 0);
|
||||
}
|
||||
|
||||
|
||||
install_t
|
||||
@ -398,4 +474,6 @@ install_xml_quote()
|
||||
PL_register_foreign("xml_quote_attribute", 3, xml_quote_attribute, 0);
|
||||
PL_register_foreign("xml_quote_cdata", 3, xml_quote_cdata, 0);
|
||||
PL_register_foreign("xml_name", 2, xml_name, 0);
|
||||
PL_register_foreign("iri_xml_namespace", 3, iri_xml_namespace, 0);
|
||||
PL_register_foreign("iri_xml_namespace", 2, iri_xml_namespace2, 0);
|
||||
}
|
||||
|
@ -95,10 +95,10 @@ print_word(dtd_parser * p, char c, /* preceding character */
|
||||
static void
|
||||
wprint_escaped(FILE *f, const wchar_t *s, int len)
|
||||
{ const wchar_t *e = &s[len];
|
||||
|
||||
|
||||
while ( s < e )
|
||||
{ wint_t x = *s++;
|
||||
|
||||
|
||||
if (x >= ' ')
|
||||
{ if (x == '\\') /* \ --> \\ */
|
||||
wputc(x, f);
|
||||
@ -352,7 +352,7 @@ mb2wc(const char *s)
|
||||
|
||||
return ws;
|
||||
}
|
||||
|
||||
|
||||
perror("mbstowcs");
|
||||
exit(1);
|
||||
}
|
||||
|
@ -26,7 +26,7 @@ Markup languages are an increasingly important method for
|
||||
data-representation and exchange. This article documents the package
|
||||
\pllib{sgml}, a foreign library for SWI-Prolog to parse SGML
|
||||
and XML documents, returning information on both the document and the
|
||||
document's DTD. The parser is designed to be small, fast and flexible.
|
||||
document's DTD. The parser is designed to be small, fast and flexible.
|
||||
\end{abstract}
|
||||
|
||||
\pagebreak
|
||||
@ -56,17 +56,17 @@ The parser described in this document is small (less than 100 kBytes
|
||||
executable on a Pentium), fast (between 2 and 5 times faster than SP),
|
||||
provides access to the DTD, and provides flexible input handling.
|
||||
|
||||
The document output is equal to the output produced by \jargon{xml2pl},
|
||||
The document output is equal to the output produced by \jargon{xml2pl},
|
||||
an SP interface to SWI-Prolog written by Anjo Anjewierden.
|
||||
|
||||
|
||||
\section{Bluffer's Guide}
|
||||
|
||||
This package allows you to parse SGML, XML and HTML data into a Prolog
|
||||
data structure. The high-level interface defined in \pllib{sgml}
|
||||
This package allows you to parse SGML, XML and HTML data into a Prolog
|
||||
data structure. The high-level interface defined in \pllib{sgml}
|
||||
provides access at the file-level, while the low-level interface defined
|
||||
in the foreign module works with Prolog streams. Please use the source
|
||||
of \file{sgml.pl} as a starting point for dealing with data from
|
||||
in the foreign module works with Prolog streams. Please use the source
|
||||
of \file{sgml.pl} as a starting point for dealing with data from
|
||||
other sources than files, such as SWI-Prolog resources, network-sockets,
|
||||
character strings, \emph{etc.} The first example below loads an HTML file.
|
||||
|
||||
@ -123,9 +123,9 @@ This is called `omitted-tag' handling.
|
||||
].
|
||||
\end{code}
|
||||
|
||||
The document is represented as a list, each element being an atom to
|
||||
The document is represented as a list, each element being an atom to
|
||||
represent \const{CDATA} or a term \term{element}{Name, Attributes, Content}.
|
||||
Entities (e.g. \verb$&lt;$) are expanded and included in the
|
||||
Entities (e.g. \verb$&lt;$) are expanded and included in the
|
||||
atom representing the element content or attribute value.%
|
||||
\footnote{Up to SWI-Prolog 5.4.x, Prolog could not represent
|
||||
\jargon{wide} characters and entities that did not fit in
|
||||
@ -141,23 +141,24 @@ self-contained files in SGML, HTML, or XML into a structured term. They
|
||||
are based on load_structure/3.
|
||||
|
||||
\begin{description}
|
||||
\predicate{load_sgml_file}{2}{+File, -ListOfContent}
|
||||
Same as \term{load_structure}{File, ListOfContent, [dialect(sgml)]}.
|
||||
\predicate{load_sgml_file}{2}{+Source, -ListOfContent}
|
||||
Same as \term{load_structure}{Source, ListOfContent, [dialect(sgml)]}.
|
||||
|
||||
\predicate{load_xml_file}{2}{+File, -ListOfContent}
|
||||
Same as \term{load_structure}{File, ListOfContent, [dialect(xml)]}.
|
||||
\predicate{load_xml_file}{2}{+Source, -ListOfContent}
|
||||
Same as \term{load_structure}{Source, ListOfContent, [dialect(xml)]}.
|
||||
|
||||
\predicate{load_html_file}{2}{+File, -Content}
|
||||
Load \arg{File} and parse as HTML. Implemented as below. Note that
|
||||
load_html_file/2 re-uses a cached DTD object as defined by dtd/2. As DTD
|
||||
objects may be corrupted while loading erroneous documents sharing is
|
||||
undesirable if the documents are not known to be correct. See dtd/2 for
|
||||
details.
|
||||
\predicate{load_html_file}{2}{+Source, -Content}
|
||||
Load \arg{Source} and parse as HTML. \arg{Source} is either the
|
||||
name of a file or term \term{stream}{Handle}. Implemented as
|
||||
below. Note that load_html_file/2 re-uses a cached DTD object as defined
|
||||
by dtd/2. As DTD objects may be corrupted while loading erroneous
|
||||
documents sharing is undesirable if the documents are not known to be
|
||||
correct. See dtd/2 for details.
|
||||
|
||||
\begin{code}
|
||||
load_html_file(File, Term) :-
|
||||
load_html_file(Source, Term) :-
|
||||
dtd(html, DTD),
|
||||
load_structure(File, Term,
|
||||
load_structure(Source, Term,
|
||||
[ dtd(DTD),
|
||||
dialect(sgml),
|
||||
shorttag(false)
|
||||
@ -171,8 +172,8 @@ load_html_file(File, Term) :-
|
||||
\subsection{Loading Structured Documents}
|
||||
|
||||
SGML or XML files are loaded through the common predicate
|
||||
load_structure/3. This is a predicate with many options. For
|
||||
simplicity a number of commonly used shorthands are provided:
|
||||
load_structure/3. This is a predicate with many options. For
|
||||
simplicity a number of commonly used shorthands are provided:
|
||||
load_sgml_file/2, load_xml_file/2, and
|
||||
load_html_file/2.
|
||||
|
||||
@ -184,18 +185,18 @@ Parse \arg{Source} and return the resulting structure in
|
||||
options controlling the conversion process.
|
||||
|
||||
A proper XML document contains only a single toplevel element whose name
|
||||
matches the document type. Nevertheless, a list is returned for
|
||||
matches the document type. Nevertheless, a list is returned for
|
||||
consistency with the representation of element content. The
|
||||
\arg{ListOfContent} consists of the following types:
|
||||
|
||||
\begin{description}
|
||||
\termitem{\arg{Atom}}{}
|
||||
Atoms are used to represent \const{CDATA}. Note
|
||||
Atoms are used to represent \const{CDATA}. Note
|
||||
this is possible in SWI-Prolog, as there is no length-limit on atoms and
|
||||
atom garbage collection is provided.
|
||||
|
||||
\termitem{element}{Name, ListAttributes, ListOfContent}
|
||||
\arg{Name} is the name of the element. Using SGML, which is
|
||||
\arg{Name} is the name of the element. Using SGML, which is
|
||||
case-insensitive, all element names are returned as lowercase atoms.
|
||||
|
||||
\arg{ListOfAttributes} is a list of \arg{Name}=\arg{Value} pairs for
|
||||
@ -209,31 +210,31 @@ integers is supported. \arg{ListOfContent} defines the content for the
|
||||
element.
|
||||
|
||||
\termitem{sdata}{Text}
|
||||
If an entity with declared content-type \const{SDATA} is encountered, this
|
||||
If an entity with declared content-type \const{SDATA} is encountered, this
|
||||
term is returned holding the data in \arg{Text}.
|
||||
|
||||
\termitem{ndata}{Text}
|
||||
If an entity with declared content-type \const{NDATA} is encountered, this
|
||||
If an entity with declared content-type \const{NDATA} is encountered, this
|
||||
term is returned holding the data in \arg{Text}.
|
||||
\termitem{pi}{Text}
|
||||
If a processing instruction is encountered (\verb$<?...?>$),
|
||||
\arg{Text} holds the text of the processing instruction. Please note that the
|
||||
\verb$<?xml ...?>$ instruction is handled internally.
|
||||
\verb$<?xml ...?>$ instruction is handled internally.
|
||||
\end{description}
|
||||
|
||||
|
||||
The \arg{Options} list controls the conversion process. Currently
|
||||
The \arg{Options} list controls the conversion process. Currently
|
||||
defined options are:
|
||||
|
||||
\begin{description}
|
||||
\termitem{dtd}{?DTD}
|
||||
Reference to a DTD object. If specified, the \verb$<!DOCTYPE ...>$
|
||||
declaration is ignored and the document is parsed and validated against
|
||||
declaration is ignored and the document is parsed and validated against
|
||||
the provided DTD. If provided as a variable, the created DTD is
|
||||
returned. See \secref{implicitdtd}.
|
||||
|
||||
\termitem{dialect}{+Dialect}
|
||||
Specify the parsing dialect. Supported are \const{sgml} (default), \const{xml}
|
||||
Specify the parsing dialect. Supported are \const{sgml} (default), \const{xml}
|
||||
and \const{xmlns}. See \secref{xml} for details on the differences.
|
||||
|
||||
\termitem{shorttag}{+Bool}
|
||||
@ -272,14 +273,14 @@ Defines (overwrites) an entity definition. At the moment, only
|
||||
entity options are allowed.
|
||||
|
||||
\termitem{file}{+Name}
|
||||
Sets the name of the file on which errors are reported. Sets the
|
||||
Sets the name of the file on which errors are reported. Sets the
|
||||
linenumber to 1.
|
||||
|
||||
\termitem{line}{+Line}
|
||||
Sets the starting line-number for reporting errors.
|
||||
|
||||
\termitem{max_errors}{+Max}
|
||||
Sets the maximum number of errors. If this number is reached, an
|
||||
Sets the maximum number of errors. If this number is reached, an
|
||||
exception of the format below is raised. The default is 50. Using
|
||||
\term{max_errors}{-1} makes the parser continue, no matter how many
|
||||
errors it encounters.
|
||||
@ -303,26 +304,26 @@ modes are:
|
||||
\termitem{space}{sgml}
|
||||
In SGML, newlines at the start and end of an element are removed.\footnote{In
|
||||
addition, newlines at the end of lines containing only markup should be
|
||||
deleted. This is not yet implemented.} This is the default mode for
|
||||
the SGML dialect.
|
||||
deleted. This is not yet implemented.} This is the default mode for
|
||||
the SGML dialect.
|
||||
|
||||
\termitem{space}{preserve}
|
||||
White space is passed literally to the application. This mode leaves all
|
||||
white space handling to the application. This is the default mode for
|
||||
the XML dialect.
|
||||
the XML dialect.
|
||||
|
||||
\termitem{space}{default}
|
||||
In addition to \const{sgml} space-mode, all consecutive white-space is
|
||||
reduced to a single space-character. This mode canonises all white
|
||||
space.
|
||||
In addition to \const{sgml} space-mode, all consecutive white-space is
|
||||
reduced to a single space-character. This mode canonises all white
|
||||
space.
|
||||
|
||||
\termitem{space}{remove}
|
||||
In addition to \const{default}, all leading and trailing white-space is
|
||||
removed from \const{CDATA} objects. If, as a result, the \const{CDATA}
|
||||
becomes empty, nothing is passed to the application. This mode is
|
||||
especially handy for processing `data-oriented' documents, such as RDF.
|
||||
It is not suitable for normal text documents. Consider the HTML
|
||||
fragment below. When processed in this mode, the spaces between the
|
||||
In addition to \const{default}, all leading and trailing white-space is
|
||||
removed from \const{CDATA} objects. If, as a result, the \const{CDATA}
|
||||
becomes empty, nothing is passed to the application. This mode is
|
||||
especially handy for processing `data-oriented' documents, such as RDF.
|
||||
It is not suitable for normal text documents. Consider the HTML
|
||||
fragment below. When processed in this mode, the spaces between the
|
||||
three modified words are lost. This mode is not part of any standard;
|
||||
XML 1.0 allows only \const{default} and \const{preserve}.
|
||||
|
||||
@ -333,9 +334,9 @@ Consider adjacent <b>bold</b> <ul>and</ul> <it>italic</it> words.
|
||||
|
||||
\subsection{XML documents} \label{sec:xml}
|
||||
|
||||
The parser can operate in two modes: \const{sgml} mode and \const{xml} mode, as
|
||||
defined by the \term{dialect}{Dialect} option. Regardless of this
|
||||
option, if the first line of the document reads as below, the parser is
|
||||
The parser can operate in two modes: \const{sgml} mode and \const{xml} mode, as
|
||||
defined by the \term{dialect}{Dialect} option. Regardless of this
|
||||
option, if the first line of the document reads as below, the parser is
|
||||
switched automatically into XML mode.
|
||||
|
||||
\begin{code}
|
||||
@ -346,21 +347,21 @@ Currently switching to XML mode implies:
|
||||
|
||||
\begin{itemlist}
|
||||
\item [XML empty elements]
|
||||
The construct \verb$<element [attribute...] />$ is recognised as
|
||||
an empty element.
|
||||
The construct \verb$<element [attribute...] />$ is recognised as
|
||||
an empty element.
|
||||
|
||||
\item [Predefined entities]
|
||||
The following entities are predefined: \const{lt} (\verb$<$), \const{gt}
|
||||
(\verb$>$), \const{amp} (\verb$&$), \const{apos} (\verb$'$)
|
||||
and \const{quot} (\verb$"$).
|
||||
(\verb$>$), \const{amp} (\verb$&$), \const{apos} (\verb$'$)
|
||||
and \const{quot} (\verb$"$).
|
||||
|
||||
\item [Case sensitivity]
|
||||
In XML mode, names are treated case-sensitive, except for the DTD
|
||||
reserved names (i.e. \exam{ELEMENT}, \emph{etc.}).
|
||||
In XML mode, names are treated case-sensitive, except for the DTD
|
||||
reserved names (i.e. \exam{ELEMENT}, \emph{etc.}).
|
||||
|
||||
\item [Character classes]
|
||||
In XML mode, underscores (\verb$_$) and colon (\verb$:$) are
|
||||
allowed in names.
|
||||
allowed in names.
|
||||
|
||||
\item [White-space handling]
|
||||
White space mode is set to \const{preserve}. In addition to setting
|
||||
@ -378,28 +379,28 @@ preserves space, regardless of the default processing mode.
|
||||
|
||||
\subsubsection{XML Namespaces} \label{sec:xmlns}
|
||||
|
||||
Using the \jargon{dialect} \const{xmlns}, the parser will interpret XML
|
||||
namespaces. In this case, the names of elements are returned as a term
|
||||
Using the \jargon{dialect} \const{xmlns}, the parser will interpret XML
|
||||
namespaces. In this case, the names of elements are returned as a term
|
||||
of the format
|
||||
|
||||
\begin{quote}
|
||||
\arg{URL}\const{:}\arg{LocalName}
|
||||
\arg{URL}\const{:}\arg{LocalName}
|
||||
\end{quote}
|
||||
|
||||
If an identifier has no namespace and there is no default namespace it
|
||||
is returned as a simple atom. If an identifier has a namespace but this
|
||||
namespace is undeclared, the namespace name rather than the related URL
|
||||
If an identifier has no namespace and there is no default namespace it
|
||||
is returned as a simple atom. If an identifier has a namespace but this
|
||||
namespace is undeclared, the namespace name rather than the related URL
|
||||
is returned.
|
||||
|
||||
Attributes declaring namespaces ({\tt xmlns:<ns>=<url>}) are reported
|
||||
as if \const{xmlns} were not a defined resource.
|
||||
|
||||
In many cases, getting attribute-names as <xmp>\arg{url}:\arg{name}</xmp>
|
||||
is not desirable. Such terms are hard to unify and sometimes multiple
|
||||
URLs may be mapped to the same identifier. This may happen due to poor
|
||||
version management, poor standardisation or because the application
|
||||
doesn't care too much about versions. This package defines two
|
||||
call-backs that can be set using set_sgml_parser/2 to deal
|
||||
In many cases, getting attribute-names as \arg{url}:\arg{name}
|
||||
is not desirable. Such terms are hard to unify and sometimes multiple
|
||||
URLs may be mapped to the same identifier. This may happen due to poor
|
||||
version management, poor standardisation or because the application
|
||||
doesn't care too much about versions. This package defines two
|
||||
call-backs that can be set using set_sgml_parser/2 to deal
|
||||
with this problem.
|
||||
|
||||
The call-back \const{xmlns} is called as XML namespaces are noticed.
|
||||
@ -428,6 +429,41 @@ load_rdf_xml(File, Term) :-
|
||||
]).
|
||||
\end{code}
|
||||
|
||||
The library provides iri_xml_namespace/3 to break down an IRI into
|
||||
its namespace and localname:
|
||||
|
||||
\begin{description}
|
||||
\predicate[det]{iri_xml_namespace}{3}{+IRI, -Namespace, -Localname}
|
||||
Split an IRI (Unicode URI) into its \arg{Namespace} (an IRI) and
|
||||
\arg{Localname} (a Unicode XML name, see xml_name/2). The
|
||||
\arg{Localname} is defined as the longest last part of the IRI that
|
||||
satisfies the syntax of an XML name. With IRI schemas that are designed
|
||||
to work with XML namespaces, this will typically break the IRI on the
|
||||
last \chr{\#} or \chr{/}. Note however that this can produce unexpected
|
||||
results. E.g., in the example below, one might expect the namespace to
|
||||
be \url{http://example.com/images\#}, but an XML name cannot start with
|
||||
a digit.
|
||||
|
||||
\begin{code}
|
||||
?- iri_xml_namespace('http://example.com/images#12345', NS, L).
|
||||
NS = 'http://example.com/images#12345',
|
||||
L = ''.
|
||||
\end{code}
|
||||
|
||||
As we see from the example above, the \arg{Localname} can be the empty
|
||||
atom. Similarly, \arg{Namespace} can be the empty atom if \arg{IRI} is
|
||||
an XML name. Applications will often have to check for either or both
|
||||
these conditions. We decided against failing in these conditions because
|
||||
the application typically wants to know which of the two conditions
|
||||
(empty namespace or empty localname) holds. This predicate is often used
|
||||
for generating RDF/XML from an RDF graph.
|
||||
|
||||
\predicate[det]{iri_xml_namespace}{2}{+IRI, -Namespace}
|
||||
Same as iri_xml_namespace/3, but avoids creating an atom for the
|
||||
\arg{Localname}.
|
||||
\end{description}
|
||||
|
||||
|
||||
\subsection{DTD-Handling}
|
||||
|
||||
The DTD (\textbf{D}ocument \textbf{T}ype \textbf{D}efinition) is a
|
||||
@ -438,7 +474,7 @@ predicates for handling the DTD.
|
||||
|
||||
\begin{description}
|
||||
\predicate{new_dtd}{2}{+DocType, -DTD}
|
||||
Creates an empty DTD for the named \arg{DocType}. The returned
|
||||
Creates an empty DTD for the named \arg{DocType}. The returned
|
||||
DTD-reference is an opaque term that can be used in the other predicates
|
||||
of this package.
|
||||
|
||||
@ -468,7 +504,7 @@ Define the DTD dialect. Default is \const{sgml}. Using \const{xml} or
|
||||
|
||||
\predicate{dtd}{2}{+DocType, -DTD}
|
||||
Find the DTD representing the indicated \jargon{doctype}. This predicate
|
||||
uses a cache of DTD objects. If a doctype has no associated dtd, it
|
||||
uses a cache of DTD objects. If a doctype has no associated dtd, it
|
||||
searches for a file using the file search path \exam{dtd} using the call:
|
||||
|
||||
\begin{code}
|
||||
@ -488,15 +524,15 @@ parse multiple documents should be restricted to situations where the
|
||||
documents processed are known to be error-free.
|
||||
|
||||
\predicate{dtd_property}{2}{+DTD, ?Property}
|
||||
This predicate is used to examine the content of a DTD. Property is one
|
||||
This predicate is used to examine the content of a DTD. Property is one
|
||||
of:
|
||||
|
||||
\begin{description}
|
||||
\termitem{doctype}{DocType}
|
||||
An atom representing the document-type defined by this DTD.
|
||||
An atom representing the document-type defined by this DTD.
|
||||
|
||||
\termitem{elements}{ListOfElements}
|
||||
A list of atoms representing the names of the elements in this DTD.
|
||||
A list of atoms representing the names of the elements in this DTD.
|
||||
|
||||
\termitem{element}{Name, Omit, Content}
|
||||
The DTD contains an element with the given name. \arg{Omit} is a term of
|
||||
@ -508,7 +544,7 @@ form:
|
||||
|
||||
\begin{description}
|
||||
\termitem{empty}{}
|
||||
The element has no content.
|
||||
The element has no content.
|
||||
|
||||
\termitem{cdata}{}
|
||||
The element contains non-parsed character data. All data up to the
|
||||
@ -524,30 +560,30 @@ any order.
|
||||
\termitem{\#pcdata}{}
|
||||
The element contains parsed character data.
|
||||
|
||||
\termitem{\arg{element}} An element with this name.
|
||||
\termitem{\arg{element}} An element with this name.
|
||||
|
||||
\termitem{*}{SubModel}
|
||||
0 or more appearances.
|
||||
0 or more appearances.
|
||||
|
||||
\termitem{?}{SubModel}
|
||||
0 or one appearance.
|
||||
0 or one appearance.
|
||||
|
||||
\termitem{+}{SubModel}
|
||||
1 or more appearances.
|
||||
1 or more appearances.
|
||||
|
||||
\termitem{,}{SubModel1, SubModel2}
|
||||
\arg{SubModel1} followed by \arg{SubModel2}.
|
||||
\arg{SubModel1} followed by \arg{SubModel2}.
|
||||
|
||||
\termitem{\&}{SubModel1, SubModel2}
|
||||
\arg{SubModel1} and \arg{SubModel2} in any order.
|
||||
\arg{SubModel1} and \arg{SubModel2} in any order.
|
||||
|
||||
\termitem{\chr{|}}{SubModel1, SubModel2}
|
||||
\arg{SubModel1} or \arg{SubModel2}.
|
||||
\arg{SubModel1} or \arg{SubModel2}.
|
||||
\end{description}
|
||||
|
||||
\termitem{attributes}{Element, ListOfAttributes}
|
||||
\arg{ListOfAttributes} is a list of atoms representing the attributes
|
||||
of the element \arg{Element}.
|
||||
\arg{ListOfAttributes} is a list of atoms representing the attributes
|
||||
of the element \arg{Element}.
|
||||
|
||||
\termitem{attribute}{Element, Attribute, Type, Default}
|
||||
Query an element. \arg{Type} is one of \const{cdata}, \const{entity},
|
||||
@ -555,34 +591,34 @@ Query an element. \arg{Type} is one of \const{cdata}, \const{entity},
|
||||
\const{notation}, \const{number} or \const{nutoken}. For DTD types that
|
||||
allow for a list, the notation \term{list}{Type} is used. Finally, the
|
||||
DTD construct \verb$(a|b|...)$ is mapped to the term
|
||||
\term{nameof}{ListOfValues}.
|
||||
\term{nameof}{ListOfValues}.
|
||||
|
||||
\arg{Default} describes the sgml default. It is one of \const{required},
|
||||
\const{current}, \const{conref} or \const{implied}. If a real default is
|
||||
present, it is one of \term{default}{Value} or \term{fixed}{Value}.
|
||||
present, it is one of \term{default}{Value} or \term{fixed}{Value}.
|
||||
|
||||
\termitem{entities}{ListOfEntities}
|
||||
\arg{ListOfEntities} is a list of atoms representing the names of the
|
||||
defined entities.
|
||||
\arg{ListOfEntities} is a list of atoms representing the names of the
|
||||
defined entities.
|
||||
|
||||
\termitem{entity}{Name, Value}
|
||||
\arg{Name} is the name of an entity with given value. Value is one of
|
||||
\arg{Name} is the name of an entity with given value. Value is one of
|
||||
\begin{description}
|
||||
|
||||
\termitem{\arg{Atom}}{}
|
||||
If the value is atomic, it represents the literal value of the entity.
|
||||
If the value is atomic, it represents the literal value of the entity.
|
||||
|
||||
\termitem{system}{Url}
|
||||
\arg{Url} is the URL of the system external entity.
|
||||
\arg{Url} is the URL of the system external entity.
|
||||
|
||||
\termitem{public}{Id, Url}
|
||||
For external public entities, \arg{Id} is the identifier. If an URL is
|
||||
provided this is returned in \arg{Url}. Otherwise this argument is
|
||||
unbound.
|
||||
For external public entities, \arg{Id} is the identifier. If an URL is
|
||||
provided this is returned in \arg{Url}. Otherwise this argument is
|
||||
unbound.
|
||||
\end{description}
|
||||
|
||||
\termitem{notations}{ListOfNotations}
|
||||
Returns a list holding the names of all \const{NOTATION} declarations.
|
||||
Returns a list holding the names of all \const{NOTATION} declarations.
|
||||
|
||||
\termitem{notation}{Name, Decl}
|
||||
Unify \arg{Decl} with a list of \term{system}{+File} and/or
|
||||
@ -592,11 +628,11 @@ Unify \arg{Decl} with a list if \term{system}{+File} and/or
|
||||
|
||||
\subsubsection{The DOCTYPE declaration}
|
||||
|
||||
As this parser allows for processing partial documents and processing the
|
||||
As this parser allows for processing partial documents and processing the
|
||||
DTD separately, the DOCTYPE declaration plays a special role.
|
||||
|
||||
If a document has no DOCTYPE declaration, the parser returns a list
|
||||
holding all elements and CDATA found. If the document has a DOCTYPE
|
||||
If a document has no DOCTYPE declaration, the parser returns a list
|
||||
holding all elements and CDATA found. If the document has a DOCTYPE
|
||||
declaration, the parser will open the element defined in the DOCTYPE as
|
||||
soon as the first real data is encountered.
|
||||
|
||||
@ -632,53 +668,63 @@ elements_in_xml_document(File, Elements) :-
|
||||
|
||||
\begin{description}
|
||||
\predicate{new_sgml_parser}{2}{-Parser, +Options}
|
||||
Creates a new parser. A parser can be used one or multiple times for
|
||||
parsing documents or parts thereof. It may be bound to a DTD or the DTD
|
||||
may be left implicit, in which case it is created from the document
|
||||
Creates a new parser. A parser can be used one or multiple times for
|
||||
parsing documents or parts thereof. It may be bound to a DTD or the DTD
|
||||
may be left implicit, in which case it is created from the document
|
||||
prologue or parsing is performed without a DTD. Options:
|
||||
\begin{description}
|
||||
\termitem{dtd}{?DTD}
|
||||
If specified with an initialised DTD, this DTD is used for parsing the
|
||||
document, regardless of the document prologue. If specified as a
|
||||
variable, a reference to the created DTD is returned. This DTD may be
|
||||
created from the document prologue or built implicitly from the
|
||||
document's content.
|
||||
If specified with an initialised DTD, this DTD is used for parsing the
|
||||
document, regardless of the document prologue. If specified as a
|
||||
variable, a reference to the created DTD is returned. This DTD may be
|
||||
created from the document prologue or built implicitly from the
|
||||
document's content.
|
||||
\end{description}
|
||||
|
||||
\predicate{free_sgml_parser}{1}{+Parser}
|
||||
Destroy all resources related to the parser. This does not destroy the
|
||||
Destroy all resources related to the parser. This does not destroy the
|
||||
DTD if the parser was created using the \term{dtd}{DTD} option.
|
||||
|
||||
\predicate{set_sgml_parser}{2}{+Parser, +Option}
|
||||
Sets attributes to the parser. Currently defined attributes:
|
||||
Sets attributes to the parser. Currently defined attributes:
|
||||
|
||||
\begin{description}
|
||||
\termitem{file}{File}
|
||||
Sets the file for reporting errors and warnings. Sets the line to 1.
|
||||
Sets the file for reporting errors and warnings. Sets the line to 1.
|
||||
\termitem{line}{Line}
|
||||
Sets the current line. Useful if the stream is not at the start of the
|
||||
(file) object for generating proper line-numbers.
|
||||
Sets the current line. Useful if the stream is not at the start of the
|
||||
(file) object for generating proper line-numbers.
|
||||
\termitem{charpos}{Offset}
|
||||
Sets the current character location. See also the \term{file}{File}
|
||||
option.
|
||||
\termitem{dialect}{Dialect}
|
||||
Set the markup dialect. Known dialects:
|
||||
Set the markup dialect. Known dialects:
|
||||
\begin{description}
|
||||
|
||||
\termitem{sgml}{}
|
||||
The default dialect is to process as SGML. This implies markup is
|
||||
case-insensitive and standard SGML abbreviation is allowed (abbreviated
|
||||
attributes and omitted tags).
|
||||
The default dialect is to process as SGML. This implies markup is
|
||||
case-insensitive and standard SGML abbreviation is allowed (abbreviated
|
||||
attributes and omitted tags).
|
||||
|
||||
\termitem{xml}{}
|
||||
This dialect is selected automatically if the processing instruction
|
||||
\verb$<?xml ...>$ is encountered. See \secref{xml} for details.
|
||||
\verb$<?xml ...>$ is encountered. See \secref{xml} for details.
|
||||
|
||||
\termitem{xmlns}{}
|
||||
Process file as XML file with namespace support. See \secref{xmlns} for
|
||||
details. See also the \verb$qualify_attributes$ option below.
|
||||
\end{description}
|
||||
|
||||
\termitem{xmlns}{+URI}
|
||||
Set the default namespace of the outer environment. This option is
|
||||
provided to process partial XML content with proper namespace
|
||||
resolution.
|
||||
|
||||
\termitem{xmlns}{+NS, +URI}
|
||||
Specify a namespace for the outer environment. This option is
|
||||
provided to process partial XML content with proper namespace
|
||||
resolution.
|
||||
|
||||
\termitem{qualify_attributes}{Boolean}
|
||||
How to handle unqualified attributes (i.e. without an explicit namespace)
|
||||
in XML namespace (\const{xmlns}) mode. Default and standard compliant is
|
||||
@ -715,20 +761,20 @@ sgml_parse/2.
|
||||
\end{description}
|
||||
|
||||
\predicate{get_sgml_parser}{2}{+Parser, -Option}
|
||||
Retrieve information on the current status of the parser. Notably useful
|
||||
if the parser is used in the call-back mode. Currently defined options:
|
||||
Retrieve information on the current status of the parser. Notably useful
|
||||
if the parser is used in the call-back mode. Currently defined options:
|
||||
|
||||
\begin{description}
|
||||
\termitem{file}{-File}
|
||||
Current file-name. Note that this may be different from the provided
|
||||
file if an external entity is being loaded.
|
||||
Current file-name. Note that this may be different from the provided
|
||||
file if an external entity is being loaded.
|
||||
|
||||
\termitem{line}{-Line}
|
||||
Line-offset from where the parser started its processing in the file-object.
|
||||
Line-offset from where the parser started its processing in the file-object.
|
||||
|
||||
\termitem{charpos}{-CharPos}
|
||||
Offset from where the parser started its processing in the file-object.
|
||||
See \secref{indexaccess}.
|
||||
Offset from where the parser started its processing in the file-object.
|
||||
See \secref{indexaccess}.
|
||||
|
||||
\termitem{charpos}{-Start, -End}
|
||||
Character offsets of the start and end of the source processed causing the
|
||||
@ -736,8 +782,8 @@ current call-back. Used in \program{PceEmacs} to for colouring
|
||||
text in SGML and XML modes.
|
||||
|
||||
\termitem{source}{-Stream}
|
||||
Prolog stream being processed. May be used in the \const{on_begin}, \emph{etc.}
|
||||
callbacks from sgml_parse/2.
|
||||
Prolog stream being processed. May be used in the \const{on_begin}, \emph{etc.}
|
||||
callbacks from sgml_parse/2.
|
||||
|
||||
\termitem{dialect}{-Dialect}
|
||||
Return the current dialect used by the parser (\const{sgml}, \const{xml} or \const{xmlns}).
|
||||
@ -822,8 +868,8 @@ Input is a stream. A full description of the option-list is below.
|
||||
|
||||
\begin{description}
|
||||
\termitem{document}{+Term}
|
||||
A variable that will be unified with a list describing the content of
|
||||
the document (see load_structure/2).
|
||||
A variable that will be unified with a list describing the content of
|
||||
the document (see load_structure/2).
|
||||
\termitem{source}{+Stream}
|
||||
An input stream that is read. This option \emph{must} be given.
|
||||
\termitem{content_length}{+Characters}
|
||||
@ -840,7 +886,7 @@ Default. Parse everything upto the end of the input.
|
||||
The parser stops after reading the first element. Using
|
||||
\term{source}{Stream}, this implies reading is stopped as soon
|
||||
as the element is complete, and another call may be issued on the same
|
||||
stream to read the next element.
|
||||
stream to read the next element.
|
||||
|
||||
\termitem{content}{}
|
||||
The value \const{content} is like \const{element} but assumes the
|
||||
@ -860,9 +906,9 @@ all open elements.
|
||||
\end{description}
|
||||
|
||||
\termitem{max_errors}{+MaxErrors}
|
||||
Set the maximum number of errors. If this number is exceeded further
|
||||
writes to the stream will yield an I/O error exception. Printing of
|
||||
errors is suppressed after reaching this value. The default is 100.
|
||||
Set the maximum number of errors. If this number is exceeded further
|
||||
writes to the stream will yield an I/O error exception. Printing of
|
||||
errors is suppressed after reaching this value. The default is 100.
|
||||
\termitem{syntax_errors}{+ErrorMode}
|
||||
Defines how syntax errors are handled.
|
||||
\begin{description}
|
||||
@ -875,28 +921,35 @@ Defines how syntax errors are handled.
|
||||
using <pref builtin>print_message/2 with severity
|
||||
\const{informational}.
|
||||
\end{description}
|
||||
|
||||
\termitem{xml_no_ns}{+Mode}
|
||||
Error handling if an XML namespace is not defined. Default generates
|
||||
an error. If \const{quiet}, the error is suppressed. Can be used
|
||||
together with \term{call}{urlns, Closure} to provide external expansion
|
||||
of namespaces. See also \secref{xmlns}.
|
||||
|
||||
\termitem{call}{+Event, :PredicateName}
|
||||
Issue call-backs on the specified events. \arg{PredicateName} is the
|
||||
name of the predicate to call on this event, possibly prefixed with a
|
||||
Issue call-backs on the specified events. \arg{PredicateName} is the
|
||||
name of the predicate to call on this event, possibly prefixed with a
|
||||
module identifier. If the handler throws an exception, parsing is stopped
|
||||
and sgml_parse/2 re-throws the exception. The defined events are:
|
||||
\begin{description}
|
||||
\termitem{begin}{}
|
||||
An open-tag has been parsed. The named handler is called with three
|
||||
arguments: \term{\arg{Handler}}{+Tag, +Attributes, +Parser}.
|
||||
An open-tag has been parsed. The named handler is called with three
|
||||
arguments: \term{\arg{Handler}}{+Tag, +Attributes, +Parser}.
|
||||
\termitem{end}{}
|
||||
A close-tag has been parsed. The named handler is called with two
|
||||
arguments: \term{\arg{Handler}}{+Tag, +Parser}.
|
||||
A close-tag has been parsed. The named handler is called with two
|
||||
arguments: \term{\arg{Handler}}{+Tag, +Parser}.
|
||||
|
||||
\termitem{cdata}{}
|
||||
CDATA has been parsed. The named handler is called with two arguments:
|
||||
\term{Handler}{+CDATA, +Parser}, where CDATA is an atom
|
||||
representing the data.
|
||||
representing the data.
|
||||
|
||||
\termitem{pi}{}
|
||||
A processing instruction has been parsed. The named handler is called
|
||||
A processing instruction has been parsed. The named handler is called
|
||||
with two arguments: \term{\arg{Handler}}{+Text, +Parser}, where
|
||||
\arg{Text} is the text of the processing instruction.
|
||||
\arg{Text} is the text of the processing instruction.
|
||||
|
||||
\termitem{decl}{}
|
||||
A declaration (\verb$<!...>$) has been read. The named handler is
|
||||
@ -918,33 +971,33 @@ If this option is present, errors and warnings are not reported using
|
||||
print_message/3
|
||||
|
||||
\termitem{xmlns}{}
|
||||
When parsing in \const{xmlns} mode, a new namespace declaration is
|
||||
pushed on the environment. The named handler is called with three
|
||||
When parsing in \const{xmlns} mode, a new namespace declaration is
|
||||
pushed on the environment. The named handler is called with three
|
||||
arguments: \term{\arg{Handler}}{+NameSpace, +URL, +Parser}.
|
||||
See \secref{xmlns} for details.
|
||||
See \secref{xmlns} for details.
|
||||
|
||||
\termitem{urlns}{}
|
||||
When parsing in \const{xmlns} mode, this predicate can be used to map a
|
||||
url into either a canonical URL for this namespace or another internal
|
||||
identifier. See \secref{xmlns} for details.
|
||||
When parsing in \const{xmlns} mode, this predicate can be used to map a
|
||||
url into either a canonical URL for this namespace or another internal
|
||||
identifier. See \secref{xmlns} for details.
|
||||
\end{description}
|
||||
\end{description}
|
||||
\end{description}
|
||||
|
||||
\subsubsection{Partial Parsing}
|
||||
|
||||
In some cases, part of a document needs to be parsed. One option is to
|
||||
use load_structure/2 or one of its variations and extract
|
||||
the desired elements from the returned structure. This is a clean
|
||||
solution, especially on small and medium-sized documents. It however is
|
||||
unsuitable for parsing really big documents. Such documents can only be
|
||||
In some cases, part of a document needs to be parsed. One option is to
|
||||
use load_structure/2 or one of its variations and extract
|
||||
the desired elements from the returned structure. This is a clean
|
||||
solution, especially on small and medium-sized documents. It however is
|
||||
unsuitable for parsing really big documents. Such documents can only be
|
||||
handled with the call-back output interface realised by the
|
||||
\term{call}{Event, Action} option of sgml_parse/2.
|
||||
Event-driven processing is not very natural in Prolog.
|
||||
|
||||
The SGML2PL library allows for a mixed approach. Consider the case where
|
||||
we want to process all descriptions from RDF elements in a document. The
|
||||
code below calls \verb$process_rdf_description(Element)$ on each element
|
||||
code below calls \verb$process_rdf_description(Element)$ on each element
|
||||
that is directly inside an RDF element.
|
||||
|
||||
\begin{code}
|
||||
@ -994,26 +1047,28 @@ set_sgml_parser/2 or, for XML, based on the \const{encoding}
|
||||
attribute of the XML header. The parser reads from SWI-Prolog streams,
|
||||
which also provide encoding handling. Therefore, there are two modes
|
||||
for parsing. If the SWI-Prolog stream has encoding \const{octet} (which
|
||||
is the default for binary streams), the decoder of the SGML parser will
|
||||
is the default for binary streams), the decoder of the SGML parser will
|
||||
be used and positions reported by the parser are octet offsets in the
|
||||
stream. In other cases, the Prolog stream decoder is used and offsets
|
||||
are character code counts.
|
||||
|
||||
\input{xpath.tex}
|
||||
|
||||
\section{Processing Indexed Files} \label{sec:indexaccess}
|
||||
|
||||
In some cases applications wish to process small portions of large
|
||||
SGML, XML or RDF files. For example, the \emph{OpenDirectory} project
|
||||
by Netscape has produced a 90MB RDF file representing the main index.
|
||||
The parser described here can process this document as a unit, but
|
||||
loading takes 85 seconds on a Pentium-II 450 and the resulting term
|
||||
requires about 70MB global stack. One option is to process the entire
|
||||
document and output it as a Prolog fact-base of RDF triplets, but in
|
||||
many cases this is undesirable. Another example is a large SGML file
|
||||
containing online documentation. The application normally wishes to
|
||||
provide only small portions at a time to the user. Loading the entire
|
||||
In some cases applications wish to process small portions of large
|
||||
SGML, XML or RDF files. For example, the \emph{OpenDirectory} project
|
||||
by Netscape has produced a 90MB RDF file representing the main index.
|
||||
The parser described here can process this document as a unit, but
|
||||
loading takes 85 seconds on a Pentium-II 450 and the resulting term
|
||||
requires about 70MB global stack. One option is to process the entire
|
||||
document and output it as a Prolog fact-base of RDF triplets, but in
|
||||
many cases this is undesirable. Another example is a large SGML file
|
||||
containing online documentation. The application normally wishes to
|
||||
provide only small portions at a time to the user. Loading the entire
|
||||
document into memory is then undesirable.
|
||||
|
||||
Using the \term{parse}{element} option, we open a file, seek
|
||||
Using the \term{parse}{element} option, we open a file, seek
|
||||
(using <pref builtin>seek/4) to the position of the element and
|
||||
read the desired element.
|
||||
|
||||
@ -1059,12 +1114,12 @@ rdf_element(Id, Term) :-
|
||||
|
||||
\section{External entities}
|
||||
|
||||
While processing an SGML document the document may refer to external
|
||||
data. This occurs in three places: external parameter entities, normal
|
||||
external entities and the \const{DOCTYPE} declaration. The current version
|
||||
of this tool deals rather primitively with external data. External
|
||||
entities can only be loaded from a file and the mapping between the
|
||||
entity names and the file is done using a \jargon{catalog} file in a
|
||||
While processing an SGML document the document may refer to external
|
||||
data. This occurs in three places: external parameter entities, normal
|
||||
external entities and the \const{DOCTYPE} declaration. The current version
|
||||
of this tool deals rather primitively with external data. External
|
||||
entities can only be loaded from a file and the mapping between the
|
||||
entity names and the file is done using a \jargon{catalog} file in a
|
||||
format compatible with that used by James Clark's SP Parser,
|
||||
based on the SGML Open (now OASIS) specification.
|
||||
|
||||
@ -1075,23 +1130,23 @@ sgml_register_catalog_file/2 or the environment variable
|
||||
\begin{description}
|
||||
\predicate{sgml_register_catalog_file}{2}{+File, +Location}
|
||||
Register the indicated \arg{File} as a catalog file. \arg{Location} is
|
||||
either \const{start} or \const{end} and defines whether the catalog is
|
||||
either \const{start} or \const{end} and defines whether the catalog is
|
||||
considered first or last. This predicate has no effect if \arg{File} is
|
||||
already part of the catalog.
|
||||
|
||||
If no files are registered using this predicate, the first query on the
|
||||
If no files are registered using this predicate, the first query on the
|
||||
catalog examines \env{SGML_CATALOG_FILES} and fills the catalog with
|
||||
all files in this path.
|
||||
all files in this path.
|
||||
\end{description}
|
||||
|
||||
Two types of lines are used by this package.
|
||||
|
||||
\begin{quote}
|
||||
\const{DOCTYPE} \arg{doctype} \arg{file} \\
|
||||
\const{PUBLIC} \exam{"}\arg{Id}\exam{"} \arg{file}
|
||||
\const{PUBLIC} \exam{"}\arg{Id}\exam{"} \arg{file}
|
||||
\end{quote}
|
||||
|
||||
The specified \arg{file} path is taken relative to the location of the
|
||||
The specified \arg{file} path is taken relative to the location of the
|
||||
catalog file. For the \const{DOCTYPE} declaration, \pllib{sgml} first
|
||||
makes an attempt to resolve the \const{SYSTEM} or \const{PUBLIC}
|
||||
identifier. If this fails it tries to resolve the \arg{doctype} using
|
||||
@ -1102,10 +1157,12 @@ where system identifiers must be Universal Resource Indicators, not
|
||||
local file names. Simple uses of relative URIs will work correctly under
|
||||
UNIX and Windows.
|
||||
|
||||
In the future we will design a call-back mechanism for locating and
|
||||
processing external entities, so Prolog-based file-location and Prolog
|
||||
In the future we will design a call-back mechanism for locating and
|
||||
processing external entities, so Prolog-based file-location and Prolog
|
||||
resources can be used to store external entities.
|
||||
|
||||
\input{pwp.tex}
|
||||
|
||||
\section{Writing markup}
|
||||
|
||||
\subsection{Writing documents}
|
||||
@ -1149,14 +1206,14 @@ elements are written using increasing indentation. This introduces
|
||||
(depending on the mode and defined whitespace handling) CDATA sequences
|
||||
with only layout between elements when read back in. If \const{false}, no
|
||||
layout characters are added. As this mode does not need to analyse the
|
||||
document it is faster and guarantees correct output when read back.
|
||||
Unfortunately the output is hardly human readable and causes problems
|
||||
document it is faster and guarantees correct output when read back.
|
||||
Unfortunately the output is hardly human readable and causes problems
|
||||
with many editors.
|
||||
\termitem{indent}{Integer}
|
||||
Set the initial element indentation. If more than zero, the indent
|
||||
is written before the document.
|
||||
\termitem{nsmap}{Map}
|
||||
Set the initial namespace map. \arg{Map} is a list of
|
||||
Set the initial namespace map. \arg{Map} is a list of
|
||||
\arg{Name} = \arg{URI}. This option, together with \const{header} and
|
||||
\const{indent} is added to use xml_write/3 to generate XML
|
||||
that is embedded in a larger XML document.
|
||||
@ -1197,7 +1254,7 @@ values are \const{ascii}, \const{iso_latin_1}, \const{utf8} and
|
||||
\const{unicode}. Versions with two arguments are provided for backward
|
||||
compatibility, making the safe \const{ascii} encoding assumption.
|
||||
|
||||
\begin{description}
|
||||
\begin{description}
|
||||
\predicate{xml_quote_attribute}{3}{+In, -Quoted, +Encoding}
|
||||
Map the characters that may not appear in XML attributes to entities.
|
||||
Currently these are \verb$<>&"$.%
|
||||
@ -1222,8 +1279,8 @@ Assumes \const{ascii} encoding.
|
||||
Succeed if \arg{In} is an atom or string that satisfies the rules for
|
||||
a valid XML element or attribute name. As with the other predicates in
|
||||
this group, if \arg{Encoding} cannot represent one of the characters, this
|
||||
function fails. It uses a hard-coded table for ASCII-range characters and
|
||||
iswalpha()/iswalnum() for the first and remaining characters of the name.
|
||||
function fails. Character classification is based on
|
||||
\url{http://www.w3.org/TR/2006/REC-xml-20060816}.
|
||||
|
||||
\predicate{xml_name}{1}{+In}
|
||||
Backward compatibility version for xml_name/2. Assumes \const{ascii}
|
||||
@ -1238,8 +1295,8 @@ Known missing SGML features include
|
||||
|
||||
\begin{itemlist}
|
||||
\item [NOTATION on entities]
|
||||
Though notation is parsed, notation attributes on external entity
|
||||
declarations are not handed to the user.
|
||||
Though notation is parsed, notation attributes on external entity
|
||||
declarations are not handed to the user.
|
||||
\item [NOTATION attributes]
|
||||
SGML notations may have attributes, declared using
|
||||
\verb$<!ATTLIST #NOTATION name attributes>$. Those data attributes
|
||||
@ -1261,8 +1318,8 @@ Empty start tags (\verb$<>$), unclosed start tags
|
||||
(\verb$<a<b$) and unclosed end tags (\verb$</a<b$) are not
|
||||
supported.
|
||||
\item [SGML declaration]
|
||||
The `SGML declaration' is fixed, though most of the parameters are
|
||||
handled through indirections in the implementation.
|
||||
The `SGML declaration' is fixed, though most of the parameters are
|
||||
handled through indirections in the implementation.
|
||||
\item [The DATATAG feature]
|
||||
It is regarded as superseded by SHORTREF, which is supported.
|
||||
(SP does not support it either.)
|
||||
@ -1276,7 +1333,7 @@ one DTD at the same time. It is not supported.
|
||||
\end{itemlist}
|
||||
|
||||
|
||||
In XML mode the parser recognises SGML constructs that are not allowed
|
||||
In XML mode the parser recognises SGML constructs that are not allowed
|
||||
in XML. Also various extensions of XML over SGML are not yet realised.
|
||||
In particular, XInclude is not implemented because the designers of
|
||||
XInclude can't make up their minds whether to base it on elements or
|
||||
@ -1305,7 +1362,7 @@ refers to the SWI-Prolog `home-directory'.
|
||||
|
||||
\section{Acknowledgements}
|
||||
|
||||
The Prolog representation for parsed documents is based on the
|
||||
The Prolog representation for parsed documents is based on the
|
||||
SWI-Prolog interface to SP by Anjo Anjewierden.
|
||||
|
||||
Richard O'Keefe has put a lot of effort testing and providing bug
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -61,7 +61,7 @@ sgml__utf8_get_char(const char *in, int *chr)
|
||||
}
|
||||
|
||||
*chr = *in;
|
||||
|
||||
|
||||
return (char *)in+1;
|
||||
}
|
||||
|
||||
|
@ -26,7 +26,6 @@
|
||||
|
||||
#define UTIL_H_IMPLEMENTATION
|
||||
#include "util.h"
|
||||
#include <unistd.h>
|
||||
#include <ctype.h>
|
||||
#include <wctype.h>
|
||||
#include <stdlib.h>
|
||||
@ -50,7 +49,7 @@
|
||||
size_t
|
||||
istrlen(const ichar *s)
|
||||
{ size_t len =0;
|
||||
|
||||
|
||||
while(*s++)
|
||||
len++;
|
||||
|
||||
@ -67,7 +66,7 @@ istrdup(const ichar *s)
|
||||
while(*s)
|
||||
*d++ = *s++;
|
||||
*d = 0;
|
||||
|
||||
|
||||
return dup;
|
||||
} else
|
||||
{ return NULL;
|
||||
@ -140,10 +139,10 @@ int
|
||||
istreq(const ichar *s1, const ichar *s2)
|
||||
{ while(*s1 && *s1 == *s2)
|
||||
s1++, s2++;
|
||||
|
||||
|
||||
if ( *s1 == 0 && *s2 == 0 )
|
||||
return TRUE;
|
||||
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -152,10 +151,10 @@ int
|
||||
istrncaseeq(const ichar *s1, const ichar *s2, int len)
|
||||
{ while(--len >= 0 && towlower(*s1) == towlower(*s2))
|
||||
s1++, s2++;
|
||||
|
||||
|
||||
if ( len < 0 )
|
||||
return TRUE;
|
||||
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -164,10 +163,10 @@ int
|
||||
istrprefix(const ichar *pref, const ichar *s)
|
||||
{ while(*pref && *pref == *s)
|
||||
pref++, s++;
|
||||
|
||||
|
||||
if ( *pref == 0 )
|
||||
return TRUE;
|
||||
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -212,7 +211,7 @@ istrhash(const ichar *t, int tsize)
|
||||
|
||||
while(*t)
|
||||
{ unsigned int c = *t++;
|
||||
|
||||
|
||||
c -= 'a';
|
||||
value ^= c << (shift & 0xf);
|
||||
shift ^= c;
|
||||
@ -231,7 +230,7 @@ istrcasehash(const ichar *t, int tsize)
|
||||
|
||||
while(*t)
|
||||
{ unsigned int c = towlower(*t++); /* case insensitive */
|
||||
|
||||
|
||||
c -= 'a';
|
||||
value ^= c << (shift & 0xf);
|
||||
shift ^= c;
|
||||
@ -301,7 +300,7 @@ __add_icharbuf(icharbuf *buf, int chr)
|
||||
else
|
||||
buf->data = sgml_malloc(buf->allocated*sizeof(ichar));
|
||||
}
|
||||
|
||||
|
||||
buf->data[buf->size++] = chr;
|
||||
}
|
||||
|
||||
@ -349,7 +348,7 @@ init_ocharbuf(ocharbuf *buf)
|
||||
ocharbuf *
|
||||
new_ocharbuf()
|
||||
{ ocharbuf *buf = sgml_malloc(sizeof(*buf));
|
||||
|
||||
|
||||
return init_ocharbuf(buf);
|
||||
}
|
||||
|
||||
@ -436,24 +435,76 @@ empty_ocharbuf(ocharbuf *buf)
|
||||
*******************************/
|
||||
|
||||
#define RINGSIZE 16
|
||||
static void *ring[RINGSIZE];
|
||||
static int ringp;
|
||||
|
||||
typedef struct ring
|
||||
{ void *ring[RINGSIZE];
|
||||
int ringp;
|
||||
} ring;
|
||||
|
||||
#ifdef _REENTRANT
|
||||
#include <pthread.h>
|
||||
static pthread_key_t ring_key;
|
||||
|
||||
static void
|
||||
free_ring(void *ptr)
|
||||
{ ring *r = ptr;
|
||||
int i;
|
||||
void **bp;
|
||||
|
||||
for(i=0, bp=r->ring; i<RINGSIZE; i++, bp++)
|
||||
{ if ( *bp )
|
||||
{ sgml_free(*bp);
|
||||
*bp = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
sgml_free(r);
|
||||
}
|
||||
|
||||
|
||||
static ring *
|
||||
my_ring()
|
||||
{ ring *r;
|
||||
|
||||
if ( (r=pthread_getspecific(ring_key)) )
|
||||
return r;
|
||||
|
||||
if ( (r = sgml_calloc(1, sizeof(*r))) )
|
||||
pthread_setspecific(ring_key, r);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
void
|
||||
init_ring(void)
|
||||
{ pthread_key_create(&ring_key, free_ring);
|
||||
}
|
||||
|
||||
#else
|
||||
static ring ring_store;
|
||||
#define my_ring() (&ring_store)
|
||||
|
||||
void init_ring(void) {}
|
||||
#endif
|
||||
|
||||
|
||||
wchar_t *
|
||||
str2ring(const wchar_t *in)
|
||||
{ wchar_t *copy = sgml_malloc((wcslen(in)+1)*sizeof(wchar_t));
|
||||
{ ring *r;
|
||||
wchar_t *copy;
|
||||
|
||||
if ( !copy )
|
||||
if ( !(r=my_ring()) ||
|
||||
!(copy = sgml_malloc((wcslen(in)+1)*sizeof(wchar_t))) )
|
||||
{ sgml_nomem();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
wcscpy(copy, in);
|
||||
if ( ring[ringp] )
|
||||
sgml_free(ring[ringp]);
|
||||
ring[ringp++] = copy;
|
||||
if ( ringp == RINGSIZE )
|
||||
ringp = 0;
|
||||
if ( r->ring[r->ringp] )
|
||||
sgml_free(r->ring[r->ringp]);
|
||||
r->ring[r->ringp++] = copy;
|
||||
if ( r->ringp == RINGSIZE )
|
||||
r->ringp = 0;
|
||||
|
||||
return copy;
|
||||
}
|
||||
@ -461,13 +512,19 @@ str2ring(const wchar_t *in)
|
||||
|
||||
void *
|
||||
ringallo(size_t size)
|
||||
{ char *result = sgml_malloc(size);
|
||||
|
||||
if ( ring[ringp] )
|
||||
sgml_free(ring[ringp]);
|
||||
ring[ringp++] = result;
|
||||
if ( ringp == RINGSIZE )
|
||||
ringp = 0;
|
||||
{ ring *r;
|
||||
char *result;
|
||||
|
||||
if ( !(r=my_ring()) || !(result = sgml_malloc(size)) )
|
||||
{ sgml_nomem();
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if ( r->ring[r->ringp] )
|
||||
sgml_free(r->ring[r->ringp]);
|
||||
r->ring[r->ringp++] = result;
|
||||
if ( r->ringp == RINGSIZE )
|
||||
r->ringp = 0;
|
||||
|
||||
return result;
|
||||
}
|
||||
@ -529,7 +586,7 @@ wcstoutf8(const wchar_t *in)
|
||||
{ size++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
rc = sgml_malloc(size+1);
|
||||
for(o=rc, s=in; *s; s++)
|
||||
{ o = utf8_put_char(o, *s);
|
||||
@ -605,7 +662,7 @@ load_sgml_file_to_charp(const ichar *file, int normalise_rsre, size_t *length)
|
||||
|
||||
if ( r )
|
||||
{ char *s = r;
|
||||
|
||||
|
||||
while(len>0)
|
||||
{ int n;
|
||||
|
||||
@ -652,7 +709,7 @@ load_sgml_file_to_charp(const ichar *file, int normalise_rsre, size_t *length)
|
||||
|
||||
if ( last_is_lf )
|
||||
r2[--len] = '\0'; /* delete last LF */
|
||||
|
||||
|
||||
if ( length )
|
||||
*length = len;
|
||||
sgml_free(r);
|
||||
|
@ -34,16 +34,16 @@
|
||||
#include <malloc.h>
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
typedef struct
|
||||
{ int allocated;
|
||||
int size;
|
||||
ichar *data;
|
||||
} icharbuf;
|
||||
|
||||
typedef struct
|
||||
typedef struct
|
||||
{ int allocated;
|
||||
int size;
|
||||
union
|
||||
union
|
||||
{ wchar_t *w; /* UCS */
|
||||
} data;
|
||||
wchar_t localbuf[256]; /* Initial local store */
|
||||
@ -98,6 +98,7 @@ void empty_ocharbuf(ocharbuf *buf);
|
||||
{ buf->data.w[at] = chr; \
|
||||
}
|
||||
|
||||
void init_ring(void);
|
||||
const wchar_t * str_summary(const wchar_t *s, int len);
|
||||
wchar_t * str2ring(const wchar_t *in);
|
||||
void * ringallo(size_t);
|
||||
@ -107,8 +108,6 @@ ichar * load_sgml_file_to_charp(const ichar *file, int normalise_rsre,
|
||||
size_t *len);
|
||||
FILE * wfopen(const wchar_t *name, const char *mode);
|
||||
|
||||
void wputs(ichar *s);
|
||||
|
||||
#if defined(USE_STRING_FUNCTIONS) && !defined(UTIL_H_IMPLEMENTATION)
|
||||
|
||||
#define istrlen(s1) wcslen((s1))
|
||||
|
@ -29,8 +29,6 @@
|
||||
the GNU General Public License.
|
||||
*/
|
||||
|
||||
#include "xml_unicode.h"
|
||||
|
||||
|
||||
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
||||
XML character classification.
|
||||
|
@ -29,35 +29,36 @@
|
||||
|
||||
#ifdef XMLNS
|
||||
|
||||
static xmlns *
|
||||
xmlns *
|
||||
xmlns_push(dtd_parser *p, const ichar *ns, const ichar *url)
|
||||
{ sgml_environment *env = p->environments;
|
||||
dtd_symbol *n = (*ns ? dtd_add_symbol(p->dtd, ns) : (dtd_symbol *)NULL);
|
||||
dtd_symbol *u = dtd_add_symbol(p->dtd, url); /* TBD: ochar/ichar */
|
||||
xmlns *x = sgml_malloc(sizeof(*x));
|
||||
|
||||
if ( p->on_xmlns )
|
||||
(*p->on_xmlns)(p, n, u);
|
||||
x->name = n;
|
||||
x->url = u;
|
||||
|
||||
if ( env )
|
||||
{ xmlns *x = sgml_malloc(sizeof(*n));
|
||||
{ if ( p->on_xmlns )
|
||||
(*p->on_xmlns)(p, n, u);
|
||||
|
||||
x->name = n;
|
||||
x->url = u;
|
||||
x->next = env->xmlns;
|
||||
env->xmlns = x;
|
||||
|
||||
return x;
|
||||
} else
|
||||
{ x->next = p->xmlns;
|
||||
p->xmlns = x;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
return x;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
xmlns_free(sgml_environment *env)
|
||||
{ xmlns *n, *next;
|
||||
xmlns_free(xmlns *n)
|
||||
{ xmlns *next;
|
||||
|
||||
for(n = env->xmlns; n; n = next)
|
||||
for(; n; n = next)
|
||||
{ next = n->next;
|
||||
|
||||
sgml_free(n);
|
||||
@ -66,16 +67,22 @@ xmlns_free(sgml_environment *env)
|
||||
|
||||
|
||||
xmlns *
|
||||
xmlns_find(sgml_environment *env, dtd_symbol *ns)
|
||||
{ for(; env; env = env->parent)
|
||||
{ xmlns *n;
|
||||
xmlns_find(dtd_parser *p, dtd_symbol *ns)
|
||||
{ sgml_environment *env = p->environments;
|
||||
xmlns *n;
|
||||
|
||||
for(n=env->xmlns; n; n = n->next)
|
||||
for(; env; env = env->parent)
|
||||
{ for(n=env->xmlns; n; n = n->next)
|
||||
{ if ( n->name == ns )
|
||||
return n;
|
||||
}
|
||||
}
|
||||
|
||||
for (n=p->xmlns; n; n = n->next)
|
||||
{ if ( n->name == ns )
|
||||
return n;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -97,7 +104,7 @@ void
|
||||
update_xmlns(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts)
|
||||
{ dtd_attr_list *al;
|
||||
int nschr = p->dtd->charfunc->func[CF_NS]; /* : */
|
||||
|
||||
|
||||
for(al=e->attributes; al; al=al->next)
|
||||
{ dtd_attr *a = al->attribute;
|
||||
const ichar *name = a->name->name;
|
||||
@ -123,7 +130,7 @@ update_xmlns(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts)
|
||||
xmlns_resolve()
|
||||
Convert a symbol as returned by the XML level-1.0 parser to its namespace
|
||||
tuple {url}localname. This function is not used internally, but provided
|
||||
for use from the call-back functions of the parser.
|
||||
for use from the call-back functions of the parser.
|
||||
|
||||
It exploits the stack of namespace-environments managed by the parser
|
||||
itself (see update_xmlns())
|
||||
@ -150,7 +157,7 @@ xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id,
|
||||
if ( istrprefix(L"xml", buf) ) /* XML reserved namespaces */
|
||||
{ *url = n->name;
|
||||
return TRUE;
|
||||
} else if ( (ns = xmlns_find(p->environments, n)) )
|
||||
} else if ( (ns = xmlns_find(p, n)) )
|
||||
{ if ( ns->url->name[0] )
|
||||
*url = ns->url->name;
|
||||
else
|
||||
@ -158,7 +165,9 @@ xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id,
|
||||
return TRUE;
|
||||
} else
|
||||
{ *url = n->name; /* undefined namespace */
|
||||
gripe(ERC_EXISTENCE, L"namespace", n->name);
|
||||
if ( p->xml_no_ns == NONS_QUIET )
|
||||
return TRUE;
|
||||
gripe(p, ERC_EXISTENCE, L"namespace", n->name);
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
@ -195,16 +204,16 @@ xmlns_resolve_element(dtd_parser *p, const ichar **local, const ichar **url)
|
||||
ichar *o = buf;
|
||||
const ichar *s;
|
||||
xmlns *ns;
|
||||
|
||||
|
||||
for(s=id->name; *s; s++)
|
||||
{ if ( *s == nschr ) /* explicit namespace */
|
||||
{ dtd_symbol *n;
|
||||
|
||||
|
||||
*o = '\0';
|
||||
*local = s+1;
|
||||
n = dtd_add_symbol(dtd, buf);
|
||||
|
||||
if ( (ns = xmlns_find(p->environments, n)) )
|
||||
if ( (ns = xmlns_find(p, n)) )
|
||||
{ if ( ns->url->name[0] )
|
||||
*url = ns->url->name;
|
||||
else
|
||||
@ -213,17 +222,19 @@ xmlns_resolve_element(dtd_parser *p, const ichar **local, const ichar **url)
|
||||
return TRUE;
|
||||
} else
|
||||
{ *url = n->name; /* undefined namespace */
|
||||
gripe(ERC_EXISTENCE, "namespace", n->name);
|
||||
e->thisns = xmlns_push(p, n->name, n->name); /* define implicitly */
|
||||
if ( p->xml_no_ns == NONS_QUIET )
|
||||
return TRUE;
|
||||
gripe(p, ERC_EXISTENCE, L"namespace", n->name);
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
*o++ = *s;
|
||||
}
|
||||
|
||||
|
||||
*local = id->name;
|
||||
|
||||
if ( (ns = xmlns_find(p->environments, NULL)) )
|
||||
|
||||
if ( (ns = xmlns_find(p, NULL)) )
|
||||
{ if ( ns->url->name[0] )
|
||||
*url = ns->url->name;
|
||||
else
|
||||
|
@ -31,8 +31,9 @@ typedef struct _xmlns
|
||||
struct _xmlns *next; /* next name */
|
||||
} xmlns;
|
||||
|
||||
void xmlns_free(sgml_environment *env);
|
||||
xmlns* xmlns_find(sgml_environment *env, dtd_symbol *ns);
|
||||
void xmlns_free(xmlns *list);
|
||||
xmlns* xmlns_find(dtd_parser *p, dtd_symbol *ns);
|
||||
xmlns * xmlns_push(dtd_parser *p, const ichar *ns, const ichar *url);
|
||||
void update_xmlns(dtd_parser *p, dtd_element *e,
|
||||
int natts, sgml_attribute *atts);
|
||||
int xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id,
|
||||
|
Reference in New Issue
Block a user