update sgml package.

This commit is contained in:
Vitor Santos Costa 2010-05-06 10:59:09 +01:00
parent 0fabe2b9c6
commit 261b5163c7
20 changed files with 1431 additions and 1010 deletions

View File

@ -155,7 +155,7 @@ localpath(const ichar *ref, const ichar *name)
}
static int
int
register_catalog_file_unlocked(const ichar *file, catalog_location where)
{ catalog_file **f = &catalog;
catalog_file *cf;
@ -205,7 +205,7 @@ wgetenv(const char *name)
static void
init_catalog(void)
init_catalog()
{ static int done = FALSE;
LOCK();
@ -241,7 +241,7 @@ init_catalog(void)
int
register_catalog_file(const ichar *file, catalog_location where)
{ int rc;
init_catalog();
LOCK();
@ -310,7 +310,7 @@ cs_streql(ichar const *a, ichar const *b)
static int
scan_overflow(size_t buflen)
{ gripe(ERC_REPRESENTATION, L"token length");
{ gripe(NULL, ERC_REPRESENTATION, L"token length");
return EOF;
}
@ -439,7 +439,7 @@ load_one_catalogue(catalog_file * file)
int override = 0;
if ( !src )
{ gripe(ERC_NO_CATALOGUE, file->file);
{ gripe(NULL, ERC_NO_CATALOGUE, file->file);
return;
}
@ -514,7 +514,7 @@ load_one_catalogue(catalog_file * file)
To look up a parameter entity:
f = find_in_catalogue(CAT_PENTITY, name, pubid, sysid, ci);
The name may begin with a % but need not; if it doesn't
The name may begin with a % but need not; if it doesn't
a % will be prefixed for the search.
If it cannot otherwise be found ${name}.pen will be returned.
@ -635,7 +635,7 @@ find_in_catalogue(int kind,
return 0;
if ( istrlen(name)+4+1 > penlen )
{ gripe(ERC_REPRESENTATION, L"entity name");
{ gripe(NULL, ERC_REPRESENTATION, L"entity name");
return NULL;
}

View File

@ -44,7 +44,7 @@ new_charclass()
char_range(map, 'a', 'z', CH_LCLETTER);
char_range(map, 'A', 'Z', CH_LCLETTER);
char_range(map, '0', '9', CH_DIGIT);
ca['.'] |= CH_CNM;
ca['-'] |= CH_CNM;
ca[183] |= CH_CNM; /* XML */

View File

@ -466,7 +466,7 @@ dtd * new_dtd(const ichar *doctype);
int set_dialect_dtd(dtd *dtd, dtd_dialect dialect);
int set_option_dtd(dtd *dtd, dtd_option option, int set);
void putchar_dtd_parser(dtd_parser *p, int chr);
int putchar_dtd_parser(dtd_parser *p, int chr);
int begin_document_dtd_parser(dtd_parser *p);
int end_document_dtd_parser(dtd_parser *p);
void reset_document_dtd_parser(dtd_parser *p);

View File

@ -27,6 +27,7 @@
#include <string.h>
#include <wchar.h>
#include "dtd.h"
#include "util.h"
#include "prolog.h"
/* String equality test: non-zero iff s and q compare equal with strcmp().
   The whole expansion is parenthesised so that the macro keeps its meaning
   when it is combined with other operators (e.g. `2 * streq(a, b)` or
   `streq(a, b) == flag`); without the outer parentheses the `== 0` would
   be re-associated by operator precedence. */
#define streq(s,q) (strcmp((s), (q)) == 0)
@ -42,10 +43,12 @@ int
main(int argc, char **argv)
{ dtd_dialect dialect = DL_SGML;
init_ring();
program = argv[0];
argv++;
argc--;
while(argc > 0 && argv[0][0] == '-')
{ if ( streq(argv[0], "-xml") )
{ dialect = DL_XML;
@ -63,7 +66,7 @@ main(int argc, char **argv)
if ( argc == 1 )
{ int wl = mbstowcs(NULL, argv[0], 0);
if ( wl > 0 )
{ wchar_t *ws = malloc((wl+1)*sizeof(wchar_t));
dtd *dtd;

View File

@ -3,9 +3,9 @@
Part of SWI-Prolog
Author: Jan Wielemaker
E-mail: jan@swi.psy.uva.nl
E-mail: J.Wielemaker@cs.vu.nl
WWW: http://www.swi-prolog.org
Copyright (C): 1985-2002, University of Amsterdam
Copyright (C): 1985-2009, University of Amsterdam
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@ -32,48 +32,52 @@
int
sgml2pl_error(plerrorid id, ...)
{ term_t except = PL_new_term_ref();
term_t formal = PL_new_term_ref();
term_t swi = PL_new_term_ref();
{ int rc;
term_t except, formal, swi;
va_list args;
char msgbuf[1024];
char *msg = NULL;
if ( !(except = PL_new_term_ref()) ||
!(formal = PL_new_term_ref()) ||
!(swi = PL_new_term_ref()) )
return FALSE;
va_start(args, id);
switch(id)
{ case ERR_ERRNO:
{ int err = va_arg(args, int);
msg = strerror(err);
switch(err)
{ case ENOMEM:
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "resource_error", 1,
PL_CHARS, "no_memory");
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "resource_error", 1,
PL_CHARS, "no_memory");
break;
case EACCES:
{ const char *file = va_arg(args, const char *);
const char *action = va_arg(args, const char *);
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "permission_error", 3,
PL_CHARS, action,
PL_CHARS, "file",
PL_CHARS, file);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "permission_error", 3,
PL_CHARS, action,
PL_CHARS, "file",
PL_CHARS, file);
break;
}
case ENOENT:
{ const char *file = va_arg(args, const char *);
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "existence_error", 2,
PL_CHARS, "file",
PL_CHARS, file);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "existence_error", 2,
PL_CHARS, "file",
PL_CHARS, file);
break;
}
default:
PL_unify_atom_chars(formal, "system_error");
rc = PL_unify_atom_chars(formal, "system_error");
break;
}
break;
@ -84,12 +88,12 @@ sgml2pl_error(plerrorid id, ...)
if ( PL_is_variable(actual) &&
strcmp(expected, "variable") != 0 )
PL_unify_atom_chars(formal, "instantiation_error");
rc = PL_unify_atom_chars(formal, "instantiation_error");
else
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "type_error", 2,
PL_CHARS, expected,
PL_TERM, actual);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "type_error", 2,
PL_CHARS, expected,
PL_TERM, actual);
break;
}
case ERR_DOMAIN:
@ -97,31 +101,31 @@ sgml2pl_error(plerrorid id, ...)
term_t actual = va_arg(args, term_t);
if ( PL_is_variable(actual) )
PL_unify_atom_chars(formal, "instantiation_error");
rc = PL_unify_atom_chars(formal, "instantiation_error");
else
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "domain_error", 2,
PL_CHARS, expected,
PL_TERM, actual);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "domain_error", 2,
PL_CHARS, expected,
PL_TERM, actual);
break;
}
case ERR_EXISTENCE:
{ const char *type = va_arg(args, const char *);
term_t obj = va_arg(args, term_t);
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "existence_error", 2,
PL_CHARS, type,
PL_TERM, obj);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "existence_error", 2,
PL_CHARS, type,
PL_TERM, obj);
break;
}
case ERR_FAIL:
{ term_t goal = va_arg(args, term_t);
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "goal_failed", 1,
PL_TERM, goal);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "goal_failed", 1,
PL_TERM, goal);
break;
}
@ -129,10 +133,10 @@ sgml2pl_error(plerrorid id, ...)
{ const char *limit = va_arg(args, const char *);
long maxval = va_arg(args, long);
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "limit_exceeded", 2,
PL_CHARS, limit,
PL_LONG, maxval);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "limit_exceeded", 2,
PL_CHARS, limit,
PL_LONG, maxval);
break;
}
@ -142,10 +146,10 @@ sgml2pl_error(plerrorid id, ...)
vsprintf(msgbuf, fmt, args);
msg = msgbuf;
PL_unify_term(formal,
PL_FUNCTOR_CHARS, "miscellaneous", 1,
PL_CHARS, id);
rc = PL_unify_term(formal,
PL_FUNCTOR_CHARS, "miscellaneous", 1,
PL_CHARS, id);
break;
}
default:
@ -153,26 +157,29 @@ sgml2pl_error(plerrorid id, ...)
}
va_end(args);
if ( msg )
if ( rc && msg )
{ term_t predterm = PL_new_term_ref();
term_t msgterm = PL_new_term_ref();
if ( msg )
{ PL_put_atom_chars(msgterm, msg);
}
PL_unify_term(swi,
PL_FUNCTOR_CHARS, "context", 2,
PL_TERM, predterm,
PL_TERM, msgterm);
if ( !(predterm = PL_new_term_ref()) ||
!(msgterm = PL_new_term_ref()) ||
!PL_put_atom_chars(msgterm, msg) ||
!PL_unify_term(swi,
PL_FUNCTOR_CHARS, "context", 2,
PL_TERM, predterm,
PL_TERM, msgterm) )
rc = FALSE;
}
PL_unify_term(except,
PL_FUNCTOR_CHARS, "error", 2,
PL_TERM, formal,
PL_TERM, swi);
if ( rc )
rc = PL_unify_term(except,
PL_FUNCTOR_CHARS, "error", 2,
PL_TERM, formal,
PL_TERM, swi);
if ( rc )
return PL_raise_exception(except);
return PL_raise_exception(except);
return FALSE;
}

View File

@ -44,4 +44,3 @@ typedef enum
int sgml2pl_error(plerrorid, ...);
#endif /*H_ERROR_INCLUDED*/

View File

@ -107,7 +107,7 @@ visit(dtd_state *state, visited *visited)
{ if ( visited->states[i] == state )
return FALSE;
}
if ( visited->size >= MAX_VISITED )
{ fprintf(stderr, "Reached MAX_VISITED!\n");
return FALSE;
@ -262,7 +262,7 @@ do_find_omitted_path(dtd_state *state, dtd_element *e,
}
int
int
find_omitted_path(dtd_state *state, dtd_element *e, dtd_element **path)
{ int pl = 0;
visited visited;
@ -314,13 +314,13 @@ static transition *
state_transitions(dtd_state *state)
{ if ( !state->transitions && state->expander )
{ expander *ex = state->expander;
switch(ex->type)
{ case EX_AND:
{ dtd_model_list *left = ex->kind.and.set;
if ( !left ) /* empty AND (should not happen) */
{ link(state, ex->target, NULL);
{ link(state, ex->target, NULL);
} else if ( !left->next ) /* only one left */
{ translate_model(left->model, state, ex->target);
} else
@ -378,7 +378,7 @@ translate_one(dtd_model *m, dtd_state *from, dtd_state *to)
ex->target = to;
ex->type = EX_AND;
for( sub = m->content.group; sub; sub = sub->next )
add_model_list(&ex->kind.and.set, sub);
@ -436,7 +436,7 @@ make_state_engine(dtd_element *e)
{ if ( def->content )
{ def->initial_state = new_dtd_state();
def->final_state = new_dtd_state();
translate_model(def->content, def->initial_state, def->final_state);
} else if ( def->type == C_CDATA || def->type == C_RCDATA )
{ def->initial_state = new_dtd_state();
@ -450,7 +450,7 @@ make_state_engine(dtd_element *e)
return def->initial_state;
}
return NULL;
}
@ -492,7 +492,7 @@ free_expander(expander *e, visited *visited)
static void
do_free_state_engine(dtd_state *state, visited *visited)
{ transition *t, *next;
for(t=state->transitions; t; t=next)
{ next = t->next;

File diff suppressed because it is too large Load Diff

View File

@ -145,6 +145,12 @@ typedef enum
DM_DATA /* Environment has only elements */
} data_mode;
/* What to do if an XML namespace does not exist.

   Defined unconditionally: the xml_no_ns field of the dtd_parser struct
   is declared with this type outside any `#ifdef XMLNS' guard, so hiding
   the typedef behind XMLNS makes this header fail to compile whenever
   XMLNS is not defined. */
typedef enum
{ NONS_ERROR = 0,			/* NOTE(review): presumably report an error */
  NONS_QUIET				/* NOTE(review): presumably stay silent */
} xmlnons;
typedef struct _sgml_environment
{ dtd_element *element; /* element that opened the env */
@ -201,6 +207,10 @@ typedef struct _dtd_parser
dtd_srcloc startcdata; /* Start of last cdata */
dtd_symbol *enforce_outer_element; /* Outer element to look for */
sgml_event_class event_class; /* EV_* */
xmlnons xml_no_ns; /* What if namespace does not exist? */
#ifdef XMLNS
struct _xmlns *xmlns; /* Outer xmlns declaration */
#endif
void *closure; /* client handle */
sgml_begin_element_f on_begin_element; /* start an element */
@ -221,7 +231,7 @@ typedef struct _dtd_parser
#include "xmlns.h"
#endif
extern int gripe(dtd_error_id e, ...);
extern int gripe(dtd_parser *p, dtd_error_id e, ...);
#define SGML_SUB_DOCUMENT 0x1

View File

@ -342,7 +342,7 @@ prolog_print_attribute(dtd_element *e, dtd_attr *at)
printf("list(nutoken)");
break;
}
printf(", "); /* print default */
switch(at->def)
{ case AT_REQUIRED:
@ -427,7 +427,7 @@ prolog_print_element(dtd_element *e, unsigned int flags)
if ( def->excluded )
{ dtd_element_list *el;
for(el = def->excluded; el; el=el->next)
wprintf(L"exclude(%ls, %ls).\n",
atom(e->name->name),
@ -435,7 +435,7 @@ prolog_print_element(dtd_element *e, unsigned int flags)
}
if ( def->included )
{ dtd_element_list *el;
for(el = def->included; el; el=el->next)
wprintf(L"include(%ls, %ls).\n",
atom(e->name->name),

View File

@ -26,14 +26,18 @@
#include <SWI-Prolog.h>
#include <stdlib.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#include HAVE_MALLOC_H
#endif
#include "error.h"
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <wctype.h>
#include "xml_unicode.h"
#include "dtd.h"
#ifdef __WINDOWS__
#define inline __inline
#endif
static atom_t ATOM_iso_latin_1;
static atom_t ATOM_utf8;
@ -86,7 +90,7 @@ room_buf(charbuf *b, size_t room)
b->end = b->bufp + used;
}
return TRUE;
return TRUE;
}
@ -178,11 +182,11 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
if ( inA )
{ for(s = (unsigned char*)inA ; len-- > 0; s++ )
{ int c = *s;
if ( map[c] )
{ if ( !add_str_buf(&buffer, map[c]) )
return FALSE;
changes++;
} else if ( c > maxchr )
{ char buf[10];
@ -190,7 +194,7 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
sprintf(buf, "&#%d;", c);
if ( !add_str_buf(&buffer, buf) )
return FALSE;
changes++;
} else
{ add_char_buf(&buffer, c);
@ -204,11 +208,11 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
} else
{ for( ; len-- > 0; inW++ )
{ int c = *inW;
if ( c <= 0xff && map[c] )
{ if ( !add_str_bufW(&buffer, map[c]) )
return FALSE;
changes++;
} else if ( c > maxchr )
{ char buf[10];
@ -216,13 +220,13 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
sprintf(buf, "&#%d;", c);
if ( !add_str_bufW(&buffer, buf) )
return FALSE;
changes++;
}else
{ add_char_bufW(&buffer, c);
}
}
if ( changes > 0 )
rc = PL_unify_wchars(quoted, PL_ATOM,
used_buf(&buffer)/sizeof(wchar_t),
@ -230,7 +234,7 @@ do_quote(term_t in, term_t quoted, char **map, int maxchr)
else
rc = PL_unify(in, quoted);
}
free_buf(&buffer);
return rc;
@ -321,12 +325,39 @@ xml_quote_cdata(term_t in, term_t out, term_t encoding)
}
/* Test whether character code `c' may start an XML name.

   Codes above 0xff are accepted when either xml_basechar() or
   xml_ideographic() holds for them; all other codes are looked up in
   the class table of `map' and tested for the CH_NMSTART flag.
   Returns non-zero if the character qualifies, 0 otherwise. */
static inline int
is_xml_nmstart(dtd_charclass *map, int c)
{ if ( c > 0xff )
    return ( xml_basechar(c) || xml_ideographic(c) );

  return (map->class[c] & CH_NMSTART);
}
/* Test whether character code `c' may appear inside an XML name.

   Codes above 0xff are accepted when any of xml_basechar(), xml_digit(),
   xml_ideographic(), xml_combining_char() or xml_extender() holds for
   them (tested in that order); all other codes are looked up in the
   class table of `map' and tested for the CH_NAME flag.
   Returns non-zero if the character qualifies, 0 otherwise. */
static inline int
is_xml_chname(dtd_charclass *map, int c)
{ if ( c > 0xff )
    return ( xml_basechar(c) ||
	     xml_digit(c) ||
	     xml_ideographic(c) ||
	     xml_combining_char(c) ||
	     xml_extender(c) );

  return (map->class[c] & CH_NAME);
}
static dtd_charclass *map;
static foreign_t
xml_name(term_t in, term_t encoding)
{ char *ins;
wchar_t *inW;
size_t len;
static dtd_charclass *map;
unsigned int i;
int maxchr;
@ -345,7 +376,7 @@ xml_name(term_t in, term_t encoding)
c = ins[0] & 0xff;
if ( c > maxchr )
return FALSE;
if ( !(map->class[c] & CH_NMSTART) )
return FALSE;
for(i=1; i<len; i++)
@ -360,22 +391,16 @@ xml_name(term_t in, term_t encoding)
if ( PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
{ if ( len == 0 )
return FALSE;
if ( inW[0] > maxchr )
if ( inW[0] > maxchr ||
!is_xml_nmstart(map, inW[0]) )
return FALSE;
if ( inW[0] <= 0xff &&
!(map->class[inW[0]] & CH_NMSTART) )
return FALSE;
if ( inW[0] > 0xff && !iswalpha(inW[0]) )
return FALSE;
for(i=1; i<len; i++)
{ int c = inW[i];
if ( c <= 0xff && !(map->class[c] & CH_NAME) )
return FALSE;
if ( c > 0xff && !iswalnum((wint_t)c) )
if ( c > maxchr ||
!is_xml_chname(map, c) )
return FALSE;
}
@ -386,6 +411,57 @@ xml_name(term_t in, term_t encoding)
}
/* iri_xml_namespace(+IRI, -Namespace, -Localname)

   Split the text of `iri' in two parts.  Starting at the end of the
   text we walk backwards for as long as the preceding character is a
   name character, and then forwards again until we are at a character
   that may start a name (or at the end of the text).  The text before
   that point is unified with `namespace' as an atom; the text from
   that point on is unified with `localname', unless `localname' is 0,
   in which case only the namespace is unified.

   The text is first requested as narrow characters and tested with the
   CH_NAME/CH_NMSTART flags of the shared `map'.  If that request fails
   it is requested as wide characters (passing CVT_EXCEPTION) and tested
   with is_xml_chname()/is_xml_nmstart().  `map' is created on first use.

   Returns TRUE if all unifications succeed and FALSE otherwise. */
static foreign_t
iri_xml_namespace(term_t iri, term_t namespace, term_t localname)
{ char *text;
  pl_wchar_t *wtext;
  size_t len;

  if ( !map )
    map = new_charclass();

  if ( PL_get_nchars(iri, &len, &text, CVT_ATOM|CVT_STRING) )
  { const char *end = &text[len];
    const char *split = end;

    /* back up over the trailing run of name characters */
    for( ; split > text; split-- )
    { if ( !(map->class[split[-1]&0xff] & CH_NAME) )
	break;
    }
    /* a name cannot start with just any name character: skip forward */
    for( ; split < end; split++ )
    { if ( (map->class[split[0]&0xff] & CH_NMSTART) )
	break;
    }

    if ( PL_unify_atom_nchars(namespace, split-text, text) &&
	 ( !localname ||
	   PL_unify_atom_nchars(localname, end-split, split) ) )
      return TRUE;

    return FALSE;
  }

  if ( PL_get_wchars(iri, &len, &wtext, CVT_ATOM|CVT_STRING|CVT_EXCEPTION) )
  { const pl_wchar_t *end = &wtext[len];
    const pl_wchar_t *split = end;

    /* same scan as above, using the wide-character classifiers */
    for( ; split > wtext; split-- )
    { if ( !is_xml_chname(map, split[-1]) )
	break;
    }
    for( ; split < end; split++ )
    { if ( is_xml_nmstart(map, split[0]) )
	break;
    }

    if ( PL_unify_wchars(namespace, PL_ATOM, split-wtext, wtext) &&
	 ( !localname ||
	   PL_unify_wchars(localname, PL_ATOM, end-split, split) ) )
      return TRUE;

    return FALSE;
  }

  return FALSE;
}
/* iri_xml_namespace(+IRI, -Namespace)

   Two-argument variant: delegates to iri_xml_namespace() passing 0 as
   the local name term, which makes that function skip the unification
   of the local name. */
static foreign_t
iri_xml_namespace2(term_t iri, term_t namespace)
{ term_t no_localname = 0;

  return iri_xml_namespace(iri, namespace, no_localname);
}
install_t
@ -398,4 +474,6 @@ install_xml_quote()
PL_register_foreign("xml_quote_attribute", 3, xml_quote_attribute, 0);
PL_register_foreign("xml_quote_cdata", 3, xml_quote_cdata, 0);
PL_register_foreign("xml_name", 2, xml_name, 0);
PL_register_foreign("iri_xml_namespace", 3, iri_xml_namespace, 0);
PL_register_foreign("iri_xml_namespace", 2, iri_xml_namespace2, 0);
}

View File

@ -95,10 +95,10 @@ print_word(dtd_parser * p, char c, /* preceding character */
static void
wprint_escaped(FILE *f, const wchar_t *s, int len)
{ const wchar_t *e = &s[len];
while ( s < e )
{ wint_t x = *s++;
if (x >= ' ')
{ if (x == '\\') /* \ --> \\ */
wputc(x, f);
@ -352,7 +352,7 @@ mb2wc(const char *s)
return ws;
}
perror("mbstowcs");
exit(1);
}

View File

@ -26,7 +26,7 @@ Markup languages are an increasingly important method for
data-representation and exchange. This article documents the package
\pllib{sgml}, a foreign library for SWI-Prolog to parse SGML
and XML documents, returning information on both the document and the
document's DTD. The parser is designed to be small, fast and flexible.
document's DTD. The parser is designed to be small, fast and flexible.
\end{abstract}
\pagebreak
@ -56,17 +56,17 @@ The parser described in this document is small (less than 100 kBytes
executable on a Pentium), fast (between 2 and 5 times faster than SP),
provides access to the DTD, and provides flexible input handling.
The document output is equal to the output produced by \jargon{xml2pl},
The document output is equal to the output produced by \jargon{xml2pl},
an SP interface to SWI-Prolog written by Anjo Anjewierden.
\section{Bluffer's Guide}
This package allows you to parse SGML, XML and HTML data into a Prolog
data structure. The high-level interface defined in \pllib{sgml}
This package allows you to parse SGML, XML and HTML data into a Prolog
data structure. The high-level interface defined in \pllib{sgml}
provides access at the file-level, while the low-level interface defined
in the foreign module works with Prolog streams. Please use the source
of \file{sgml.pl} as a starting point for dealing with data from
in the foreign module works with Prolog streams. Please use the source
of \file{sgml.pl} as a starting point for dealing with data from
other sources than files, such as SWI-Prolog resources, network-sockets,
character strings, \emph{etc.} The first example below loads an HTML file.
@ -123,9 +123,9 @@ This is called `omitted-tag' handling.
].
\end{code}
The document is represented as a list, each element being an atom to
The document is represented as a list, each element being an atom to
represent \const{CDATA} or a term \term{element}{Name, Attributes, Content}.
Entities (e.g. \verb$&lt;$) are expanded and included in the
Entities (e.g. \verb$&lt;$) are expanded and included in the
atom representing the element content or attribute value.%
\footnote{Up to SWI-Prolog 5.4.x, Prolog could not represent
\jargon{wide} characters and entities that did not fit in
@ -141,23 +141,24 @@ self-contained files in SGML, HTML, or XML into a structured term. They
are based on load_structure/3.
\begin{description}
\predicate{load_sgml_file}{2}{+File, -ListOfContent}
Same as \term{load_structure}{File, ListOfContent, [dialect(sgml)]}.
\predicate{load_sgml_file}{2}{+Source, -ListOfContent}
Same as \term{load_structure}{Source, ListOfContent, [dialect(sgml)]}.
\predicate{load_xml_file}{2}{+File, -ListOfContent}
Same as \term{load_structure(File, ListOfContent, [dialect(xml)]}.
\predicate{load_xml_file}{2}{+Source, -ListOfContent}
Same as \term{load_structure}{Source, ListOfContent, [dialect(xml)]}.
\predicate{load_html_file}{2}{+File, -Content}
Load \arg{File} and parse as HTML. Implemented as below. Note that
load_html_file/2 re-uses a cached DTD object as defined by dtd/2. As DTD
objects may be corrupted while loading errornous documents sharing is
undesirable if the documents are not known to be correct. See dtd/2 for
details.
\predicate{load_html_file}{2}{+Source, -Content}
Load \arg{Source} and parse as HTML. \arg{Source} is either the
name of a file or term \term{stream}{Handle}. Implemented as
below. Note that load_html_file/2 re-uses a cached DTD object as defined
by dtd/2. As DTD objects may be corrupted while loading erroneous
documents sharing is undesirable if the documents are not known to be
correct. See dtd/2 for details.
\begin{code}
load_html_file(File, Term) :-
load_html_file(Source, Term) :-
dtd(html, DTD),
load_structure(File, Term,
load_structure(Source, Term,
[ dtd(DTD),
dialect(sgml),
shorttag(false)
@ -171,8 +172,8 @@ load_html_file(File, Term) :-
\subsection{Loading Structured Documents}
SGML or XML files are loaded through the common predicate
load_structure/3. This is a predicate with many options. For
simplicity a number of commonly used shorthands are provided:
load_structure/3. This is a predicate with many options. For
simplicity a number of commonly used shorthands are provided:
load_sgml_file/2, load_xml_file/2, and
load_html_file/2.
@ -184,18 +185,18 @@ Parse \arg{Source} and return the resulting structure in
options controlling the conversion process.
A proper XML document contains only a single toplevel element whose name
matches the document type. Nevertheless, a list is returned for
matches the document type. Nevertheless, a list is returned for
consistency with the representation of element content. The
\arg{ListOfContent} consists of the following types:
\begin{description}
\termitem{\arg{Atom}}{}
Atoms are used to represent \const{CDATA}. Note
Atoms are used to represent \const{CDATA}. Note
this is possible in SWI-Prolog, as there is no length-limit on atoms and
atom garbage collection is provided.
\termitem{element}{Name, ListOfAttributes, ListOfContent}
\arg{Name} is the name of the element. Using SGML, which is
\arg{Name} is the name of the element. Using SGML, which is
case-insensitive, all element names are returned as lowercase atoms.
\arg{ListOfAttributes} is a list of \arg{Name}=\arg{Value} pairs for
@ -209,31 +210,31 @@ integers is supported. \arg{ListOfContent} defines the content for the
element.
\termitem{sdata}{Text}
If an entity with declared content-type \const{SDATA} is encountered, this
If an entity with declared content-type \const{SDATA} is encountered, this
term is returned holding the data in \arg{Text}.
\termitem{ndata}{Text}
If an entity with declared content-type \const{NDATA} is encountered, this
If an entity with declared content-type \const{NDATA} is encountered, this
term is returned holding the data in \arg{Text}.
\termitem{pi}{Text}
If a processing instruction is encountered (\verb$<?...?>$),
\arg{Text} holds the text of the processing instruction. Please note that the
\verb$<?xml ...?>$ instruction is handled internally.
\verb$<?xml ...?>$ instruction is handled internally.
\end{description}
The \arg{Options} list controls the conversion process. Currently
The \arg{Options} list controls the conversion process. Currently
defined options are:
\begin{description}
\termitem{dtd}{?DTD}
Reference to a DTD object. If specified, the \verb$<!DOCTYPE ...>$
declaration is ignored and the document is parsed and validated against
declaration is ignored and the document is parsed and validated against
the provided DTD. If provided as a variable, the created DTD is
returned. See \secref{implicitdtd}.
\termitem{dialect}{+Dialect}
Specify the parsing dialect. Supported are \const{sgml} (default), \const{xml}
Specify the parsing dialect. Supported are \const{sgml} (default), \const{xml}
and \const{xmlns}. See \secref{xml} for details on the differences.
\termitem{shorttag}{+Bool}
@ -272,14 +273,14 @@ Defines (overwrites) an entity definition. At the moment, only
entity options are allowed.
\termitem{file}{+Name}
Sets the name of the file on which errors are reported. Sets the
Sets the name of the file on which errors are reported. Sets the
linenumber to 1.
\termitem{line}{+Line}
Sets the starting line-number for reporting errors.
\termitem{max_errors}{+Max}
Sets the maximum number of errors. If this number is reached, an
Sets the maximum number of errors. If this number is reached, an
exception of the format below is raised. The default is 50. Using
\term{max_errors}{-1} makes the parser continue, no matter how many
errors it encounters.
@ -303,26 +304,26 @@ modes are:
\termitem{space}{sgml}
In SGML, newlines at the start and end of an element are removed.\footnote{In
addition, newlines at the end of lines containing only markup should be
deleted. This is not yet implemented.} This is the default mode for
the SGML dialect.
deleted. This is not yet implemented.} This is the default mode for
the SGML dialect.
\termitem{space}{preserve}
White space is passed literally to the application. This mode leaves all
white space handling to the application. This is the default mode for
the XML dialect.
the XML dialect.
\termitem{space}{default}
In addition to \const{sgml} space-mode, all consequtive white-space is
reduced to a single space-character. This mode canonises all white
space.
In addition to \const{sgml} space-mode, all consecutive white-space is
reduced to a single space-character. This mode canonises all white
space.
\termitem{space}{remove}
In addition to \const{default}, all leading and trailing white-space is
removed from \const{CDATA} objects. If, as a result, the \const{CDATA}
becomes empty, nothing is passed to the application. This mode is
especially handy for processing `data-oriented' documents, such as RDF.
It is not suitable for normal text documents. Consider the HTML
fragment below. When processed in this mode, the spaces between the
In addition to \const{default}, all leading and trailing white-space is
removed from \const{CDATA} objects. If, as a result, the \const{CDATA}
becomes empty, nothing is passed to the application. This mode is
especially handy for processing `data-oriented' documents, such as RDF.
It is not suitable for normal text documents. Consider the HTML
fragment below. When processed in this mode, the spaces between the
three modified words are lost. This mode is not part of any standard;
XML 1.0 allows only \const{default} and \const{preserve}.
@ -333,9 +334,9 @@ Consider adjacent <b>bold</b> <ul>and</ul> <it>italic</it> words.
\subsection{XML documents} \label{sec:xml}
The parser can operate in two modes: \const{sgml} mode and \const{xml} mode, as
defined by the \term{dialect}{Dialect} option. Regardless of this
option, if the first line of the document reads as below, the parser is
The parser can operate in two modes: \const{sgml} mode and \const{xml} mode, as
defined by the \term{dialect}{Dialect} option. Regardless of this
option, if the first line of the document reads as below, the parser is
switched automatically into XML mode.
\begin{code}
@ -346,21 +347,21 @@ Currently switching to XML mode implies:
\begin{itemlist}
\item [XML empty elements]
The construct \verb$<element [attribute...] />$ is recognised as
an empty element.
The construct \verb$<element [attribute...] />$ is recognised as
an empty element.
\item [Predefined entities]
The following entities are predefined: \const{lt} (\verb$<$), \const{gt}
(\verb$>$), \const{amp} (\verb$&$), \const{apos} (\verb$'$)
and \const{quot} (\verb$"$).
(\verb$>$), \const{amp} (\verb$&$), \const{apos} (\verb$'$)
and \const{quot} (\verb$"$).
\item [Case sensitivity]
In XML mode, names are treated case-sensitive, except for the DTD
reserved names (i.e. \exam{ELEMENT}, \emph{etc.}).
In XML mode, names are treated case-sensitive, except for the DTD
reserved names (i.e. \exam{ELEMENT}, \emph{etc.}).
\item [Character classes]
In XML mode, underscores (\verb$_$) and colon (\verb$:$) are
allowed in names.
allowed in names.
\item [White-space handling]
White space mode is set to \const{preserve}. In addition to setting
@ -378,28 +379,28 @@ preserves space, regardless of the default processing mode.
\subsubsection{XML Namespaces} \label{sec:xmlns}
Using the \jargon{dialect} \const{xmlns}, the parser will interpret XML
namespaces. In this case, the names of elements are returned as a term
Using the \jargon{dialect} \const{xmlns}, the parser will interpret XML
namespaces. In this case, the names of elements are returned as a term
of the format
\begin{quote}
\arg{URL}\const{:}\arg{LocalName}
\arg{URL}\const{:}\arg{LocalName}
\end{quote}
If an identifier has no namespace and there is no default namespace it
is returned as a simple atom. If an identifier has a namespace but this
namespace is undeclared, the namespace name rather than the related URL
If an identifier has no namespace and there is no default namespace it
is returned as a simple atom. If an identifier has a namespace but this
namespace is undeclared, the namespace name rather than the related URL
is returned.
Attributes declaring namespaces ({\tt xmlns:<ns>=<url>}) are reported
as if \const{xmlns} were not a defined resource.
In many cases, getting attribute-names as <xmp>\arg{url}:\arg{name}</xmp>
is not desirable. Such terms are hard to unify and sometimes multiple
URLs may be mapped to the same identifier. This may happen due to poor
version management, poor standardisation or because the the application
doesn't care too much about versions. This package defines two
call-backs that can be set using set_sgml_parser/2 to deal
In many cases, getting attribute-names as \arg{url}:\arg{name}
is not desirable. Such terms are hard to unify and sometimes multiple
URLs may be mapped to the same identifier. This may happen due to poor
version management, poor standardisation or because the application
doesn't care too much about versions. This package defines two
call-backs that can be set using set_sgml_parser/2 to deal
with this problem.
The call-back \const{xmlns} is called as XML namespaces are noticed.
@ -428,6 +429,41 @@ load_rdf_xml(File, Term) :-
]).
\end{code}
The library provides iri_xml_namespace/3 to break down an IRI into
its namespace and localname:
\begin{description}
\predicate[det]{iri_xml_namespace}{3}{+IRI, -Namespace, -Localname}
Split an IRI (Unicode URI) into its \arg{Namespace} (an IRI) and
\arg{Localname} (a Unicode XML name, see xml_name/2). The
\arg{Localname} is defined as the longest last part of the IRI that
satisfies the syntax of an XML name. With IRI schemas that are designed
to work with XML namespaces, this will typically break the IRI on the
last \chr{\#} or \chr{/}. Note however that this can produce unexpected
results. E.g., in the example below, one might expect the namespace to
be \url{http://example.com/images\#}, but an XML name cannot start with
a digit.
\begin{code}
?- iri_xml_namespace('http://example.com/images#12345', NS, L).
NS = 'http://example.com/images#12345',
L = ''.
\end{code}
As we see from the example above, the \arg{Localname} can be the empty
atom. Similarly, \arg{Namespace} can be the empty atom if \arg{IRI} is
an XML name. Applications will often have to check for either or both
these conditions. We decided against failing in these conditions because
the application typically wants to know which of the two conditions
(empty namespace or empty localname) holds. This predicate is often used
for generating RDF/XML from an RDF graph.
\predicate[det]{iri_xml_namespace}{2}{+IRI, -Namespace}
Same as iri_xml_namespace/3, but avoids creating an atom for the
\arg{Localname}.
\end{description}
\subsection{DTD-Handling}
The DTD (\textbf{D}ocument \textbf{T}ype \textbf{D}efinition) is a
@ -438,7 +474,7 @@ predicates for handling the DTD.
\begin{description}
\predicate{new_dtd}{2}{+DocType, -DTD}
Creates an empty DTD for the named \arg{DocType}. The returned
Creates an empty DTD for the named \arg{DocType}. The returned
DTD-reference is an opaque term that can be used in the other predicates
of this package.
@ -468,7 +504,7 @@ Define the DTD dialect. Default is \const{sgml}. Using \const{xml} or
\predicate{dtd}{2}{+DocType, -DTD}
Find the DTD representing the indicated \jargon{doctype}. This predicate
uses a cache of DTD objects. If a doctype has no associated dtd, it
uses a cache of DTD objects. If a doctype has no associated dtd, it
searches for a file using the file search path \exam{dtd} using the call:
\begin{code}
@ -488,15 +524,15 @@ parse multiple documents should be restricted to situations where the
documents processed are known to be error-free.
\predicate{dtd_property}{2}{+DTD, ?Property}
This predicate is used to examine the content of a DTD. Property is one
This predicate is used to examine the content of a DTD. Property is one
of:
\begin{description}
\termitem{doctype}{DocType}
An atom representing the document-type defined by this DTD.
An atom representing the document-type defined by this DTD.
\termitem{elements}{ListOfElements}
A list of atoms representing the names of the elements in this DTD.
A list of atoms representing the names of the elements in this DTD.
\termitem{element}{Name, Omit, Content}
The DTD contains an element with the given name. \arg{Omit} is a term of
@ -508,7 +544,7 @@ form:
\begin{description}
\termitem{empty}{}
The element has no content.
The element has no content.
\termitem{cdata}{}
The element contains non-parsed character data. All data up to the
@ -524,30 +560,30 @@ any order.
\termitem{\#pcdata}{}
The element contains parsed character data.
\termitem{\arg{element}} An element with this name.
\termitem{\arg{element}} An element with this name.
\termitem{*}{SubModel}
0 or more appearances.
0 or more appearances.
\termitem{?}{SubModel}
0 or one appearance.
0 or one appearance.
\termitem{+}{SubModel}
1 or more appearances.
1 or more appearances.
\termitem{,}{SubModel1, SubModel2}
\arg{SubModel1} followed by \arg{SubModel2}.
\arg{SubModel1} followed by \arg{SubModel2}.
\termitem{\&}{SubModel1, SubModel2}
\arg{SubModel1} and \arg{SubModel2} in any order.
\arg{SubModel1} and \arg{SubModel2} in any order.
\termitem{\chr{|}}{SubModel1, SubModel2}
\arg{SubModel1} or \arg{SubModel2}.
\arg{SubModel1} or \arg{SubModel2}.
\end{description}
\termitem{attributes}{Element, ListOfAttributes}
\arg{ListOfAttributes} is a list of atoms representing the attributes
of the element \arg{Element}.
\arg{ListOfAttributes} is a list of atoms representing the attributes
of the element \arg{Element}.
\termitem{attribute}{Element, Attribute, Type, Default}
Query an element. \arg{Type} is one of \const{cdata}, \const{entity},
@ -555,34 +591,34 @@ Query an element. \arg{Type} is one of \const{cdata}, \const{entity},
\const{notation}, \const{number} or \const{nutoken}. For DTD types that
allow for a list, the notation \term{list}{Type} is used. Finally, the
DTD construct \verb$(a|b|...)$ is mapped to the term
\term{nameof}{ListOfValues}.
\term{nameof}{ListOfValues}.
\arg{Default} describes the sgml default. It is one of \const{required},
\const{current}, \const{conref} or \const{implied}. If a real default is
present, it is one of \term{default}{Value} or \term{fixed}{Value}.
present, it is one of \term{default}{Value} or \term{fixed}{Value}.
\termitem{entities}{ListOfEntities}
\arg{ListOfEntities} is a list of atoms representing the names of the
defined entities.
\arg{ListOfEntities} is a list of atoms representing the names of the
defined entities.
\termitem{entity}{Name, Value}
\arg{Name} is the name of an entity with given value. Value is one of
\arg{Name} is the name of an entity with given value. Value is one of
\begin{description}
\termitem{\arg{Atom}}{}
If the value is atomic, it represents the literal value of the entity.
If the value is atomic, it represents the literal value of the entity.
\termitem{system}{Url}
\arg{Url} is the URL of the system external entity.
\arg{Url} is the URL of the system external entity.
\termitem{public}{Id, Url}
For external public entities, \arg{Id} is the identifier. If a URL is
provided this is returned in \arg{Url}. Otherwise this argument is
unbound.
For external public entities, \arg{Id} is the identifier. If a URL is
provided this is returned in \arg{Url}. Otherwise this argument is
unbound.
\end{description}
\termitem{notations}{ListOfNotations}
Returns a list holding the names of all \const{NOTATION} declarations.
Returns a list holding the names of all \const{NOTATION} declarations.
\termitem{notation}{Name, Decl}
Unify \arg{Decl} with a list of \term{system}{+File} and/or
@ -592,11 +628,11 @@ Unify \arg{Decl} with a list of \term{system}{+File} and/or
\subsubsection{The DOCTYPE declaration}
As this parser allows for processing partial documents and processing the
As this parser allows for processing partial documents and processing the
DTD separately, the DOCTYPE declaration plays a special role.
If a document has no DOCTYPE declaration, the parser returns a list
holding all elements and CDATA found. If the document has a DOCTYPE
If a document has no DOCTYPE declaration, the parser returns a list
holding all elements and CDATA found. If the document has a DOCTYPE
declaration, the parser will open the element defined in the DOCTYPE as
soon as the first real data is encountered.
@ -632,53 +668,63 @@ elements_in_xml_document(File, Elements) :-
\begin{description}
\predicate{new_sgml_parser}{2}{-Parser, +Options}
Creates a new parser. A parser can be used one or multiple times for
parsing documents or parts thereof. It may be bound to a DTD or the DTD
may be left implicit, in which case it is created from the document
Creates a new parser. A parser can be used one or multiple times for
parsing documents or parts thereof. It may be bound to a DTD or the DTD
may be left implicit, in which case it is created from the document
prologue or parsing is performed without a DTD. Options:
\begin{description}
\termitem{dtd}{?DTD}
If specified with an initialised DTD, this DTD is used for parsing the
document, regardless of the document prologue. If specified as a
variable, a reference to the created DTD is returned. This DTD may be
created from the document prologue or built implicitly from the
document's content.
If specified with an initialised DTD, this DTD is used for parsing the
document, regardless of the document prologue. If specified as a
variable, a reference to the created DTD is returned. This DTD may be
created from the document prologue or built implicitly from the
document's content.
\end{description}
\predicate{free_sgml_parser}{1}{+Parser}
Destroy all resources related to the parser. This does not destroy the
Destroy all resources related to the parser. This does not destroy the
DTD if the parser was created using the \term{dtd}{DTD} option.
\predicate{set_sgml_parser}{2}{+Parser, +Option}
Sets attributes to the parser. Currently defined attributes:
Sets attributes to the parser. Currently defined attributes:
\begin{description}
\termitem{file}{File}
Sets the file for reporting errors and warnings. Sets the line to 1.
Sets the file for reporting errors and warnings. Sets the line to 1.
\termitem{line}{Line}
Sets the current line. Useful if the stream is not at the start of the
(file) object for generating proper line-numbers.
Sets the current line. Useful if the stream is not at the start of the
(file) object for generating proper line-numbers.
\termitem{charpos}{Offset}
Sets the current character location. See also the \term{file}{File}
option.
\termitem{dialect}{Dialect}
Set the markup dialect. Known dialects:
Set the markup dialect. Known dialects:
\begin{description}
\termitem{sgml}{}
The default dialect is to process as SGML. This implies markup is
case-insensitive and standard SGML abbreviation is allowed (abbreviated
attributes and omitted tags).
The default dialect is to process as SGML. This implies markup is
case-insensitive and standard SGML abbreviation is allowed (abbreviated
attributes and omitted tags).
\termitem{xml}{}
This dialect is selected automatically if the processing instruction
\verb$<?xml ...>$ is encountered. See \secref{xml} for details.
\verb$<?xml ...>$ is encountered. See \secref{xml} for details.
\termitem{xmlns}{}
Process file as XML file with namespace support. See \secref{xmlns} for
details. See also the \verb$qualify_attributes$ option below.
\end{description}
\termitem{xmlns}{+URI}
Set the default namespace of the outer environment. This option is
provided to process partial XML content with proper namespace
resolution.
\termitem{xmlns}{+NS, +URI}
Specify a namespace for the outer environment. This option is
provided to process partial XML content with proper namespace
resolution.
\termitem{qualify_attributes}{Boolean}
How to handle unqualified attribute (i.e. without an explicit namespace)
in XML namespace (\const{xmlns}) mode. Default and standard compliant is
@ -715,20 +761,20 @@ sgml_parse/2.
\end{description}
\predicate{get_sgml_parser}{2}{+Parser, -Option}
Retrieve information on the current status of the parser. Notably useful
if the parser is used in the call-back mode. Currently defined options:
Retrieve information on the current status of the parser. Notably useful
if the parser is used in the call-back mode. Currently defined options:
\begin{description}
\termitem{file}{-File}
Current file-name. Note that this may be different from the provided
file if an external entity is being loaded.
Current file-name. Note that this may be different from the provided
file if an external entity is being loaded.
\termitem{line}{-Line}
Line-offset from where the parser started its processing in the file-object.
Line-offset from where the parser started its processing in the file-object.
\termitem{charpos}{-CharPos}
Offset from where the parser started its processing in the file-object.
See \secref{indexaccess}.
Offset from where the parser started its processing in the file-object.
See \secref{indexaccess}.
\termitem{charpos}{-Start, -End}
Character offsets of the start and end of the source processed causing the
@ -736,8 +782,8 @@ current call-back. Used in \program{PceEmacs} for colouring
text in SGML and XML modes.
\termitem{source}{-Stream}
Prolog stream being processed. May be used in the \const{on_begin}, \emph{etc.}
callbacks from sgml_parse/2.
Prolog stream being processed. May be used in the \const{on_begin}, \emph{etc.}
callbacks from sgml_parse/2.
\termitem{dialect}{-Dialect}
Return the current dialect used by the parser (\const{sgml}, \const{xml} or \const{xmlns}).
@ -822,8 +868,8 @@ Input is a stream. A full description of the option-list is below.
\begin{description}
\termitem{document}{+Term}
A variable that will be unified with a list describing the content of
the document (see load_structure/2).
A variable that will be unified with a list describing the content of
the document (see load_structure/2).
\termitem{source}{+Stream}
An input stream that is read. This option \emph{must} be given.
\termitem{content_length}{+Characters}
@ -840,7 +886,7 @@ Default. Parse everything upto the end of the input.
The parser stops after reading the first element. Using
\term{source}{Stream}, this implies reading is stopped as soon
as the element is complete, and another call may be issued on the same
stream to read the next element.
stream to read the next element.
\termitem{content}{}
The value \const{content} is like \const{element} but assumes the
@ -860,9 +906,9 @@ all open elements.
\end{description}
\termitem{max_errors}{+MaxErrors}
Set the maximum number of errors. If this number is exceeded further
writes to the stream will yield an I/O error exception. Printing of
errors is suppressed after reaching this value. The default is 100.
Set the maximum number of errors. If this number is exceeded further
writes to the stream will yield an I/O error exception. Printing of
errors is suppressed after reaching this value. The default is 100.
\termitem{syntax_errors}{+ErrorMode}
Defines how syntax errors are handled.
\begin{description}
@ -875,28 +921,35 @@ Defines how syntax errors are handled.
using print_message/2 with severity
\const{informational}.
\end{description}
\termitem{xml_no_ns}{+Mode}
Error handling if an XML namespace is not defined. Default generates
an error. If \const{quiet}, the error is suppressed. Can be used
together with \term{call}{urlns, Closure} to provide external expansion
of namespaces. See also \secref{xmlns}.
\termitem{call}{+Event, :PredicateName}
Issue call-backs on the specified events. \arg{PredicateName} is the
name of the predicate to call on this event, possibly prefixed with a
Issue call-backs on the specified events. \arg{PredicateName} is the
name of the predicate to call on this event, possibly prefixed with a
module identifier. If the handler throws an exception, parsing is stopped
and sgml_parse/2 re-throws the exception. The defined events are:
\begin{description}
\termitem{begin}{}
An open-tag has been parsed. The named handler is called with three
arguments: \term{\arg{Handler}}{+Tag, +Attributes, +Parser}.
An open-tag has been parsed. The named handler is called with three
arguments: \term{\arg{Handler}}{+Tag, +Attributes, +Parser}.
\termitem{end}{}
A close-tag has been parsed. The named handler is called with two
arguments: \term{\arg{Handler}}{+Tag, +Parser}.
A close-tag has been parsed. The named handler is called with two
arguments: \term{\arg{Handler}}{+Tag, +Parser}.
\termitem{cdata}{}
CDATA has been parsed. The named handler is called with two arguments:
\term{Handler}{+CDATA, +Parser}, where CDATA is an atom
representing the data.
representing the data.
\termitem{pi}{}
A processing instruction has been parsed. The named handler is called
A processing instruction has been parsed. The named handler is called
with two arguments: \term{\arg{Handler}}{+Text, +Parser}, where
\arg{Text} is the text of the processing instruction.
\arg{Text} is the text of the processing instruction.
\termitem{decl}{}
A declaration (\verb$<!...>$) has been read. The named handler is
@ -918,33 +971,33 @@ If this option is present, errors and warnings are not reported using
print_message/3
\termitem{xmlns}{}
When parsing in \const{xmlns} mode, a new namespace declaration is
pushed on the environment. The named handler is called with three
When parsing in \const{xmlns} mode, a new namespace declaration is
pushed on the environment. The named handler is called with three
arguments: \term{\arg{Handler}}{+NameSpace, +URL, +Parser}.
See \secref{xmlns} for details.
See \secref{xmlns} for details.
\termitem{urlns}{}
When parsing in \const{xmlns} mode, this predicate can be used to map a
url into either a canonical URL for this namespace or another internal
identifier. See \secref{xmlns} for details.
When parsing in \const{xmlns} mode, this predicate can be used to map a
url into either a canonical URL for this namespace or another internal
identifier. See \secref{xmlns} for details.
\end{description}
\end{description}
\end{description}
\subsubsection{Partial Parsing}
In some cases, part of a document needs to be parsed. One option is to
use load_structure/2 or one of its variations and extract
the desired elements from the returned structure. This is a clean
solution, especially on small and medium-sized documents. It however is
unsuitable for parsing really big documents. Such documents can only be
In some cases, part of a document needs to be parsed. One option is to
use load_structure/2 or one of its variations and extract
the desired elements from the returned structure. This is a clean
solution, especially on small and medium-sized documents. It however is
unsuitable for parsing really big documents. Such documents can only be
handled with the call-back output interface realised by the
\term{call}{Event, Action} option of sgml_parse/2.
Event-driven processing is not very natural in Prolog.
The SGML2PL library allows for a mixed approach. Consider the case where
we want to process all descriptions from RDF elements in a document. The
code below calls \verb$process_rdf_description(Element)$ on each element
code below calls \verb$process_rdf_description(Element)$ on each element
that is directly inside an RDF element.
\begin{code}
@ -994,26 +1047,28 @@ set_sgml_parser/2 or, for XML, based on the \const{encoding}
attribute of the XML header. The parser reads from SWI-Prolog streams,
which also provide encoding handling. Therefore, there are two modes
for parsing. If the SWI-Prolog stream has encoding \const{octet} (which
is the default for binary streams), the decoder of the SGML parser will
is the default for binary streams), the decoder of the SGML parser will
be used and positions reported by the parser are octet offsets in the
stream. In other cases, the Prolog stream decoder is used and offsets
are character code counts.
\input{xpath.tex}
\section{Processing Indexed Files} \label{sec:indexaccess}
In some cases applications wish to process small portions of large
SGML, XML or RDF files. For example, the \emph{OpenDirectory} project
by Netscape has produced a 90MB RDF file representing the main index.
The parser described here can process this document as a unit, but
loading takes 85 seconds on a Pentium-II 450 and the resulting term
requires about 70MB global stack. One option is to process the entire
document and output it as a Prolog fact-base of RDF triplets, but in
many cases this is undesirable. Another example is a large SGML file
containing online documentation. The application normally wishes to
provide only small portions at a time to the user. Loading the entire
In some cases applications wish to process small portions of large
SGML, XML or RDF files. For example, the \emph{OpenDirectory} project
by Netscape has produced a 90MB RDF file representing the main index.
The parser described here can process this document as a unit, but
loading takes 85 seconds on a Pentium-II 450 and the resulting term
requires about 70MB global stack. One option is to process the entire
document and output it as a Prolog fact-base of RDF triplets, but in
many cases this is undesirable. Another example is a large SGML file
containing online documentation. The application normally wishes to
provide only small portions at a time to the user. Loading the entire
document into memory is then undesirable.
Using the \term{parse}{element} option, we open a file, seek
Using the \term{parse}{element} option, we open a file, seek
(using seek/4) to the position of the element and
read the desired element.
@ -1059,12 +1114,12 @@ rdf_element(Id, Term) :-
\section{External entities}
While processing an SGML document the document may refer to external
data. This occurs in three places: external parameter entities, normal
external entities and the \const{DOCTYPE} declaration. The current version
of this tool deals rather primitively with external data. External
entities can only be loaded from a file and the mapping between the
entity names and the file is done using a \jargon{catalog} file in a
While processing an SGML document the document may refer to external
data. This occurs in three places: external parameter entities, normal
external entities and the \const{DOCTYPE} declaration. The current version
of this tool deals rather primitively with external data. External
entities can only be loaded from a file and the mapping between the
entity names and the file is done using a \jargon{catalog} file in a
format compatible with that used by James Clark's SP Parser,
based on the SGML Open (now OASIS) specification.
@ -1075,23 +1130,23 @@ sgml_register_catalog_file/2 or the environment variable
\begin{description}
\predicate{sgml_register_catalog_file}{2}{+File, +Location}
Register the indicated \arg{File} as a catalog file. \arg{Location} is
either \const{start} or \const{end} and defines whether the catalog is
either \const{start} or \const{end} and defines whether the catalog is
considered first or last. This predicate has no effect if \arg{File} is
already part of the catalog.
If no files are registered using this predicate, the first query on the
If no files are registered using this predicate, the first query on the
catalog examines \env{SGML_CATALOG_FILES} and fills the catalog with
all files in this path.
all files in this path.
\end{description}
Two types of lines are used by this package.
\begin{quote}
\const{DOCTYPE} \arg{doctype} \arg{file} \\
\const{PUBLIC} \exam{"}\arg{Id}\exam{"} \arg{file}
\const{PUBLIC} \exam{"}\arg{Id}\exam{"} \arg{file}
\end{quote}
The specified \arg{file} path is taken relative to the location of the
The specified \arg{file} path is taken relative to the location of the
catalog file. For the \const{DOCTYPE} declaration, \pllib{sgml} first
makes an attempt to resolve the \const{SYSTEM} or \const{PUBLIC}
identifier. If this fails it tries to resolve the \arg{doctype} using
@ -1102,10 +1157,12 @@ where system identifiers must be Universal Resource Indicators, not
local file names. Simple uses of relative URIs will work correctly under
UNIX and Windows.
In the future we will design a call-back mechanism for locating and
processing external entities, so Prolog-based file-location and Prolog
In the future we will design a call-back mechanism for locating and
processing external entities, so Prolog-based file-location and Prolog
resources can be used to store external entities.
\input{pwp.tex}
\section{Writing markup}
\subsection{Writing documents}
@ -1149,14 +1206,14 @@ elements are written using increasing indentation. This introduces
(depending on the mode and defined whitespace handling) CDATA sequences
with only layout between elements when read back in. If \const{false}, no
layout characters are added. As this mode does not need to analyse the
document it is faster and guarantees correct output when read back.
Unfortunately the output is hardly human readable and causes problems
document it is faster and guarantees correct output when read back.
Unfortunately the output is hardly human readable and causes problems
with many editors.
\termitem{indent}{Integer}
Set the initial element indentation. If more than zero, the indent
is written before the document.
\termitem{nsmap}{Map}
Set the initial namespace map. \arg{Map} is a list of
Set the initial namespace map. \arg{Map} is a list of
\arg{Name} = \arg{URI}. This option, together with \const{header} and
\const{indent} is added to use xml_write/3 to generate XML
that is embedded in a larger XML document.
@ -1197,7 +1254,7 @@ values are \const{ascii}, \const{iso_latin_1}, \const{utf8} and
\const{unicode}. Versions with two arguments are provided for backward
compatibility, making the safe \const{ascii} encoding assumption.
\begin{description}
\begin{description}
\predicate{xml_quote_attribute}{3}{+In, -Quoted, +Encoding}
Map the characters that may not appear in XML attributes to entities.
Currently these are \verb$<>&"$.%
@ -1222,8 +1279,8 @@ Assumes \const{ascii} encoding.
Succeed if \arg{In} is an atom or string that satisfies the rules for
a valid XML element or attribute name. As with the other predicates in
this group, if \arg{Encoding} cannot represent one of the characters, this
function fails. It uses a hard-coded table for ASCII-range characters and
iswalpha()/iswalnum() for the first and remaining characters of the name.
function fails. Character classification is based on
\url{http://www.w3.org/TR/2006/REC-xml-20060816}.
\predicate{xml_name}{1}{+In}
Backward compatibility version for xml_name/2. Assumes \const{ascii}
@ -1238,8 +1295,8 @@ Known missing SGML features include
\begin{itemlist}
\item [NOTATION on entities]
Though notation is parsed, notation attributes on external entity
declarations are not handed to the user.
Though notation is parsed, notation attributes on external entity
declarations are not handed to the user.
\item [NOTATION attributes]
SGML notations may have attributes, declared using
\verb$<!ATTLIST #NOTATION name attributes>$. Those data attributes
@ -1261,8 +1318,8 @@ Empty start tags (\verb$<>$), unclosed start tags
(\verb$<a<b$) and unclosed end tags (\verb$</a<b$) are not
supported.
\item [SGML declaration]
The `SGML declaration' is fixed, though most of the parameters are
handled through indirections in the implementation.
The `SGML declaration' is fixed, though most of the parameters are
handled through indirections in the implementation.
\item [The DATATAG feature]
It is regarded as superseded by SHORTREF, which is supported.
(SP does not support it either.)
@ -1276,7 +1333,7 @@ one DTD at the same time. It is not supported.
\end{itemlist}
In XML mode the parser recognises SGML constructs that are not allowed
In XML mode the parser recognises SGML constructs that are not allowed
in XML. Also various extensions of XML over SGML are not yet realised.
In particular, XInclude is not implemented because the designers of
XInclude can't make up their minds whether to base it on elements or
@ -1305,7 +1362,7 @@ refers to the SWI-Prolog `home-directory'.
\section{Acknowledgements}
The Prolog representation for parsed documents is based on the
The Prolog representation for parsed documents is based on the
SWI-Prolog interface to SP by Anjo Anjewierden.
Richard O'Keefe has put a lot of effort testing and providing bug

File diff suppressed because it is too large Load Diff

View File

@ -61,7 +61,7 @@ sgml__utf8_get_char(const char *in, int *chr)
}
*chr = *in;
return (char *)in+1;
}

View File

@ -26,7 +26,6 @@
#define UTIL_H_IMPLEMENTATION
#include "util.h"
#include <unistd.h>
#include <ctype.h>
#include <wctype.h>
#include <stdlib.h>
@ -50,7 +49,7 @@
size_t
istrlen(const ichar *s)
{ size_t len =0;
while(*s++)
len++;
@ -67,7 +66,7 @@ istrdup(const ichar *s)
while(*s)
*d++ = *s++;
*d = 0;
return dup;
} else
{ return NULL;
@ -140,10 +139,10 @@ int
istreq(const ichar *s1, const ichar *s2)
{ /* Advance through both strings in lock-step for as long as s1 has
     characters left and the current characters agree. */
  for( ; *s1 && *s1 == *s2; s1++, s2++ )
    ;

  /* The strings are equal only if both stopped on their terminator. */
  return ( *s1 == 0 && *s2 == 0 ) ? TRUE : FALSE;
}
@ -152,10 +151,10 @@ int
istrncaseeq(const ichar *s1, const ichar *s2, int len)
{ /* Case-insensitive comparison of at most `len` leading characters of
     s1 and s2.  Returns TRUE if they all match (or len <= 0), FALSE on
     the first mismatch.

     The comparison stops as soon as both strings reach their terminator:
     without this check two identical strings shorter than `len` would be
     compared beyond their 0-terminator, reading out of bounds. */
  while( --len >= 0 )
  { if ( towlower(*s1) != towlower(*s2) )
      return FALSE;
    if ( *s1 == 0 )			/* both ended here: equal */
      return TRUE;
    s1++, s2++;
  }

  return TRUE;
}
@ -164,10 +163,10 @@ int
istrprefix(const ichar *pref, const ichar *s)
{ /* Every character of pref must be matched by the character at the
     same position in s; the first disagreement means pref is not a
     prefix of s. */
  for( ; *pref; pref++, s++ )
  { if ( *pref != *s )
      return FALSE;
  }

  /* Reached the end of pref without a mismatch. */
  return TRUE;
}
@ -212,7 +211,7 @@ istrhash(const ichar *t, int tsize)
while(*t)
{ unsigned int c = *t++;
c -= 'a';
value ^= c << (shift & 0xf);
shift ^= c;
@ -231,7 +230,7 @@ istrcasehash(const ichar *t, int tsize)
while(*t)
{ unsigned int c = towlower(*t++); /* case insensitive */
c -= 'a';
value ^= c << (shift & 0xf);
shift ^= c;
@ -301,7 +300,7 @@ __add_icharbuf(icharbuf *buf, int chr)
else
buf->data = sgml_malloc(buf->allocated*sizeof(ichar));
}
buf->data[buf->size++] = chr;
}
@ -349,7 +348,7 @@ init_ocharbuf(ocharbuf *buf)
ocharbuf *
new_ocharbuf()
{ ocharbuf *buf = sgml_malloc(sizeof(*buf));
return init_ocharbuf(buf);
}
@ -436,24 +435,76 @@ empty_ocharbuf(ocharbuf *buf)
*******************************/
#define RINGSIZE 16
static void *ring[RINGSIZE];
static int ringp;
/* Ring of RINGSIZE temporary buffers.  str2ring() and ringallo() store
   each new buffer at index ringp, freeing whatever buffer occupied that
   slot before, and wrap ringp back to 0 when it reaches RINGSIZE. */
typedef struct ring
{ void *ring[RINGSIZE];			/* the buffers; NULL if slot unused */
int ringp;				/* slot that is (re)used next */
} ring;
#ifdef _REENTRANT
#include <pthread.h>
static pthread_key_t ring_key;
/* Release a ring: free every buffer still stored in it and then the
   ring structure itself.  Registered by init_ring() as the destructor
   of the thread-specific key ring_key. */
static void
free_ring(void *ptr)
{ ring *store = ptr;
  int slot;

  for(slot = 0; slot < RINGSIZE; slot++)
  { void *buf = store->ring[slot];

    if ( buf )
    { store->ring[slot] = NULL;
      sgml_free(buf);
    }
  }

  sgml_free(store);
}
/* Return the ring stored under ring_key for the calling thread, creating
   and registering a zero-filled one on first use.

   Returns NULL if the ring cannot be allocated or cannot be registered.
   A ring that could not be registered is released again: it would not be
   found by the next call, so keeping it would allocate (and lose) a new
   ring on every call. */
static ring *
my_ring()
{ ring *r;

  if ( (r=pthread_getspecific(ring_key)) )
    return r;

  if ( !(r = sgml_calloc(1, sizeof(*r))) )
    return NULL;

  if ( pthread_setspecific(ring_key, r) != 0 )
  { sgml_free(r);
    return NULL;
  }

  return r;
}
/* Create the thread-specific key under which my_ring() stores a ring.
   free_ring() is passed as the key's destructor.  The result of the key
   creation is not checked. */
void
init_ring(void)
{
  (void)pthread_key_create(&ring_key, free_ring);
}
#else
/* Build without _REENTRANT: one statically allocated ring serves all
   callers, so my_ring() is just its address and there is nothing to
   initialise. */
static ring ring_store;
#define my_ring() (&ring_store)
void init_ring(void) {}			/* no key to create */
#endif
wchar_t *
str2ring(const wchar_t *in)
{ wchar_t *copy = sgml_malloc((wcslen(in)+1)*sizeof(wchar_t));
{ ring *r;
wchar_t *copy;
if ( !copy )
if ( !(r=my_ring()) ||
!(copy = sgml_malloc((wcslen(in)+1)*sizeof(wchar_t))) )
{ sgml_nomem();
return NULL;
}
wcscpy(copy, in);
if ( ring[ringp] )
sgml_free(ring[ringp]);
ring[ringp++] = copy;
if ( ringp == RINGSIZE )
ringp = 0;
if ( r->ring[r->ringp] )
sgml_free(r->ring[r->ringp]);
r->ring[r->ringp++] = copy;
if ( r->ringp == RINGSIZE )
r->ringp = 0;
return copy;
}
@ -461,13 +512,19 @@ str2ring(const wchar_t *in)
void *
ringallo(size_t size)
{ char *result = sgml_malloc(size);
if ( ring[ringp] )
sgml_free(ring[ringp]);
ring[ringp++] = result;
if ( ringp == RINGSIZE )
ringp = 0;
{ ring *r;
char *result;
if ( !(r=my_ring()) || !(result = sgml_malloc(size)) )
{ sgml_nomem();
return NULL;
}
if ( r->ring[r->ringp] )
sgml_free(r->ring[r->ringp]);
r->ring[r->ringp++] = result;
if ( r->ringp == RINGSIZE )
r->ringp = 0;
return result;
}
@ -529,7 +586,7 @@ wcstoutf8(const wchar_t *in)
{ size++;
}
}
rc = sgml_malloc(size+1);
for(o=rc, s=in; *s; s++)
{ o = utf8_put_char(o, *s);
@ -605,7 +662,7 @@ load_sgml_file_to_charp(const ichar *file, int normalise_rsre, size_t *length)
if ( r )
{ char *s = r;
while(len>0)
{ int n;
@ -652,7 +709,7 @@ load_sgml_file_to_charp(const ichar *file, int normalise_rsre, size_t *length)
if ( last_is_lf )
r2[--len] = '\0'; /* delete last LF */
if ( length )
*length = len;
sgml_free(r);

View File

@ -34,16 +34,16 @@
#include <malloc.h>
#endif
typedef struct
typedef struct
{ int allocated;
int size;
ichar *data;
} icharbuf;
typedef struct
typedef struct
{ int allocated;
int size;
union
union
{ wchar_t *w; /* UCS */
} data;
wchar_t localbuf[256]; /* Initial local store */
@ -98,6 +98,7 @@ void empty_ocharbuf(ocharbuf *buf);
{ buf->data.w[at] = chr; \
}
void init_ring(void);
const wchar_t * str_summary(const wchar_t *s, int len);
wchar_t * str2ring(const wchar_t *in);
void * ringallo(size_t);
@ -107,8 +108,6 @@ ichar * load_sgml_file_to_charp(const ichar *file, int normalise_rsre,
size_t *len);
FILE * wfopen(const wchar_t *name, const char *mode);
void wputs(ichar *s);
#if defined(USE_STRING_FUNCTIONS) && !defined(UTIL_H_IMPLEMENTATION)
#define istrlen(s1) wcslen((s1))

View File

@ -29,8 +29,6 @@
the GNU General Public License.
*/
#include "xml_unicode.h"
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
XML character classification.

View File

@ -29,35 +29,36 @@
#ifdef XMLNS
static xmlns *
xmlns *
xmlns_push(dtd_parser *p, const ichar *ns, const ichar *url)
{ sgml_environment *env = p->environments;
dtd_symbol *n = (*ns ? dtd_add_symbol(p->dtd, ns) : (dtd_symbol *)NULL);
dtd_symbol *u = dtd_add_symbol(p->dtd, url); /* TBD: ochar/ichar */
xmlns *x = sgml_malloc(sizeof(*x));
if ( p->on_xmlns )
(*p->on_xmlns)(p, n, u);
x->name = n;
x->url = u;
if ( env )
{ xmlns *x = sgml_malloc(sizeof(*n));
{ if ( p->on_xmlns )
(*p->on_xmlns)(p, n, u);
x->name = n;
x->url = u;
x->next = env->xmlns;
env->xmlns = x;
return x;
} else
{ x->next = p->xmlns;
p->xmlns = x;
}
return NULL;
return x;
}
void
xmlns_free(sgml_environment *env)
{ xmlns *n, *next;
xmlns_free(xmlns *n)
{ xmlns *next;
for(n = env->xmlns; n; n = next)
for(; n; n = next)
{ next = n->next;
sgml_free(n);
@ -66,16 +67,22 @@ xmlns_free(sgml_environment *env)
xmlns *
xmlns_find(sgml_environment *env, dtd_symbol *ns)
{ for(; env; env = env->parent)
{ xmlns *n;
xmlns_find(dtd_parser *p, dtd_symbol *ns)
{ sgml_environment *env = p->environments;
xmlns *n;
for(n=env->xmlns; n; n = n->next)
for(; env; env = env->parent)
{ for(n=env->xmlns; n; n = n->next)
{ if ( n->name == ns )
return n;
}
}
for (n=p->xmlns; n; n = n->next)
{ if ( n->name == ns )
return n;
}
return NULL;
}
@ -97,7 +104,7 @@ void
update_xmlns(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts)
{ dtd_attr_list *al;
int nschr = p->dtd->charfunc->func[CF_NS]; /* : */
for(al=e->attributes; al; al=al->next)
{ dtd_attr *a = al->attribute;
const ichar *name = a->name->name;
@ -123,7 +130,7 @@ update_xmlns(dtd_parser *p, dtd_element *e, int natts, sgml_attribute *atts)
xmlns_resolve()
Convert a symbol as returned by the XML level-1.0 parser to its namespace
tuple {url}localname. This function is not used internally, but provided
for use from the call-back functions of the parser.
for use from the call-back functions of the parser.
It exploits the stack of namespace-environments managed by the parser
itself (see update_xmlns())
@ -150,7 +157,7 @@ xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id,
if ( istrprefix(L"xml", buf) ) /* XML reserved namespaces */
{ *url = n->name;
return TRUE;
} else if ( (ns = xmlns_find(p->environments, n)) )
} else if ( (ns = xmlns_find(p, n)) )
{ if ( ns->url->name[0] )
*url = ns->url->name;
else
@ -158,7 +165,9 @@ xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id,
return TRUE;
} else
{ *url = n->name; /* undefined namespace */
gripe(ERC_EXISTENCE, L"namespace", n->name);
if ( p->xml_no_ns == NONS_QUIET )
return TRUE;
gripe(p, ERC_EXISTENCE, L"namespace", n->name);
return FALSE;
}
}
@ -195,16 +204,16 @@ xmlns_resolve_element(dtd_parser *p, const ichar **local, const ichar **url)
ichar *o = buf;
const ichar *s;
xmlns *ns;
for(s=id->name; *s; s++)
{ if ( *s == nschr ) /* explicit namespace */
{ dtd_symbol *n;
*o = '\0';
*local = s+1;
n = dtd_add_symbol(dtd, buf);
if ( (ns = xmlns_find(p->environments, n)) )
if ( (ns = xmlns_find(p, n)) )
{ if ( ns->url->name[0] )
*url = ns->url->name;
else
@ -213,17 +222,19 @@ xmlns_resolve_element(dtd_parser *p, const ichar **local, const ichar **url)
return TRUE;
} else
{ *url = n->name; /* undefined namespace */
gripe(ERC_EXISTENCE, "namespace", n->name);
e->thisns = xmlns_push(p, n->name, n->name); /* define implicitly */
if ( p->xml_no_ns == NONS_QUIET )
return TRUE;
gripe(p, ERC_EXISTENCE, L"namespace", n->name);
return FALSE;
}
}
*o++ = *s;
}
*local = id->name;
if ( (ns = xmlns_find(p->environments, NULL)) )
if ( (ns = xmlns_find(p, NULL)) )
{ if ( ns->url->name[0] )
*url = ns->url->name;
else

View File

@ -31,8 +31,9 @@ typedef struct _xmlns
struct _xmlns *next; /* next name */
} xmlns;
void xmlns_free(sgml_environment *env);
xmlns* xmlns_find(sgml_environment *env, dtd_symbol *ns);
void xmlns_free(xmlns *list);
xmlns* xmlns_find(dtd_parser *p, dtd_symbol *ns);
xmlns * xmlns_push(dtd_parser *p, const ichar *ns, const ichar *url);
void update_xmlns(dtd_parser *p, dtd_element *e,
int natts, sgml_attribute *atts);
int xmlns_resolve_attribute(dtd_parser *p, dtd_symbol *id,