480 lines
9.5 KiB
C
480 lines
9.5 KiB
C
/*  $Id$

    Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        wielemak@science.uva.nl
    WWW:           http://www.swi-prolog.org
    Copyright (C): 1985-2007, University of Amsterdam

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
|
|
|
|
#include <SWI-Stream.h> /* encoding */
|
|
#include <SWI-Prolog.h>
|
|
#include <stdlib.h>
|
|
#ifdef HAVE_MALLOC_H
|
|
#include <malloc.h>
|
|
#endif
|
|
#include "error.h"
|
|
#include <errno.h>
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#include <wctype.h>
|
|
#include "xml_unicode.h"
|
|
#include "dtd.h"
|
|
#ifdef __WINDOWS__
|
|
#define inline __inline
|
|
#endif
|
|
|
|
/* Atom handles for the encoding names accepted by get_max_chr().
   They are assigned in install_xml_quote(). */
static atom_t ATOM_iso_latin_1;		/* iso_latin_1 */
static atom_t ATOM_utf8;		/* utf8 */
static atom_t ATOM_unicode;		/* unicode */
static atom_t ATOM_ascii;		/* ascii */
|
|
|
|
/* Number of entries in a quote map: one slot per 8-bit character code. */
#define CHARSET 256
|
|
|
|
/* Growable output buffer.  Data starts out in the embedded `buffer`
   array; room_buf() moves it to heap memory once that array is full. */
typedef struct charbuf
{
  char    buffer[1024];		/* embedded initial storage */
  char   *bufp;			/* start of the data: `buffer` or heap memory */
  char   *end;			/* one past the last byte written */
  size_t  size;			/* capacity, in bytes, of the area at bufp */
} charbuf;
|
|
|
|
|
|
static void
|
|
init_buf(charbuf *b)
|
|
{ b->bufp = b->end = b->buffer;
|
|
b->size = sizeof(b->buffer);
|
|
}
|
|
|
|
|
|
static void
|
|
free_buf(charbuf *b)
|
|
{ if ( b->bufp != b->buffer )
|
|
free(b->bufp);
|
|
}
|
|
|
|
|
|
static int
|
|
room_buf(charbuf *b, size_t room)
|
|
{ size_t used = b->end - b->bufp;
|
|
|
|
if ( room + used > b->size )
|
|
{ if ( b->bufp == b->buffer )
|
|
{ b->size = sizeof(b->buffer)*2;
|
|
if ( !(b->bufp = malloc(b->size)) )
|
|
return sgml2pl_error(ERR_ERRNO);
|
|
|
|
memcpy(b->bufp, b->buffer, used);
|
|
} else
|
|
{ char *ptr;
|
|
|
|
b->size *= 2;
|
|
if ( !(ptr = realloc(b->bufp, b->size)) )
|
|
return sgml2pl_error(ERR_ERRNO);
|
|
b->bufp = ptr;
|
|
}
|
|
b->end = b->bufp + used;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
|
|
static size_t
|
|
used_buf(const charbuf *b)
|
|
{ return b->end - b->bufp;
|
|
}
|
|
|
|
|
|
static int
|
|
add_char_buf(charbuf *b, int chr)
|
|
{ if ( room_buf(b, 1) )
|
|
{ *b->end++ = chr;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
static int
|
|
add_char_bufW(charbuf *b, int chr)
|
|
{ if ( room_buf(b, sizeof(wchar_t)) )
|
|
{ wchar_t *p = (wchar_t*)b->end;
|
|
|
|
*p++ = chr;
|
|
b->end = (char *)p;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
static int
|
|
add_str_buf(charbuf *b, const char *s)
|
|
{ size_t len = strlen(s);
|
|
|
|
if ( room_buf(b, len+1) )
|
|
{ memcpy(b->end, s, len+1);
|
|
b->end += len;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
static int
|
|
add_str_bufW(charbuf *b, const char *s)
|
|
{ size_t len = strlen(s);
|
|
|
|
if ( room_buf(b, len*sizeof(wchar_t)) )
|
|
{ wchar_t *p = (wchar_t*)b->end;
|
|
|
|
while(*s)
|
|
*p++ = *s++;
|
|
b->end = (char *)p;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
|
|
static foreign_t
|
|
do_quote(term_t in, term_t quoted, char **map, int maxchr)
|
|
{ char *inA = NULL;
|
|
wchar_t *inW = NULL;
|
|
size_t len;
|
|
const unsigned char *s;
|
|
charbuf buffer;
|
|
int changes = 0;
|
|
int rc;
|
|
|
|
if ( !PL_get_nchars(in, &len, &inA, CVT_ATOMIC) &&
|
|
!PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
|
|
return sgml2pl_error(ERR_TYPE, "atom", in);
|
|
if ( len == 0 )
|
|
return PL_unify(in, quoted);
|
|
|
|
init_buf(&buffer);
|
|
|
|
if ( inA )
|
|
{ for(s = (unsigned char*)inA ; len-- > 0; s++ )
|
|
{ int c = *s;
|
|
|
|
if ( map[c] )
|
|
{ if ( !add_str_buf(&buffer, map[c]) )
|
|
return FALSE;
|
|
|
|
changes++;
|
|
} else if ( c > maxchr )
|
|
{ char buf[10];
|
|
|
|
sprintf(buf, "&#%d;", c);
|
|
if ( !add_str_buf(&buffer, buf) )
|
|
return FALSE;
|
|
|
|
changes++;
|
|
} else
|
|
{ add_char_buf(&buffer, c);
|
|
}
|
|
}
|
|
|
|
if ( changes > 0 )
|
|
rc = PL_unify_atom_nchars(quoted, used_buf(&buffer), buffer.bufp);
|
|
else
|
|
rc = PL_unify(in, quoted);
|
|
} else
|
|
{ for( ; len-- > 0; inW++ )
|
|
{ int c = *inW;
|
|
|
|
if ( c <= 0xff && map[c] )
|
|
{ if ( !add_str_bufW(&buffer, map[c]) )
|
|
return FALSE;
|
|
|
|
changes++;
|
|
} else if ( c > maxchr )
|
|
{ char buf[10];
|
|
|
|
sprintf(buf, "&#%d;", c);
|
|
if ( !add_str_bufW(&buffer, buf) )
|
|
return FALSE;
|
|
|
|
changes++;
|
|
}else
|
|
{ add_char_bufW(&buffer, c);
|
|
}
|
|
}
|
|
|
|
if ( changes > 0 )
|
|
rc = PL_unify_wchars(quoted, PL_ATOM,
|
|
used_buf(&buffer)/sizeof(wchar_t),
|
|
(wchar_t*)buffer.bufp);
|
|
else
|
|
rc = PL_unify(in, quoted);
|
|
}
|
|
|
|
free_buf(&buffer);
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
static int
|
|
get_max_chr(term_t t, int *maxchr)
|
|
{ atom_t a;
|
|
|
|
if ( PL_get_atom(t, &a) )
|
|
{ if ( a == ATOM_iso_latin_1 )
|
|
*maxchr = 0xff;
|
|
else if ( a == ATOM_utf8 )
|
|
*maxchr = 0x7ffffff;
|
|
else if ( a == ATOM_unicode )
|
|
*maxchr = 0xffff;
|
|
else if ( a == ATOM_ascii )
|
|
*maxchr = 0x7f;
|
|
else
|
|
return sgml2pl_error(ERR_DOMAIN, "encoding", t);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return sgml2pl_error(ERR_TYPE, "atom", t);
|
|
}
|
|
|
|
|
|
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
(*) xml_quote_attribute/3 assumes the attribute is quoted using "" and
does *not* escape '. Although escaping ' with &apos; is valid XML, it is
*not* valid html, and this routine is also used by the html_write
library.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
|
|
|
static foreign_t
|
|
xml_quote_attribute(term_t in, term_t out, term_t encoding)
|
|
{ static char **map;
|
|
int maxchr;
|
|
|
|
if ( !map )
|
|
{ int i;
|
|
|
|
if ( !(map = malloc(CHARSET*sizeof(char*))) )
|
|
return sgml2pl_error(ERR_ERRNO, errno);
|
|
|
|
for(i=0; i<CHARSET; i++)
|
|
map[i] = NULL;
|
|
|
|
map['<'] = "<";
|
|
map['>'] = ">";
|
|
map['&'] = "&";
|
|
/* map['\''] = "'"; See (*) */
|
|
map['"'] = """;
|
|
}
|
|
|
|
if ( !get_max_chr(encoding, &maxchr) )
|
|
return FALSE;
|
|
|
|
return do_quote(in, out, map, maxchr);
|
|
}
|
|
|
|
|
|
static foreign_t
|
|
xml_quote_cdata(term_t in, term_t out, term_t encoding)
|
|
{ static char **map;
|
|
int maxchr;
|
|
|
|
if ( !map )
|
|
{ int i;
|
|
|
|
if ( !(map = malloc(CHARSET*sizeof(char*))) )
|
|
return sgml2pl_error(ERR_ERRNO, errno);
|
|
|
|
for(i=0; i<CHARSET; i++)
|
|
map[i] = NULL;
|
|
|
|
map['<'] = "<";
|
|
map['>'] = ">";
|
|
map['&'] = "&";
|
|
}
|
|
|
|
if ( !get_max_chr(encoding, &maxchr) )
|
|
return FALSE;
|
|
|
|
return do_quote(in, out, map, maxchr);
|
|
}
|
|
|
|
|
|
static inline int
|
|
is_xml_nmstart(dtd_charclass *map, int c)
|
|
{ if ( c <= 0xff )
|
|
{ return (map->class[c] & CH_NMSTART);
|
|
} else
|
|
{ return ( xml_basechar(c) ||
|
|
xml_ideographic(c)
|
|
);
|
|
}
|
|
}
|
|
|
|
|
|
static inline int
|
|
is_xml_chname(dtd_charclass *map, int c)
|
|
{ if ( c <= 0xff )
|
|
{ return (map->class[c] & CH_NAME);
|
|
} else
|
|
{ return ( xml_basechar(c) ||
|
|
xml_digit(c) ||
|
|
xml_ideographic(c) ||
|
|
xml_combining_char(c) ||
|
|
xml_extender(c)
|
|
);
|
|
}
|
|
}
|
|
|
|
/* Character-class table shared by xml_name() and iri_xml_namespace();
   each of them creates it with new_charclass() on first use. */
static dtd_charclass *map;
|
|
|
|
static foreign_t
|
|
xml_name(term_t in, term_t encoding)
|
|
{ char *ins;
|
|
wchar_t *inW;
|
|
size_t len;
|
|
unsigned int i;
|
|
int maxchr;
|
|
|
|
if ( !get_max_chr(encoding, &maxchr) )
|
|
return FALSE;
|
|
|
|
if ( !map )
|
|
map = new_charclass();
|
|
|
|
if ( PL_get_nchars(in, &len, &ins, CVT_ATOMIC) )
|
|
{ int c;
|
|
|
|
if ( len == 0 )
|
|
return FALSE;
|
|
|
|
c = ins[0] & 0xff;
|
|
if ( c > maxchr )
|
|
return FALSE;
|
|
|
|
if ( !(map->class[c] & CH_NMSTART) )
|
|
return FALSE;
|
|
for(i=1; i<len; i++)
|
|
{ c = ins[i] & 0xff;
|
|
|
|
if ( c > maxchr || !(map->class[c] & CH_NAME) )
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
if ( PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
|
|
{ if ( len == 0 )
|
|
return FALSE;
|
|
|
|
if ( inW[0] > maxchr ||
|
|
!is_xml_nmstart(map, inW[0]) )
|
|
return FALSE;
|
|
|
|
for(i=1; i<len; i++)
|
|
{ int c = inW[i];
|
|
|
|
if ( c > maxchr ||
|
|
!is_xml_chname(map, c) )
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
static foreign_t
|
|
iri_xml_namespace(term_t iri, term_t namespace, term_t localname)
|
|
{ char *s;
|
|
pl_wchar_t *w;
|
|
size_t len;
|
|
|
|
if ( !map )
|
|
map = new_charclass();
|
|
|
|
if ( PL_get_nchars(iri, &len, &s, CVT_ATOM|CVT_STRING) )
|
|
{ const char *e = &s[len];
|
|
const char *p = e;
|
|
|
|
while(p>s && (map->class[p[-1]&0xff] & CH_NAME))
|
|
p--;
|
|
while(p<e && !(map->class[p[0]&0xff] & CH_NMSTART))
|
|
p++;
|
|
|
|
if ( !PL_unify_atom_nchars(namespace, p-s, s) )
|
|
return FALSE;
|
|
if ( localname &&
|
|
!PL_unify_atom_nchars(localname, e-p, p) )
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
} else if ( PL_get_wchars(iri, &len, &w, CVT_ATOM|CVT_STRING|CVT_EXCEPTION) )
|
|
{ const pl_wchar_t *e = &w[len];
|
|
const pl_wchar_t *p = e;
|
|
|
|
while(p>w && is_xml_chname(map, p[-1]) )
|
|
p--;
|
|
while(p<e && !is_xml_nmstart(map, p[0]) )
|
|
p++;
|
|
|
|
if ( !PL_unify_wchars(namespace, PL_ATOM, p-w, w) )
|
|
return FALSE;
|
|
if ( localname &&
|
|
!PL_unify_wchars(localname, PL_ATOM, e-p, p) )
|
|
return FALSE;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
static foreign_t
|
|
iri_xml_namespace2(term_t iri, term_t namespace)
|
|
{ return iri_xml_namespace(iri, namespace, 0);
|
|
}
|
|
|
|
|
|
install_t
|
|
install_xml_quote()
|
|
{ ATOM_iso_latin_1 = PL_new_atom("iso_latin_1");
|
|
ATOM_utf8 = PL_new_atom("utf8");
|
|
ATOM_unicode = PL_new_atom("unicode");
|
|
ATOM_ascii = PL_new_atom("ascii");
|
|
|
|
PL_register_foreign("xml_quote_attribute", 3, xml_quote_attribute, 0);
|
|
PL_register_foreign("xml_quote_cdata", 3, xml_quote_cdata, 0);
|
|
PL_register_foreign("xml_name", 2, xml_name, 0);
|
|
PL_register_foreign("iri_xml_namespace", 3, iri_xml_namespace, 0);
|
|
PL_register_foreign("iri_xml_namespace", 2, iri_xml_namespace2, 0);
|
|
}
|