yap-6.3/packages/sgml/quote.c

/*  $Id$

    Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        wielemak@science.uva.nl
    WWW:           http://www.swi-prolog.org
    Copyright (C): 1985-2007, University of Amsterdam

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
    version 2.1 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <SWI-Stream.h>			/* encoding */
#include <SWI-Prolog.h>
#include <stdlib.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include "error.h"
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <wctype.h>
#include "dtd.h"

/* Atoms naming the supported encodings.  They are created once in
   install_xml_quote() and compared against user input in get_max_chr().
*/
static atom_t ATOM_iso_latin_1;
static atom_t ATOM_utf8;
static atom_t ATOM_unicode;
static atom_t ATOM_ascii;

/* Number of entries in the per-character replacement tables built by
   xml_quote_attribute() and xml_quote_cdata().
*/
#define CHARSET 256

/* Growable output buffer.  It starts out using the inline `buffer`
   array (see init_buf()) and moves to heap storage when room_buf()
   needs more space than is currently available.
*/
typedef struct charbuf
{ char     buffer[1024];		/* inline storage used initially */
  char    *bufp;			/* start of the active storage: `buffer` or a heap block */
  char    *end;				/* next byte to write; end-bufp is the used size */
  size_t   size;			/* capacity in bytes of the active storage */
} charbuf;


/* Put `b` in the empty state: the inline array is the active storage
   and no bytes are in use.
*/
static void
init_buf(charbuf *b)
{ b->size = sizeof(b->buffer);
  b->bufp = b->buffer;
  b->end  = b->bufp;
}


/* Release the storage of `b` when it was moved to the heap.  The inline
   array is part of the struct itself and is never handed to free().
*/
static void
free_buf(charbuf *b)
{ char *storage = b->bufp;

  if ( storage == b->buffer )
    return;

  free(storage);
}


/* Make sure at least `room` more bytes can be written at b->end.

   The capacity is doubled as often as needed to satisfy the request
   (a single doubling is not enough for requests larger than the current
   capacity).  The first growth copies the inline array to a freshly
   allocated heap block; later growths use realloc().

   Returns TRUE on success.  On failure the buffer is left untouched
   (still valid and still owned by the caller, who must call free_buf())
   and the result of sgml2pl_error(ERR_ERRNO, errno) is returned.
*/
static int
room_buf(charbuf *b, size_t room)
{ size_t used = b->end - b->bufp;

  if ( room > b->size - used )		/* used <= size, so no wrap-around */
  { size_t newsize = b->size;
    char *ptr;

    do
    { if ( newsize > (size_t)-1 / 2 )	/* doubling would overflow size_t */
      { errno = ENOMEM;
	return sgml2pl_error(ERR_ERRNO, errno);
      }
      newsize *= 2;
    } while ( room > newsize - used );

    if ( b->bufp == b->buffer )
    { if ( !(ptr = malloc(newsize)) )
	return sgml2pl_error(ERR_ERRNO, errno);

      memcpy(ptr, b->buffer, used);
    } else
    { if ( !(ptr = realloc(b->bufp, newsize)) )
	return sgml2pl_error(ERR_ERRNO, errno);
    }

    /* Commit only after the allocation succeeded */
    b->bufp = ptr;
    b->size = newsize;
    b->end  = ptr + used;
  }

  return TRUE;
}


/* Number of bytes currently stored in `b`. */
static size_t
used_buf(const charbuf *b)
{ const char *first = b->bufp;
  const char *past  = b->end;

  return past - first;
}


/* Append the single byte `chr` to `b`.  Returns TRUE on success and
   FALSE when room_buf() could not provide space.
*/
static int
add_char_buf(charbuf *b, int chr)
{ if ( !room_buf(b, 1) )
    return FALSE;

  b->end[0] = chr;
  b->end++;

  return TRUE;
}


/* Append `chr` to `b` as one wchar_t.  Returns TRUE on success and
   FALSE when room_buf() could not provide space.
*/
static int
add_char_bufW(charbuf *b, int chr)
{ wchar_t *slot;

  if ( !room_buf(b, sizeof(wchar_t)) )
    return FALSE;

  slot = (wchar_t*)b->end;		/* fetch after room_buf(): storage may move */
  slot[0] = chr;
  b->end = (char *)(slot+1);

  return TRUE;
}


/* Append the 0-terminated string `s` to `b`.  The terminating 0 is
   copied as well but not counted as used, so the next append overwrites
   it.  Returns TRUE on success, FALSE when room_buf() failed.
*/
static int
add_str_buf(charbuf *b, const char *s)
{ size_t n = strlen(s);

  if ( !room_buf(b, n+1) )
    return FALSE;

  memcpy(b->end, s, n+1);
  b->end += n;

  return TRUE;
}


/* Append the 0-terminated narrow string `s` to `b`, widening every
   char to a wchar_t.  No terminator is written.  Returns TRUE on
   success, FALSE when room_buf() failed.
*/
static int
add_str_bufW(charbuf *b, const char *s)
{ size_t n = strlen(s);
  size_t i;
  wchar_t *out;

  if ( !room_buf(b, n*sizeof(wchar_t)) )
    return FALSE;

  out = (wchar_t*)b->end;		/* fetch after room_buf(): storage may move */
  for(i=0; i<n; i++)
    out[i] = s[i];
  b->end = (char *)(out+n);

  return TRUE;
}


/* Unify `quoted` with the text of `in` after replacing characters.

   in:      atomic term holding the text, fetched either as narrow
	    characters or, if that fails, as wide characters.
   quoted:  term unified with the result.
   map:     table of CHARSET entries indexed by character code; a
	    non-NULL entry is the replacement string for that character.
   maxchr:  characters with a code above this value that have no map
	    entry are written as a numeric character reference "&#N;".

   If the text is empty or nothing had to be replaced, `quoted` is
   unified with `in` itself rather than with a fresh atom.

   Returns the result of the unification, the result of sgml2pl_error()
   if `in` is not atomic text, or FALSE if the output buffer could not
   be extended.  The output buffer is released on every path after it
   has been initialised.
*/
static foreign_t
do_quote(term_t in, term_t quoted, char **map, int maxchr)
{ char *inA = NULL;
  wchar_t *inW = NULL;
  size_t len;
  const unsigned  char *s;
  charbuf buffer;
  int changes = 0;
  int rc = FALSE;
  char ref[32];				/* "&#" + any int in decimal + ";" + 0 */

  if ( !PL_get_nchars(in, &len, &inA, CVT_ATOMIC) &&
       !PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
    return sgml2pl_error(ERR_TYPE, "atom", in);
  if ( len == 0 )
    return PL_unify(in, quoted);

  init_buf(&buffer);

  if ( inA )
  { for(s = (unsigned char*)inA ; len-- > 0; s++ )
    { int c = *s;			/* 0..255: always a valid map index */

      if ( map[c] )
      { if ( !add_str_buf(&buffer, map[c]) )
	  goto out;

	changes++;
      } else if ( c > maxchr )
      { snprintf(ref, sizeof(ref), "&#%d;", c);
	if ( !add_str_buf(&buffer, ref) )
	  goto out;

	changes++;
      } else
      { if ( !add_char_buf(&buffer, c) )
	  goto out;
      }
    }

    if ( changes > 0 )
      rc = PL_unify_atom_nchars(quoted, used_buf(&buffer), buffer.bufp);
    else
      rc = PL_unify(in, quoted);
  } else
  { for( ; len-- > 0; inW++ )
    { int c = *inW;

      /* Only codes 0..255 index the map; wchar_t may be a signed type */
      if ( c >= 0 && c <= 0xff && map[c] )
      { if ( !add_str_bufW(&buffer, map[c]) )
	  goto out;

	changes++;
      } else if ( c > maxchr )
      { /* A code point such as U+10FFFF needs 11 bytes here, so the
	   reference is formatted into a buffer sized for any int.
	*/
	snprintf(ref, sizeof(ref), "&#%d;", c);
	if ( !add_str_bufW(&buffer, ref) )
	  goto out;

	changes++;
      } else
      { if ( !add_char_bufW(&buffer, c) )
	  goto out;
      }
    }

    if ( changes > 0 )
      rc = PL_unify_wchars(quoted, PL_ATOM,
			   used_buf(&buffer)/sizeof(wchar_t),
			   (wchar_t*)buffer.bufp);
    else
      rc = PL_unify(in, quoted);
  }

out:
  free_buf(&buffer);

  return rc;
}


/* Translate the encoding atom in `t` into the highest character code
   that encoding can represent and store it in *maxchr.

   Returns TRUE on success.  Otherwise returns the result of
   sgml2pl_error(): a type error if `t` is not an atom, a domain error
   if the atom is not one of iso_latin_1, utf8, unicode or ascii.  In
   both error cases *maxchr is left untouched.

   NOTE(review): the utf8 limit 0x7ffffff has seven hex digits;
   0x7fffffff may have been intended -- confirm.
*/
static int
get_max_chr(term_t t, int *maxchr)
{ atom_t enc;
  int limit;

  if ( !PL_get_atom(t, &enc) )
    return sgml2pl_error(ERR_TYPE, "atom", t);

  if ( enc == ATOM_iso_latin_1 )
    limit = 0xff;
  else if ( enc == ATOM_utf8 )
    limit = 0x7ffffff;
  else if ( enc == ATOM_unicode )
    limit = 0xffff;
  else if ( enc == ATOM_ascii )
    limit = 0x7f;
  else
    return sgml2pl_error(ERR_DOMAIN, "encoding", t);

  *maxchr = limit;

  return TRUE;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
(*) xml_quote_attribute/3 assumes the attribute is   quoted using "" and
does *not* escape '. Although escaping ' with &apos; is valid XML, it is
*not* valid html, and this  routine  is   also  used  by  the html_write
library.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* xml_quote_attribute(+In, -Out, +Encoding)

   Quote `in` for use as a double-quoted attribute value: <, >, & and "
   are replaced by entities and characters that `encoding` cannot
   represent by numeric character references (see do_quote()).  The
   replacement table is built on the first call and kept for later ones.
*/
static foreign_t
xml_quote_attribute(term_t in, term_t out, term_t encoding)
{ static char **quote_map;
  int maxchr;

  if ( quote_map == NULL )
  { char **slot;

    if ( !(quote_map = malloc(CHARSET*sizeof(char*))) )
      return sgml2pl_error(ERR_ERRNO, errno);

    for(slot = quote_map; slot < quote_map+CHARSET; slot++)
      *slot = NULL;

    /* ' is deliberately left unmapped; see (*) */
    quote_map['"']  = "&quot;";
    quote_map['&']  = "&amp;";
    quote_map['<']  = "&lt;";
    quote_map['>']  = "&gt;";
  }

  if ( get_max_chr(encoding, &maxchr) )
    return do_quote(in, out, quote_map, maxchr);

  return FALSE;
}


/* xml_quote_cdata(+In, -Out, +Encoding)

   Quote `in` for use as element content: <, > and & are replaced by
   entities and characters that `encoding` cannot represent by numeric
   character references (see do_quote()).  The replacement table is
   built on the first call and kept for later ones.
*/
static foreign_t
xml_quote_cdata(term_t in, term_t out, term_t encoding)
{ static char **quote_map;
  int maxchr;

  if ( quote_map == NULL )
  { char **slot;

    if ( !(quote_map = malloc(CHARSET*sizeof(char*))) )
      return sgml2pl_error(ERR_ERRNO, errno);

    for(slot = quote_map; slot < quote_map+CHARSET; slot++)
      *slot = NULL;

    quote_map['&']  = "&amp;";
    quote_map['<']  = "&lt;";
    quote_map['>']  = "&gt;";
  }

  if ( get_max_chr(encoding, &maxchr) )
    return do_quote(in, out, quote_map, maxchr);

  return FALSE;
}


/* xml_name(+In, +Encoding)

   Succeeds if the text of `in` is non-empty, every character can be
   represented in `encoding` (code <= the limit from get_max_chr()),
   the first character is a name-start character and all following
   characters are name characters.

   Characters up to 0xff are classified with the CH_NMSTART / CH_NAME
   bits of a character class table created on the first call;  wide
   characters above 0xff are classified with iswalpha() (first
   character) and iswalnum() (remaining characters).

   Returns TRUE or FALSE; an invalid `encoding` yields the error result
   of get_max_chr().
*/
static foreign_t
xml_name(term_t in, term_t encoding)
{ char *ins;
  wchar_t *inW;
  size_t len;
  static dtd_charclass *map;
  size_t i;
  int maxchr;

  if ( !get_max_chr(encoding, &maxchr) )
    return FALSE;

  if ( !map )
    map = new_charclass();

  if ( PL_get_nchars(in, &len, &ins, CVT_ATOMIC) )
  { int c;

    if ( len == 0 )
      return FALSE;

    c = ins[0] & 0xff;			/* mask: plain char may be signed */
    if ( c > maxchr )
      return FALSE;

    if ( !(map->class[c] & CH_NMSTART) )
      return FALSE;
    for(i=1; i<len; i++)
    { c = ins[i] & 0xff;

      if ( c > maxchr || !(map->class[c] & CH_NAME) )
	return FALSE;
    }

    return TRUE;
  }
  if ( PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )
  { if ( len == 0 )
      return FALSE;

    if ( inW[0] > maxchr )
      return FALSE;

    if ( inW[0] <= 0xff &&
	 !(map->class[inW[0]] & CH_NMSTART) )
      return FALSE;
    if ( inW[0] > 0xff && !iswalpha(inW[0]) )
      return FALSE;

    for(i=1; i<len; i++)
    { int c = inW[i];

      /* Apply the encoding limit to every character, as the narrow
	 branch and the first-character test already do.
      */
      if ( c > maxchr )
	return FALSE;
      if ( c <= 0xff && !(map->class[c] & CH_NAME) )
	return FALSE;
      if ( c > 0xff && !iswalnum((wint_t)c) )
	return FALSE;
    }

    return TRUE;
  }

  return FALSE;
}


install_t
install_xml_quote()
{ ATOM_iso_latin_1 = PL_new_atom("iso_latin_1");
  ATOM_utf8        = PL_new_atom("utf8");
  ATOM_unicode     = PL_new_atom("unicode");
  ATOM_ascii       = PL_new_atom("ascii");

  PL_register_foreign("xml_quote_attribute", 3, xml_quote_attribute, 0);
  PL_register_foreign("xml_quote_cdata",     3, xml_quote_cdata,     0);
  PL_register_foreign("xml_name",            2, xml_name,            0);
}
Experiment with porting SGML to YAP, and trying to preserve SWI code as much as possible. 2009-03-13 19:39:06 +00:00			`/* $Id$`

			`Part of SWI-Prolog`

			`Author: Jan Wielemaker`
			`E-mail: wielemak@science.uva.nl`
			`WWW: http://www.swi-prolog.org`
			`Copyright (C): 1985-2007, University of Amsterdam`

			`This library is free software; you can redistribute it and/or`
			`modify it under the terms of the GNU Lesser General Public`
			`License as published by the Free Software Foundation; either`
			`version 2.1 of the License, or (at your option) any later version.`

			`This library is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`Lesser General Public License for more details.`

			`You should have received a copy of the GNU Lesser General Public`
			`License along with this library; if not, write to the Free Software`
			`Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`
			`*/`

			`#include <SWI-Stream.h> /* encoding */`
			`#include <SWI-Prolog.h>`
			`#include <stdlib.h>`
			`#ifdef HAVE_MALLOC_H`
			`#include <malloc.h>`
			`#endif`
			`#include "error.h"`
			`#include <errno.h>`
			`#include <string.h>`
			`#include <stdio.h>`
			`#include <wctype.h>`
			`#include "dtd.h"`

			`static atom_t ATOM_iso_latin_1;`
			`static atom_t ATOM_utf8;`
			`static atom_t ATOM_unicode;`
			`static atom_t ATOM_ascii;`

			`#define CHARSET 256`

			`typedef struct charbuf`
			`{ char buffer[1024];`
			`char *bufp;`
			`char *end;`
			`size_t size;`
			`} charbuf;`


			`static void`
			`init_buf(charbuf *b)`
			`{ b->bufp = b->end = b->buffer;`
			`b->size = sizeof(b->buffer);`
			`}`


			`static void`
			`free_buf(charbuf *b)`
			`{ if ( b->bufp != b->buffer )`
			`free(b->bufp);`
			`}`


			`static int`
			`room_buf(charbuf *b, size_t room)`
			`{ size_t used = b->end - b->bufp;`

			`if ( room + used > b->size )`
			`{ if ( b->bufp == b->buffer )`
			`{ b->size = sizeof(b->buffer)*2;`
			`if ( !(b->bufp = malloc(b->size)) )`
			`return sgml2pl_error(ERR_ERRNO);`

			`memcpy(b->bufp, b->buffer, used);`
			`} else`
			`{ char *ptr;`

			`b->size *= 2;`
			`if ( !(ptr = realloc(b->bufp, b->size)) )`
			`return sgml2pl_error(ERR_ERRNO);`
			`b->bufp = ptr;`
			`}`
			`b->end = b->bufp + used;`
			`}`

			`return TRUE;`
			`}`


			`static size_t`
			`used_buf(const charbuf *b)`
			`{ return b->end - b->bufp;`
			`}`


			`static int`
			`add_char_buf(charbuf *b, int chr)`
			`{ if ( room_buf(b, 1) )`
			`{ *b->end++ = chr;`

			`return TRUE;`
			`}`

			`return FALSE;`
			`}`


			`static int`
			`add_char_bufW(charbuf *b, int chr)`
			`{ if ( room_buf(b, sizeof(wchar_t)) )`
			`{ wchar_t p = (wchar_t)b->end;`

			`*p++ = chr;`
			`b->end = (char *)p;`

			`return TRUE;`
			`}`

			`return FALSE;`
			`}`


			`static int`
			`add_str_buf(charbuf b, const char s)`
			`{ size_t len = strlen(s);`

			`if ( room_buf(b, len+1) )`
			`{ memcpy(b->end, s, len+1);`
			`b->end += len;`

			`return TRUE;`
			`}`

			`return FALSE;`
			`}`


			`static int`
			`add_str_bufW(charbuf b, const char s)`
			`{ size_t len = strlen(s);`

			`if ( room_buf(b, len*sizeof(wchar_t)) )`
			`{ wchar_t p = (wchar_t)b->end;`

			`while(*s)`
			`p++ = s++;`
			`b->end = (char *)p;`

			`return TRUE;`
			`}`

			`return FALSE;`
			`}`



			`static foreign_t`
			`do_quote(term_t in, term_t quoted, char **map, int maxchr)`
			`{ char *inA = NULL;`
			`wchar_t *inW = NULL;`
			`size_t len;`
			`const unsigned char *s;`
			`charbuf buffer;`
			`int changes = 0;`
			`int rc;`

			`if ( !PL_get_nchars(in, &len, &inA, CVT_ATOMIC) &&`
			`!PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )`
			`return sgml2pl_error(ERR_TYPE, "atom", in);`
			`if ( len == 0 )`
			`return PL_unify(in, quoted);`

			`init_buf(&buffer);`

			`if ( inA )`
			`{ for(s = (unsigned char*)inA ; len-- > 0; s++ )`
			`{ int c = *s;`

			`if ( map[c] )`
			`{ if ( !add_str_buf(&buffer, map[c]) )`
			`return FALSE;`

			`changes++;`
			`} else if ( c > maxchr )`
			`{ char buf[10];`

			`sprintf(buf, "&#%d;", c);`
			`if ( !add_str_buf(&buffer, buf) )`
			`return FALSE;`

			`changes++;`
			`} else`
			`{ add_char_buf(&buffer, c);`
			`}`
			`}`

			`if ( changes > 0 )`
			`rc = PL_unify_atom_nchars(quoted, used_buf(&buffer), buffer.bufp);`
			`else`
			`rc = PL_unify(in, quoted);`
			`} else`
			`{ for( ; len-- > 0; inW++ )`
			`{ int c = *inW;`

			`if ( c <= 0xff && map[c] )`
			`{ if ( !add_str_bufW(&buffer, map[c]) )`
			`return FALSE;`

			`changes++;`
			`} else if ( c > maxchr )`
			`{ char buf[10];`

			`sprintf(buf, "&#%d;", c);`
			`if ( !add_str_bufW(&buffer, buf) )`
			`return FALSE;`

			`changes++;`
			`}else`
			`{ add_char_bufW(&buffer, c);`
			`}`
			`}`

			`if ( changes > 0 )`
			`rc = PL_unify_wchars(quoted, PL_ATOM,`
			`used_buf(&buffer)/sizeof(wchar_t),`
			`(wchar_t*)buffer.bufp);`
			`else`
			`rc = PL_unify(in, quoted);`
			`}`

			`free_buf(&buffer);`

			`return rc;`
			`}`


			`static int`
			`get_max_chr(term_t t, int *maxchr)`
			`{ atom_t a;`

			`if ( PL_get_atom(t, &a) )`
			`{ if ( a == ATOM_iso_latin_1 )`
			`*maxchr = 0xff;`
			`else if ( a == ATOM_utf8 )`
			`*maxchr = 0x7ffffff;`
			`else if ( a == ATOM_unicode )`
			`*maxchr = 0xffff;`
			`else if ( a == ATOM_ascii )`
			`*maxchr = 0x7f;`
			`else`
			`return sgml2pl_error(ERR_DOMAIN, "encoding", t);`

			`return TRUE;`
			`}`

			`return sgml2pl_error(ERR_TYPE, "atom", t);`
			`}`


			`/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -`
			`(*) xml_quote_attribute/3 assumes the attribute is quoted using "" and`
			`does not escape '. Although escaping ' with ' is valid XML, it is`
			`not valid html, and this routine is also used by the html_write`
			`library.`
			`- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */`

			`static foreign_t`
			`xml_quote_attribute(term_t in, term_t out, term_t encoding)`
			`{ static char **map;`
			`int maxchr;`

			`if ( !map )`
			`{ int i;`

			`if ( !(map = malloc(CHARSETsizeof(char))) )`
			`return sgml2pl_error(ERR_ERRNO, errno);`

			`for(i=0; i<CHARSET; i++)`
			`map[i] = NULL;`

			`map['<'] = "<";`
			`map['>'] = ">";`
			`map['&'] = "&";`
			`/* map['\''] = "'"; See () /`
			`map['"'] = """;`
			`}`

			`if ( !get_max_chr(encoding, &maxchr) )`
			`return FALSE;`

			`return do_quote(in, out, map, maxchr);`
			`}`


			`static foreign_t`
			`xml_quote_cdata(term_t in, term_t out, term_t encoding)`
			`{ static char **map;`
			`int maxchr;`

			`if ( !map )`
			`{ int i;`

			`if ( !(map = malloc(CHARSETsizeof(char))) )`
			`return sgml2pl_error(ERR_ERRNO, errno);`

			`for(i=0; i<CHARSET; i++)`
			`map[i] = NULL;`

			`map['<'] = "<";`
			`map['>'] = ">";`
			`map['&'] = "&";`
			`}`

			`if ( !get_max_chr(encoding, &maxchr) )`
			`return FALSE;`

			`return do_quote(in, out, map, maxchr);`
			`}`


			`static foreign_t`
			`xml_name(term_t in, term_t encoding)`
			`{ char *ins;`
			`wchar_t *inW;`
			`size_t len;`
			`static dtd_charclass *map;`
			`unsigned int i;`
			`int maxchr;`

			`if ( !get_max_chr(encoding, &maxchr) )`
			`return FALSE;`

			`if ( !map )`
			`map = new_charclass();`

			`if ( PL_get_nchars(in, &len, &ins, CVT_ATOMIC) )`
			`{ int c;`

			`if ( len == 0 )`
			`return FALSE;`

			`c = ins[0] & 0xff;`
			`if ( c > maxchr )`
			`return FALSE;`

			`if ( !(map->class[c] & CH_NMSTART) )`
			`return FALSE;`
			`for(i=1; i<len; i++)`
			`{ c = ins[i] & 0xff;`

			`if ( c > maxchr \|\| !(map->class[c] & CH_NAME) )`
			`return FALSE;`
			`}`

			`return TRUE;`
			`}`
			`if ( PL_get_wchars(in, &len, &inW, CVT_ATOMIC) )`
			`{ if ( len == 0 )`
			`return FALSE;`

			`if ( inW[0] > maxchr )`
			`return FALSE;`

			`if ( inW[0] <= 0xff &&`
			`!(map->class[inW[0]] & CH_NMSTART) )`
			`return FALSE;`
			`if ( inW[0] > 0xff && !iswalpha(inW[0]) )`
			`return FALSE;`

			`for(i=1; i<len; i++)`
			`{ int c = inW[i];`

			`if ( c <= 0xff && !(map->class[c] & CH_NAME) )`
			`return FALSE;`
			`if ( c > 0xff && !iswalnum((wint_t)c) )`
			`return FALSE;`
			`}`

			`return TRUE;`
			`}`

			`return FALSE;`
			`}`




			`install_t`
			`install_xml_quote()`
			`{ ATOM_iso_latin_1 = PL_new_atom("iso_latin_1");`
			`ATOM_utf8 = PL_new_atom("utf8");`
			`ATOM_unicode = PL_new_atom("unicode");`
			`ATOM_ascii = PL_new_atom("ascii");`

			`PL_register_foreign("xml_quote_attribute", 3, xml_quote_attribute, 0);`
			`PL_register_foreign("xml_quote_cdata", 3, xml_quote_cdata, 0);`
			`PL_register_foreign("xml_name", 2, xml_name, 0);`
			`}`