This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/sgml/util.c

748 lines
13 KiB
C
Raw Normal View History

/* $Id$
Part of SWI-Prolog
Author: Jan Wielemaker
E-mail: jan@swi.psy.uva.nl
WWW: http://www.swi-prolog.org
Copyright (C): 1985-2002, University of Amsterdam
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */
#define UTIL_H_IMPLEMENTATION
#include "util.h"
#include <unistd.h>
#include <ctype.h>
#include <wctype.h>
#include <stdlib.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif
#include <stdio.h>
#include <string.h>
#include <errno.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif
#include <sys/stat.h>
#include <fcntl.h>
#include <assert.h>
#include "utf8.h"
/* Return the number of characters in `s`, not counting the terminating 0. */
size_t
istrlen(const ichar *s)
{ const ichar *end = s;

  while( *end )
    end++;

  return (size_t)(end - s);
}
/* Return a copy of `s` in memory obtained from sgml_malloc(), or NULL if
   `s` is NULL.
*/
ichar *
istrdup(const ichar *s)
{ size_t n, i;
  ichar *copy;

  if ( !s )
    return NULL;

  n = istrlen(s);
  copy = sgml_malloc((n+1)*sizeof(ichar));
  for(i=0; i<n; i++)
    copy[i] = s[i];
  copy[n] = 0;

  return copy;
}
/* Copy exactly `len` characters from `s` into memory obtained from
   sgml_malloc() and append a terminating 0.  The source is not checked
   for an earlier terminator.
*/
ichar *
istrndup(const ichar *s, int len)
{ ichar *copy = sgml_malloc((len+1)*sizeof(ichar));
  int i;

  for(i=0; i<len; i++)
    copy[i] = s[i];
  copy[i] = 0;

  return copy;
}
/* Copy `s`, including its terminating 0, to `d`.  Returns `d`. */
ichar *
istrcpy(ichar *d, const ichar *s)
{ ichar *out = d;

  for( ; *s; s++, out++)
    *out = *s;
  *out = 0;

  return d;
}
/* Append `s` to the 0-terminated string in `d`.  Returns `d`. */
ichar *
istrcat(ichar *d, const ichar *s)
{ ichar *tail = d;

  while( *tail )			/* find the end of d */
    tail++;
  istrcpy(tail, s);

  return d;
}
/* Copy at most `len` characters from `s` to `d`, stopping at the end of
   `s`.  Note that no terminating 0 is written to `d`.  Returns `d`.
*/
ichar *
istrncpy(ichar *d, const ichar *s, size_t len)
{ size_t i;

  for(i=0; s[i] && i<len; i++)
    d[i] = s[i];

  return d;
}
/* TRUE if `s1` and `s2` are equal when compared through towlower(). */
int
istrcaseeq(const ichar *s1, const ichar *s2)
{ for( ; *s1 != '\0'; s1++, s2++)
  { if ( towlower(*s1) != towlower(*s2) )
      return FALSE;
  }

  return *s2 == '\0';			/* s2 must end here as well */
}
/* TRUE if `s1` and `s2` are identical (case sensitive), FALSE otherwise. */
int
istreq(const ichar *s1, const ichar *s2)
{ for(;;)
  { if ( *s1 != *s2 )
      return FALSE;
    if ( *s1 == 0 )			/* both strings ended together */
      return TRUE;
    s1++, s2++;
  }
}
/* TRUE if the first `len` characters of `s1` and `s2` are equal when
   compared through towlower().  The comparison also stops, successfully,
   when both strings end before `len` characters have been compared; the
   previous code kept reading past the terminators in that case.  A `len`
   of 0 or less yields TRUE.
*/
int
istrncaseeq(const ichar *s1, const ichar *s2, int len)
{ while( --len >= 0 )
  { if ( towlower(*s1) != towlower(*s2) )
      return FALSE;
    if ( *s1 == 0 || *s2 == 0 )		/* do not read beyond the end */
      return TRUE;
    s1++, s2++;
  }

  return TRUE;
}
/* TRUE if `s` starts with `pref` (case sensitive).  An empty `pref`
   is a prefix of any string.
*/
int
istrprefix(const ichar *pref, const ichar *s)
{ for( ; *pref; pref++, s++)
  { if ( *pref != *s )
      return FALSE;
  }

  return TRUE;
}
/* Return a pointer to the first occurrence of `c` in `s`, or NULL if it
   does not occur.  The terminating 0 is never matched.
*/
ichar *
istrchr(const ichar *s, int c)
{ while( *s )
  { if ( c == *s )
      return (ichar *)s;
    s++;
  }

  return NULL;
}
/* Convert `s` to upper case in place and return it.  Uses the wide
   character mapping towupper(), matching istrlower(); the narrow
   toupper() is only defined for unsigned char values and EOF.
*/
ichar *
istrupper(ichar *s)
{ ichar *p;

  for(p=s; *p; p++)
    *p = towupper(*p);

  return s;
}
/* Convert `s` to lower case in place using towlower() and return it. */
ichar *
istrlower(ichar *s)
{ ichar *p = s;

  while( *p )
  { *p = towlower(*p);
    p++;
  }

  return s;
}
/* Compute a case sensitive hash of `t`, reduced modulo `tsize`. */
int
istrhash(const ichar *t, int tsize)
{ unsigned int h = 0;
  unsigned int rot = 5;
  const ichar *p;

  for(p=t; *p; p++)
  { unsigned int code = *p;

    code -= 'a';
    h ^= code << (rot & 0xf);
    rot ^= code;
  }

  h ^= h >> 16;				/* fold the high half into the low */

  return h % tsize;
}
/* As istrhash(), but every character is passed through towlower() first,
   so strings that differ only in case get the same hash.
*/
int
istrcasehash(const ichar *t, int tsize)
{ unsigned int h = 0;
  unsigned int rot = 5;
  const ichar *p;

  for(p=t; *p; p++)
  { unsigned int code = towlower(*p);	/* case insensitive */

    code -= 'a';
    h ^= code << (rot & 0xf);
    rot ^= code;
  }

  h ^= h >> 16;				/* fold the high half into the low */

  return h % tsize;
}
/* Parse `s` as a decimal long.  Returns TRUE and stores the value in
   *val if `s` is non-empty, is consumed completely by wcstol() and the
   value is in range.  Returns FALSE otherwise, leaving *val untouched.
*/
int
istrtol(const ichar *s, long *val)
{ long v;
  ichar *e;

  if ( !*s )
    return FALSE;

  errno = 0;				/* a stale ERANGE must not reject valid input */
  v = wcstol(s, &e, 10);
  if ( e[0] || errno == ERANGE )
    return FALSE;

  *val = v;
  return TRUE;
}
/*******************************
* INPUT CHARACTER BUFFER *
*******************************/
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Input character buffer is used to collect data between SGML markup, such
as <...>
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Allocate a new, empty input character buffer.  No data area is
   allocated until the first character is added.
*/
icharbuf *
new_icharbuf()
{ icharbuf *b = sgml_malloc(sizeof(*b));

  b->data      = NULL;
  b->size      = 0;
  b->allocated = 0;

  return b;
}
/* Release the data area of `buf` (if any) and `buf` itself. */
void
free_icharbuf(icharbuf *buf)
{ sgml_free(buf->data);			/* sgml_free() ignores NULL */
  sgml_free(buf);
}
/* Append `chr` to `buf`.  When the buffer is full its capacity is
   doubled, starting at 128 characters for a buffer without data area.
*/
void
__add_icharbuf(icharbuf *buf, int chr)
{ if ( buf->size == buf->allocated )
  { if ( buf->allocated )
      buf->allocated *= 2;
    else
      buf->allocated = 128;

					/* sgml_realloc() allocates if data is NULL */
    buf->data = sgml_realloc(buf->data, buf->allocated*sizeof(ichar));
  }

  buf->data[buf->size] = chr;
  buf->size++;
}
/* Remove the last character from `buf`; no-op on an empty buffer. */
void
del_icharbuf(icharbuf *buf)
{ if ( buf->size <= 0 )
    return;

  buf->size -= 1;
}
/* Make the data of `buf` 0-terminated: a 0 is appended through
   add_icharbuf() and then the size is decremented again, so the
   terminator is not counted as content.
*/
void
terminate_icharbuf(icharbuf *buf)
{ add_icharbuf(buf, '\0');
  buf->size -= 1;
}
/* Discard the content of `buf`.  The data area is kept for reuse. */
void
empty_icharbuf(icharbuf *buf)
{ buf->size = 0;
}
/*******************************
* OUTPUT CHARACTER BUFFER *
*******************************/
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Output character buffer deals with two representations: ISO Latin-1 and
UCS. It starts life as ISO Latin-1 and is upgraded to UCS as the first
character that doesn't fit ISO Latin-1 is added to the buffer.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Initialise `buf` as an empty buffer that stores its data in the
   embedded local buffer.  Returns `buf`.
*/
ocharbuf *
init_ocharbuf(ocharbuf *buf)
{ buf->data.w    = buf->localbuf;
  buf->allocated = sizeof(buf->localbuf)/sizeof(wchar_t);
  buf->size      = 0;

  return buf;
}
/* Allocate a new output character buffer and initialise it with
   init_ocharbuf().
*/
ocharbuf *
new_ocharbuf()
{ ocharbuf *fresh = sgml_malloc(sizeof(*fresh));

  init_ocharbuf(fresh);

  return fresh;
}
/* Release `buf` itself and, if the data no longer lives in the embedded
   local buffer, the separately allocated data area.
*/
void
free_ocharbuf(ocharbuf *buf)
{ if ( buf->data.w != buf->localbuf )
    sgml_free(buf->data.w);		/* sgml_free() ignores NULL */

  sgml_free(buf);
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Make sure the data of the buffer is malloc'ed and nul-terminated.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Ensure the data of `buf` is 0-terminated and does not live in the
   embedded local buffer.  Data still in the local buffer is copied to
   memory from sgml_malloc() that holds exactly size+1 characters;
   buf->allocated is left unchanged.  Returns `buf`.
*/
ocharbuf *
malloc_ocharbuf(ocharbuf *buf)
{ if ( buf->data.w == buf->localbuf )
  { size_t used = buf->size * sizeof(wchar_t);
    wchar_t *copy = sgml_malloc(used + sizeof(wchar_t));

					/* copy only the characters in use: the local */
					/* buffer may be full, so there is no slot for */
					/* a terminator to read from */
    memcpy(copy, buf->localbuf, used);
    copy[buf->size] = 0;
    buf->data.w = copy;
  } else
  { terminate_ocharbuf(buf);
  }

  return buf;
}
/* Append `chr` to `buf`.  When the buffer is full its capacity is
   doubled; on the first growth the content of the embedded local buffer
   is moved to memory from sgml_malloc().
*/
void
add_ocharbuf(ocharbuf *buf, int chr)
{ if ( buf->size == buf->allocated )
  { wchar_t *grown;

    buf->allocated *= 2;
    if ( buf->data.w == (wchar_t*)buf->localbuf )
    { grown = sgml_malloc(buf->allocated*sizeof(wchar_t));
      memcpy(grown, buf->localbuf, sizeof(buf->localbuf));
    } else
    { grown = sgml_realloc(buf->data.w, buf->allocated*sizeof(wchar_t));
    }
    buf->data.w = grown;
  }

  buf->data.w[buf->size] = chr;
  buf->size++;
}
/* Remove the last character from `buf`; no-op on an empty buffer. */
void
del_ocharbuf(ocharbuf *buf)
{ if ( buf->size <= 0 )
    return;

  buf->size -= 1;
}
/* Make the data of `buf` 0-terminated: a 0 is appended with
   add_ocharbuf() and then the size is decremented again, so the
   terminator is not counted as content.
*/
void
terminate_ocharbuf(ocharbuf *buf)
{ add_ocharbuf(buf, '\0');
  buf->size -= 1;
}
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
empty_ocharbuf() frees the associated buffer after a big lump has been
in it. Otherwise it simply sets the size to 0.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Discard the content of `buf`.  If the capacity has grown beyond 8192
   characters the allocated data area is released and the buffer falls
   back to its embedded local buffer; otherwise the data area is kept.
*/
void
empty_ocharbuf(ocharbuf *buf)
{ buf->size = 0;

  if ( buf->allocated <= 8192 )
    return;

  assert(buf->data.w != buf->localbuf);
  sgml_free(buf->data.w);
  buf->data.w    = buf->localbuf;
  buf->allocated = sizeof(buf->localbuf)/sizeof(wchar_t);
}
/*******************************
* BUFFER RING *
*******************************/
/* Number of slots in the ring of temporary allocations */
#define RINGSIZE 16
/* Slots holding the most recent ring allocations.  str2ring() and
   ringallo() release the previous content of a slot when they reuse it */
static void *ring[RINGSIZE];
/* Index of the slot the next ring allocation is stored in */
static int ringp;
/* Store a copy of `in` in the next slot of the ring and return the copy.
   Whatever the slot held before is released, so the result is only
   valid until the ring has wrapped around (RINGSIZE further ring
   allocations).
*/
wchar_t *
str2ring(const wchar_t *in)
{ size_t chars = wcslen(in)+1;
  wchar_t *copy = sgml_malloc(chars*sizeof(wchar_t));

  if ( copy == NULL )
  { sgml_nomem();
    return NULL;
  }
  wcscpy(copy, in);			/* copy before releasing the old slot */

  sgml_free(ring[ringp]);		/* sgml_free() ignores NULL */
  ring[ringp] = copy;
  if ( ++ringp == RINGSIZE )
    ringp = 0;

  return copy;
}
/* Allocate `size` bytes with sgml_malloc() and store the result in the
   next slot of the ring, releasing what the slot held before.  The
   memory is only valid until the ring has wrapped around (RINGSIZE
   further ring allocations).
*/
void *
ringallo(size_t size)
{ char *mem = sgml_malloc(size);

  sgml_free(ring[ringp]);		/* sgml_free() ignores NULL */
  ring[ringp] = mem;
  if ( ++ringp == RINGSIZE )
    ringp = 0;

  return mem;
}
/*******************************
* MISC *
*******************************/
/* Return `s` itself if it is shorter than `len` characters.  Otherwise
   return an abbreviation made of the first len-5 characters, the text
   " ... " and the last 5 characters of `s`.  The abbreviation is
   allocated with ringallo().
   NOTE(review): the arithmetic looks like it expects len >= 5; confirm
   against the callers.
*/
wchar_t const *
str_summary(wchar_t const *s, int len)
{ size_t total = wcslen(s);
  wchar_t *out;
  int head;

  if ( total < (size_t)len )
    return s;

  head = len-5;
  out = ringallo((len + 10)*sizeof(wchar_t));
  wcsncpy(out, s, head);
  wcscpy(out+head, L" ... ");
  wcscpy(out+len, s+total-5);

  return out;
}
/* Convert the 0-terminated UTF-8 string `in` to a 0-terminated wide
   character string in memory obtained from sgml_malloc().
*/
wchar_t *
utf8towcs(const char *in)
{ size_t bytes = strlen(in);
  const char *end = in+bytes;
  size_t chars = utf8_strlen(in, bytes);
  wchar_t *out = sgml_malloc((chars + 1)*sizeof(wchar_t));
  wchar_t *o = out;

  while( in < end )
  { int code;

    in = utf8_get_char(in, &code);
    *o++ = code;
  }
  *o = 0;

  return out;
}
/* Convert the 0-terminated wide character string `in` to a 0-terminated
   UTF-8 string in memory obtained from sgml_malloc().  A first pass
   determines the number of bytes needed, a second pass does the
   encoding.
*/
char *
wcstoutf8(const wchar_t *in)
{ const wchar_t *p;
  size_t bytes = 0;
  char *out, *w;

  for(p=in; *p; p++)			/* pass 1: measure */
  { if ( *p < 0x80 )
    { bytes++;
    } else
    { char tmp[6];
      char *end = utf8_put_char(tmp, *p);

      bytes += end-tmp;
    }
  }

  out = sgml_malloc(bytes+1);
  w = out;
  for(p=in; *p; p++)			/* pass 2: encode */
  { w = utf8_put_char(w, *p);
  }
  *w = '\0';

  return out;
}
/*******************************
* FILES *
*******************************/
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Load a file into memory. This would be so easy if we didn't have to deal
with &#RE/&#RS handling that forces us to create the proper record start
and end.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Where <fcntl.h> does not define O_BINARY, define it as 0 so that it
   can be or-ed into the open() flags without effect */
#ifndef O_BINARY
#define O_BINARY 0
#endif
/* fopen() for a wide character file name.  The name is converted to a
   multibyte string with wcstombs().  Returns NULL if the name is empty,
   cannot be converted, or fopen() fails.
*/
FILE *
wfopen(const wchar_t *name, const char *mode)
{ size_t mbl = wcstombs(NULL, name, 0);
  char *mbs;
  FILE *f;

					/* wcstombs() reports an unconvertible */
					/* character as (size_t)-1, which a plain */
					/* `> 0' test would accept */
  if ( mbl == (size_t)-1 || mbl == 0 )
    return NULL;

  mbs = sgml_malloc(mbl+1);
  wcstombs(mbs, name, mbl+1);
  f = fopen(mbs, mode);
  sgml_free(mbs);

  return f;
}
/* open() for a wide character file name.  The name is converted to a
   multibyte string with wcstombs().  Returns the file descriptor, or -1
   if the name is empty, cannot be converted, or open() fails.
*/
static int
wopen(const wchar_t *name, int flags)
{ size_t mbl = wcstombs(NULL, name, 0);
  char *mbs;
  int fd;

					/* wcstombs() reports an unconvertible */
					/* character as (size_t)-1, which a plain */
					/* `> 0' test would accept */
  if ( mbl == (size_t)-1 || mbl == 0 )
    return -1;

  mbs = sgml_malloc(mbl+1);
  wcstombs(mbs, name, mbl+1);
  fd = open(mbs, flags);
  sgml_free(mbs);

  return fd;
}
/* Load the content of `file` into a 0-terminated ichar string obtained
   from sgml_malloc().  Every byte of the file becomes one character
   with the value of that byte (0..255).  Content following an embedded
   0 byte is ignored.

   If `normalise_rsre` is non-zero, each LF that is not the first byte
   and is not preceded by CR is expanded to CR LF, and a final LF is
   removed.  If it is zero the content is copied unchanged.

   If `length` is not NULL it receives the number of characters in the
   result.  Returns NULL if the file cannot be opened, examined with
   fstat(), or read.  The file descriptor is closed on every path.
*/
ichar *
load_sgml_file_to_charp(const ichar *file, int normalise_rsre, size_t *length)
{ int fd;
  struct stat st;
  size_t len;
  size_t extra_cr = 0;
  char *raw, *s;
  ichar *out, *t;

  if ( (fd = wopen(file, O_RDONLY|O_BINARY)) < 0 )
    return NULL;

  if ( fstat(fd, &st) != 0 )
  { close(fd);
    return NULL;
  }

  len = st.st_size;
  if ( !(raw = sgml_malloc(len+1)) )
  { close(fd);
    return NULL;
  }

  s = raw;
  while( len > 0 )
  { int n = (int)read(fd, s, (unsigned int)len);

    if ( n < 0 )			/* I/O error */
    { close(fd);
      sgml_free(raw);
      return NULL;
    }
    if ( n == 0 )			/* file is shorter than st_size */
      break;

    len -= n;
    s += n;
  }
  close(fd);

  len = s-raw;
  *s = '\0';				/* ensure closing EOS */

  if ( normalise_rsre )			/* count the CRs to insert */
  { for(s=raw; *s; s++)
    { if ( *s == '\n' && s>raw && s[-1] != '\r' )
	extra_cr++;
    }
  }

  out = sgml_malloc((len+extra_cr+1)*sizeof(ichar));
  for(s=raw, t=out; *s; s++)
  { if ( *s == '\n' )
    {					/* insert only what has been counted above */
      if ( normalise_rsre && s>raw && s[-1] != '\r' )
	*t++ = CR;
      *t++ = LF;
    } else
    { *t++ = (unsigned char)*s;		/* no sign extension of bytes >= 0x80 */
    }
  }
  len = t-out;
  *t = '\0';

					/* delete last LF; tested on the converted */
					/* text so that an empty result is never */
					/* indexed below its start */
  if ( normalise_rsre && len > 0 && out[len-1] == LF )
    out[--len] = '\0';

  if ( length )
    *length = len;

  sgml_free(raw);

  return out;
}
/*******************************
* ALLOCATION *
*******************************/
#ifdef _WINDOWS
#include <windows.h>
#endif
/* Report an out-of-memory condition on stderr (and in a message box when
   built with _WINDOWS) and terminate the process with exit status 1.
*/
void
sgml_nomem()
{ fputs("SGML: Fatal: out of memory\n", stderr);

#ifdef _WINDOWS
  MessageBox(NULL, "SGML: Fatal: out of memory", "SGML", MB_OK|MB_TASKMODAL);
#endif

  exit(1);
}
/* malloc() wrapper.  A request for 0 bytes yields NULL; if malloc()
   fails, sgml_nomem() is called.
*/
void *
sgml_malloc(size_t size)
{ void *mem;

  if ( size == 0 )
    return NULL;

  mem = malloc(size);
  if ( mem == NULL )
  { sgml_nomem();
    return NULL;
  }

  return mem;
}
/* realloc() wrapper.  A NULL `old` is handled by calling malloc().  If
   the allocation function returns NULL, sgml_nomem() is called.
*/
void *
sgml_realloc(void *old, size_t size)
{ void *mem = old ? realloc(old, size) : malloc(size);

  if ( mem == NULL )
  { sgml_nomem();
    return NULL;
  }

  return mem;
}
/* calloc() wrapper.  If calloc() returns NULL, sgml_nomem() is called. */
void *
sgml_calloc(size_t n, size_t size)
{ void *mem = calloc(n, size);

  if ( mem == NULL )
  { sgml_nomem();
    return NULL;
  }

  return mem;
}
/* free() wrapper.  Passing NULL is allowed and does nothing. */
void
sgml_free(void *mem)
{ free(mem);				/* free(NULL) is a no-op */
}
/*******************************
* DEBUG *
*******************************/
/* Debugging aid: write the wide character string `s` to stderr.  No
   newline is added.
*/
void
wputs(ichar *s)
{ fwprintf(stderr, L"%ls", s);
}