/* $Id$ Part of SWI-Prolog Author: Jan Wielemaker E-mail: jan@swi.psy.uva.nl WWW: http://www.swi-prolog.org Copyright (C): 1985-2002, University of Amsterdam This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #define _ISOC99_SOURCE 1 /* fwprintf(), etc prototypes */ #define UTIL_H_IMPLEMENTATION #include "util.h" #include <ctype.h> #include <wctype.h> #include <stdlib.h> #ifdef HAVE_MALLOC_H #include <malloc.h> #endif #include <stdio.h> #include <string.h> #include <errno.h> #ifdef HAVE_UNISTD_H #include <unistd.h> #endif #ifdef HAVE_IO_H #include <io.h> #endif #include <sys/stat.h> #include <fcntl.h> #include <assert.h> #include "utf8.h" size_t istrlen(const ichar *s) { size_t len =0; while(*s++) len++; return len; } ichar * istrdup(const ichar *s) { if ( s ) { ichar *dup = sgml_malloc((istrlen(s)+1)*sizeof(ichar)); ichar *d = dup; while(*s) *d++ = *s++; *d = 0; return dup; } else { return NULL; } } ichar * istrndup(const ichar *s, int len) { ichar *dup = sgml_malloc((len+1)*sizeof(ichar)); ichar *d = dup; while(--len >= 0) *d++ = *s++; *d = 0; return dup; } ichar * istrcpy(ichar *d, const ichar *s) { ichar *r = d; while(*s) *d++ = *s++; *d = 0; return r; } ichar * istrcat(ichar *d, const ichar *s) { ichar *r = d; d += istrlen(d); istrcpy(d, s); return r; } ichar * istrncpy(ichar *d, const ichar *s, size_t len) { ichar *r = d; 
while(*s && len-- > 0) *d++ = *s++; return r; } int istrcaseeq(const ichar *s1, const ichar *s2) { ichar c; while ((c = *s1++) != '\0') { if (towlower(*s2++) != towlower(c)) return FALSE; } return *s2 == '\0'; } int istreq(const ichar *s1, const ichar *s2) { while(*s1 && *s1 == *s2) s1++, s2++; if ( *s1 == 0 && *s2 == 0 ) return TRUE; return FALSE; } int istrncaseeq(const ichar *s1, const ichar *s2, int len) { while(--len >= 0 && towlower(*s1) == towlower(*s2)) s1++, s2++; if ( len < 0 ) return TRUE; return FALSE; } int istrprefix(const ichar *pref, const ichar *s) { while(*pref && *pref == *s) pref++, s++; if ( *pref == 0 ) return TRUE; return FALSE; } ichar * istrchr(const ichar *s, int c) { for( ; *s; s++ ) { if ( c == *s ) return (ichar *)s; } return NULL; } ichar * istrupper(ichar *s) { ichar *r = s; for( ; *s; s++) *s = toupper(*s); return r; } ichar * istrlower(ichar *s) { ichar *r = s; for( ; *s; s++) *s = towlower(*s); return r; } int istrhash(const ichar *t, int tsize) { unsigned int value = 0; unsigned int shift = 5; while(*t) { unsigned int c = *t++; c -= 'a'; value ^= c << (shift & 0xf); shift ^= c; } value = value ^ (value >> 16); return value % tsize; } int istrcasehash(const ichar *t, int tsize) { unsigned int value = 0; unsigned int shift = 5; while(*t) { unsigned int c = towlower(*t++); /* case insensitive */ c -= 'a'; value ^= c << (shift & 0xf); shift ^= c; } value = value ^ (value >> 16); return value % tsize; } int istrtol(const ichar *s, long *val) { long v; ichar *e; if ( *s ) { v = wcstol(s, &e, 10); if ( !e[0] && errno != ERANGE ) { *val = v; return TRUE; } } return FALSE; } /******************************* * INPUT CHARACTER BUFFER * *******************************/ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Input character buffer is used to collect data between SGML markup, such as <...> - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ icharbuf * new_icharbuf() { icharbuf *buf 
= sgml_malloc(sizeof(*buf)); buf->allocated = 0; buf->size = 0; buf->data = NULL; return buf; } void free_icharbuf(icharbuf *buf) { if ( buf->data ) sgml_free(buf->data); sgml_free(buf); } void __add_icharbuf(icharbuf *buf, int chr) { if ( buf->size == buf->allocated ) { buf->allocated = (buf->allocated ? buf->allocated*2 : 128); if ( buf->data ) buf->data = sgml_realloc(buf->data, buf->allocated*sizeof(ichar)); else buf->data = sgml_malloc(buf->allocated*sizeof(ichar)); } buf->data[buf->size++] = chr; } void del_icharbuf(icharbuf *buf) { if ( buf->size > 0 ) buf->size--; } void terminate_icharbuf(icharbuf *buf) { add_icharbuf(buf, '\0'); buf->size--; } void empty_icharbuf(icharbuf *buf) { buf->size = 0; } /******************************* * OUTPUT CHARACTER BUFFER * *******************************/ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Output character buffer deals with two representations: ISO Latin-1 and UCS. It starts life as ISO Latin-1 and is upgraded to UCS as the first character that doesn't fit ISO Latin-1 is added to the buffer. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ ocharbuf * init_ocharbuf(ocharbuf *buf) { buf->size = 0; buf->allocated = sizeof(buf->localbuf)/sizeof(wchar_t); buf->data.w = buf->localbuf; return buf; } ocharbuf * new_ocharbuf() { ocharbuf *buf = sgml_malloc(sizeof(*buf)); return init_ocharbuf(buf); } void free_ocharbuf(ocharbuf *buf) { if ( buf->data.w && buf->data.w != buf->localbuf ) sgml_free(buf->data.w); sgml_free(buf); } /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Make sure the data of the buffer is malloc'ed and nul-terminated. 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* If the data still lives in the embedded local buffer, move it to a
   freshly sgml_malloc()ed area of size+1 characters and 0-terminate
   it; otherwise just 0-terminate the existing area.  Returns buf.

   NOTE(review): buf->allocated is not updated after the move, so it no
   longer describes the capacity of the new area.  This looks like it
   is meant for handing the data to a caller rather than for further
   appending -- confirm against the callers.
*/

ocharbuf *
malloc_ocharbuf(ocharbuf *buf)
{
  if ( buf->data.w == buf->localbuf ) {
    /* Copy only the characters actually stored.  Copying size+1
       elements would read one element beyond localbuf when the local
       buffer is exactly full; the terminator is written explicitly
       below anyway.
    */
    size_t used = (size_t)buf->size * sizeof(wchar_t);

    buf->data.w = sgml_malloc(used + sizeof(wchar_t));
    memcpy(buf->data.w, buf->localbuf, used);
    buf->data.w[buf->size] = 0;
  } else {
    terminate_ocharbuf(buf);
  }

  return buf;
}


/* Append chr to the buffer.  When the buffer is full its capacity is
   doubled; the first time this happens the content is moved from the
   embedded local buffer to heap memory.
*/

void
add_ocharbuf(ocharbuf *buf, int chr)
{
  if ( buf->size == buf->allocated ) {
    buf->allocated *= 2;

    if ( buf->data.w != buf->localbuf ) {
      buf->data.w = sgml_realloc(buf->data.w,
				 buf->allocated*sizeof(wchar_t));
    } else {
      buf->data.w = sgml_malloc(buf->allocated*sizeof(wchar_t));
      memcpy(buf->data.w, buf->localbuf, sizeof(buf->localbuf));
    }
  }

  buf->data.w[buf->size++] = chr;
}


/* Drop the last character, if any. */

void
del_ocharbuf(ocharbuf *buf)
{
  if ( buf->size > 0 )
    buf->size--;
}


/* Make sure the data is followed by a 0 character that is not counted
   in buf->size.
*/

void
terminate_ocharbuf(ocharbuf *buf)
{
  add_ocharbuf(buf, '\0');
  buf->size--;
}


/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
empty_ocharbuf() frees the associated buffer after a big lump has been
in it. Otherwise it simply sets the size to 0.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ void empty_ocharbuf(ocharbuf *buf) { buf->size = 0; if ( buf->allocated > 8192 ) { assert(buf->data.w != buf->localbuf); sgml_free(buf->data.w); buf->allocated = sizeof(buf->localbuf)/sizeof(wchar_t); buf->data.w = buf->localbuf; } } /******************************* * BUFFER RING * *******************************/ #define RINGSIZE 16 typedef struct ring { void *ring[RINGSIZE]; int ringp; } ring; #ifdef _REENTRANT #include <pthread.h> static pthread_key_t ring_key; static void free_ring(void *ptr) { ring *r = ptr; int i; void **bp; for(i=0, bp=r->ring; i<RINGSIZE; i++, bp++) { if ( *bp ) { sgml_free(*bp); *bp = NULL; } } sgml_free(r); } static ring * my_ring() { ring *r; if ( (r=pthread_getspecific(ring_key)) ) return r; if ( (r = sgml_calloc(1, sizeof(*r))) ) pthread_setspecific(ring_key, r); return r; } void init_ring(void) { pthread_key_create(&ring_key, free_ring); } #else static ring ring_store; #define my_ring() (&ring_store) void init_ring(void) {} #endif wchar_t * str2ring(const wchar_t *in) { ring *r; wchar_t *copy; if ( !(r=my_ring()) || !(copy = sgml_malloc((wcslen(in)+1)*sizeof(wchar_t))) ) { sgml_nomem(); return NULL; } wcscpy(copy, in); if ( r->ring[r->ringp] ) sgml_free(r->ring[r->ringp]); r->ring[r->ringp++] = copy; if ( r->ringp == RINGSIZE ) r->ringp = 0; return copy; } void * ringallo(size_t size) { ring *r; char *result; if ( !(r=my_ring()) || !(result = sgml_malloc(size)) ) { sgml_nomem(); return NULL; } if ( r->ring[r->ringp] ) sgml_free(r->ring[r->ringp]); r->ring[r->ringp++] = result; if ( r->ringp == RINGSIZE ) r->ringp = 0; return result; } /******************************* * MISC * *******************************/ wchar_t const * str_summary(wchar_t const *s, int len) { wchar_t *buf; size_t l = wcslen(s); if ( l < (size_t)len ) return s; buf = ringallo((len + 10)*sizeof(wchar_t)); wcsncpy(buf, s, len-5); wcscpy(&buf[len-5], L" ... 
"); wcscpy(&buf[len], &s[l-5]); return buf; } wchar_t * utf8towcs(const char *in) { size_t sl = strlen(in); size_t len = utf8_strlen(in, sl); wchar_t *buf = sgml_malloc((len + 1)*sizeof(wchar_t)); const char *e = in+sl; int i; for(i=0; in < e;) { int chr; in = utf8_get_char(in, &chr); buf[i++] = chr; } buf[i] = 0; return buf; } char * wcstoutf8(const wchar_t *in) { size_t size = 0; const wchar_t *s; char *rc, *o; for(s=in; *s; s++) { char buf[6]; if ( *s >= 0x80 ) { char *o2 = utf8_put_char(buf, *s); size += o2-buf; } else { size++; } } rc = sgml_malloc(size+1); for(o=rc, s=in; *s; s++) { o = utf8_put_char(o, *s); } *o = '\0'; return rc; } /******************************* * FILES * *******************************/ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Load a file into memory. This would be so easy if we didn't had to deal with &#RE/&#RS handling that forces us to create the proper record start and end. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #ifndef O_BINARY #define O_BINARY 0 #endif FILE * wfopen(const wchar_t *name, const char *mode) { size_t mbl = wcstombs(NULL, name, 0); if ( mbl > 0 ) { char *mbs = sgml_malloc(mbl+1); FILE *f; wcstombs(mbs, name, mbl+1); f = fopen(mbs, mode); sgml_free(mbs); return f; } return NULL; } static int wopen(const wchar_t *name, int flags) { size_t mbl = wcstombs(NULL, name, 0); if ( mbl > 0 ) { char *mbs = sgml_malloc(mbl+1); int fd; wcstombs(mbs, name, mbl+1); fd = open(mbs, flags); sgml_free(mbs); return fd; } return -1; } ichar * load_sgml_file_to_charp(const ichar *file, int normalise_rsre, size_t *length) { int fd; if ( (fd = wopen(file, O_RDONLY|O_BINARY)) >= 0 ) { struct stat buf; if ( fstat(fd, &buf) == 0 ) { size_t len = buf.st_size; char *r = sgml_malloc(len+1); if ( r ) { char *s = r; while(len>0) { int n; if ( (n=(int)read(fd, s, (unsigned int)len)) < 0 ) { close(fd); /* I/O error */ sgml_free(r); return NULL; } else if ( n == 0 ) break; len -= 
n; s += n; } len = s-r; *s = '\0'; /* ensure closing EOS */ close(fd); { int nl; int last_is_lf; ichar *r2, *t; if ( normalise_rsre ) { last_is_lf = (len > 0 && s[-1] == '\n'); for(s=r, nl=0; *s; s++) { if ( *s == '\n' && s>r && s[-1] != '\r' ) nl++; } } else { nl = 0; last_is_lf = 0; } r2 = sgml_malloc((len+nl+1)*sizeof(ichar)); for(s=r, t=r2; *s; s++) { if ( *s == '\n' ) { if ( s>r && s[-1] != '\r' ) *t++ = CR; *t++ = LF; } else *t++ = *s; } len = t-r2; *t = '\0'; if ( last_is_lf ) r2[--len] = '\0'; /* delete last LF */ if ( length ) *length = len; sgml_free(r); return r2; } } } } return NULL; } /******************************* * ALLOCATION * *******************************/ #ifdef _WINDOWS #include <windows.h> #endif void sgml_nomem() { fprintf(stderr, "SGML: Fatal: out of memory\n"); #ifdef _WINDOWS MessageBox(NULL, "SGML: Fatal: out of memory", "SGML", MB_OK|MB_TASKMODAL); #endif exit(1); } void * sgml_malloc(size_t size) { void *mem; if ( size == 0 ) return NULL; if ( (mem = malloc(size)) ) return mem; sgml_nomem(); return NULL; } void * sgml_realloc(void *old, size_t size) { void *mem; if ( old ) { if ( (mem = realloc(old, size)) ) return mem; } else { if ( (mem = malloc(size)) ) return mem; } sgml_nomem(); return NULL; } void * sgml_calloc(size_t n, size_t size) { void *mem; if ( (mem=calloc(n, size)) ) return mem; sgml_nomem(); return NULL; } void sgml_free(void *mem) { if ( mem ) free(mem); } /******************************* * DEBUG * *******************************/ void wputs(ichar *s) { fwprintf(stderr, L"%ls", s); }