This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/semweb/atom.c

732 lines
14 KiB
C

/* $Id$
Part of the SWI-Prolog Semweb package
Author: Jan Wielemaker
E-mail: wielemak@science.uva.nl
WWW: http://www.swi-prolog.org
Copyright (C): 2006, University of Amsterdam
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#include <SWI-Stream.h>
#include <SWI-Prolog.h>
#include "atom.h"
#include "murmur.h"
#include <wchar.h>
#include <wctype.h>
#include <assert.h>
#ifdef __WINDOWS__
#define inline __inline
#endif
#include "unicode_map.c"
/*******************************
* TEXT HANDLING *
*******************************/
/* Fetch the text of `atom` into `txt`.

   The narrow (charA) representation is tried first and the wide (charW)
   representation second.  On success exactly one of txt->a / txt->w is
   non-NULL and txt->length holds the length reported by the Prolog API.
   Returns TRUE on success and FALSE if neither call yields text.
*/
static inline int
get_atom_text(atom_t atom, text *txt)
{ txt->a = (const charA*)PL_atom_nchars(atom, &txt->length);
  if ( txt->a != NULL )
  { txt->w = NULL;
    return TRUE;
  }

  txt->w = (const charW*)PL_atom_wchars(atom, &txt->length);
  if ( txt->w == NULL )
    return FALSE;

  txt->a = NULL;
  return TRUE;
}
/* Return character `i` of `txt`, reading from whichever representation
   (narrow or wide) is present.  No bounds checking is performed.
*/
inline wint_t
fetch(const text *txt, int i)
{ if ( txt->a )
    return (wint_t)txt->a[i];

  return (wint_t)txt->w[i];
}
/* Lazily resolve the text of info->handle.

   The lookup is done at most once per atom_info: the outcome is cached
   in info->rc and info->resolved is set.  If the atom has no text, both
   text pointers are cleared.  Returns the cached result of
   get_atom_text().
*/
static int
fill_atom_info(atom_info *info)
{ if ( info->resolved )
    return info->rc;

  info->resolved = TRUE;
  info->rc = get_atom_text(info->handle, &info->text);
  if ( !info->rc )
  { info->text.a = NULL;
    info->text.w = NULL;
  }

  return info->rc;
}
/*******************************
* COMPARE *
*******************************/
/* Compare two narrow characters by their sort key (sort_pointA()).

   The result is the difference of the keys' upper parts (key>>8).  When
   those are equal and *dl2 is still 0, the difference of the low bytes
   is stored in *dl2 so the caller can use it as a tie-breaker.
   Identical characters return 0 without touching *dl2.
*/
static inline int
cmpA(int c1, int c2, int *dl2)
{ int key1, key2, primary;

  if ( c1 == c2 )
    return 0;

  key1 = sort_pointA(c1);
  key2 = sort_pointA(c2);
  primary = (key1>>8) - (key2>>8);

  if ( primary == 0 && *dl2 == 0 )
    *dl2 = (key1&0xff) - (key2&0xff);

  return primary;
}
/* Compare two wide characters by their sort key (sort_point()).

   The result is the difference of the keys' upper parts (key>>8).  When
   those are equal and *dl2 is still 0, the difference of the low bytes
   is stored in *dl2 so the caller can use it as a tie-breaker.
   Identical characters return 0 without touching *dl2.
*/
static inline int
cmpW(int c1, int c2, int *dl2)
{ int key1, key2, primary;

  if ( c1 == c2 )
    return 0;

  key1 = sort_point(c1);
  key2 = sort_point(c2);
  primary = (key1>>8) - (key2>>8);

  if ( primary == 0 && *dl2 == 0 )
    *dl2 = (key1&0xff) - (key2&0xff);

  return primary;
}
/* Order the atom described by `info` relative to atom `a2`.

   Returns 0 only if both handles are identical.  Otherwise the texts
   are compared character by character on the upper part of their sort
   keys (see cmpA()/cmpW()); a shorter text that is a prefix of the
   other sorts first.  If that comparison is a tie, the first low-byte
   key difference that was recorded decides; if that is a tie as well,
   or one of the atoms has no text, the atom handles themselves are
   compared so distinct atoms never compare equal.
*/
int
cmp_atom_info(atom_info *info, atom_t a2)
{ text other;
  int secondary = 0;			/* first low-byte difference seen */
  size_t left;				/* characters left in common prefix */
  int idx;

  if ( info->handle == a2 )
    return 0;

  if ( !fill_atom_info(info) || !get_atom_text(a2, &other) )
    goto by_handle;			/* non-text atoms? */

  if ( info->text.a && other.a )	/* both narrow: 0-terminated scan */
  { const charA *p = info->text.a;
    const charA *q = other.a;

    for(;; p++, q++)
    { int d = cmpA(*p, *q, &secondary);

      if ( d != 0 )
	return d;
      if ( *p == 0 )
	goto tie;
    }
  }

  left = other.length;
  if ( info->text.length < other.length )
    left = info->text.length;

  if ( info->text.w && other.w )	/* both wide */
  { const charW *p = info->text.w;
    const charW *q = other.w;

    for( ; left > 0; left--, p++, q++ )
    { int d = cmpW(*p, *q, &secondary);

      if ( d != 0 )
	return d;
    }
  } else				/* one narrow, one wide */
  { for(idx=0; left > 0; left--, idx++)
    { wint_t c1 = fetch(&info->text, idx);
      wint_t c2 = fetch(&other, idx);
      int d = cmpW(c1, c2, &secondary);

      if ( d != 0 )
	return d;
    }
  }

  if ( info->text.length != other.length )
    return info->text.length < other.length ? -1 : 1;

tie:
  if ( secondary )
    return secondary;

by_handle:
  return info->handle < a2 ? -1 : 1;	/* == already covered */
}
/* Compare two atoms using the ordering of cmp_atom_info().
   Returns 0 if the handles are identical, otherwise a non-zero value
   whose sign gives the order.
*/
int
cmp_atoms(atom_t a1, atom_t a2)
{ if ( a1 != a2 )
  { atom_info info = {0};

    info.handle = a1;
    return cmp_atom_info(&info, a2);
  }

  return 0;
}
/*******************************
* HASH *
*******************************/
/* Hash `len` narrow characters starting at `s`.

   Each character is first replaced by the upper part of its sort key
   (sort_pointA(c)>>8), truncated to a byte.  The text is processed in
   chunks of at most 256 characters; each chunk is hashed with
   rdf_murmer_hash() and the chunk hashes are XOR-ed together.  An empty
   string hashes to 0.

   The chunk is filled by index: forming a pointer before the start of
   an array (as in `buf-1` or decrementing `t` at the start of the
   input) is undefined behaviour in C, even if it is never dereferenced.
*/
static unsigned int
string_hashA(const char *s, size_t len)
{ const unsigned char *t = (const unsigned char *)s;
  unsigned int hash = 0;

  while( len > 0 )
  { unsigned char buf[256];
    int cp = len > sizeof(buf) ? (int)sizeof(buf) : (int)len;
    int i;

    for(i=0; i<cp; i++)
      buf[i] = (unsigned char)(sort_pointA(t[i])>>8);

    hash ^= rdf_murmer_hash(buf, cp, MURMUR_SEED);
    t   += cp;
    len -= cp;
  }

  return hash;
}
/* Hash `len` wide characters starting at `t`.

   Each character is replaced by the upper part of its sort key
   (sort_point(c)>>8) stored as a 16-bit value.  The text is processed
   in chunks of at most 256 characters; each chunk is hashed with
   rdf_murmer_hash() and the chunk hashes are XOR-ed together.  An empty
   string hashes to 0.
*/
static unsigned int
string_hashW(const wchar_t *t, size_t len)
{ unsigned int hash = 0;

  while( len > 0 )
  { unsigned short keys[256];
    int chunk = len > 256 ? 256 : (int)len;
    int k;

    for(k=0; k<chunk; k++)
      keys[k] = (short)(sort_point(t[k])>>8);
    t += chunk;

    hash ^= rdf_murmer_hash(keys, chunk*sizeof(short), MURMUR_SEED);
    len -= chunk;
  }

  return hash;
}
/* Hash atom `a` on the upper part of its characters' sort keys, using
   string_hashA() for narrow text and string_hashW() for wide text.
   An atom without text trips an assertion; with assertions disabled the
   result is 0.
*/
unsigned int
atom_hash_case(atom_t a)
{ size_t len;
  const char *narrow = PL_atom_nchars(a, &len);

  if ( narrow )
    return string_hashA(narrow, len);

  { const wchar_t *wide = PL_atom_wchars(a, &len);

    if ( wide )
      return string_hashW(wide, len);
  }

  assert(0);
  return 0;
}
/*******************************
* FIND FIRST *
*******************************/
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Given an atom, return a new one that has all its characters modified
such that it appears first in the set of atoms considered equal after
case canonisation and diacritics removal. This is required for prefix
search to find the first atom of the set.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Return a new atom in which every character of `a` is replaced by the
   upper part of its sort key (sort_point(c)>>8).

   a:     atom to canonise
   match: one of the STR_MATCH_* constants.  For STR_MATCH_LIKE only the
	  text before the first '*' is used.

   Returns (atom_t)0 if `a` has no text, or if `match` is STR_MATCH_LIKE
   and the pattern starts with '*' (there is no fixed prefix to search
   from).  Otherwise returns the atom created by PL_new_atom_wchars().

   Compared to a scan that runs to the terminating 0 character:
     - the scan is bounded by the atom length, so every character handed
       to PL_new_atom_wchars() has been written, also for atoms that
       contain 0 characters;
     - the scan stops at the FIRST '*', so a later '*' cannot move the
       truncation point further to the right;
     - a heap-allocated buffer is released on every exit path.
*/
atom_t
first_atom(atom_t a, int match)
{ text t;
  wchar_t buf[256];
  wchar_t *out = buf;
  size_t len, i;
  atom_t rc = (atom_t)0;

  if ( !get_atom_text(a, &t) )
    return (atom_t)0;			/* not a textual atom */

  len = t.length;
  if ( len > sizeof(buf)/sizeof(buf[0]) )
    out = PL_malloc(len*sizeof(wchar_t));

  for(i=0; i<t.length; i++)
  { wint_t c = fetch(&t, (int)i);

    if ( c == '*' && match == STR_MATCH_LIKE )
    { if ( i == 0 )			/* like '*...' */
	goto done;
      len = i;				/* only up to the first * */
      break;
    }
    out[i] = sort_point(c)>>8;
  }

  rc = PL_new_atom_wchars(len, out);

done:
  if ( out != buf )
    PL_free(out);

  return rc;
}
/*******************************
* MATCH *
*******************************/
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
With the introduction of wide characters there are two versions of the
match() function, one using char* and one using a structure and index to
fetch characters. Overall performance of the first function is about
twice as good as the general one and as most data will be handled by
this function in practice I think it is worthwhile to have two
implementations. Both implementations are very similar in design and
likely to have the same bugs. If you find one, please fix it in both
branches!
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Skip the remainder of the current run of alphanumeric characters and
   then the run of non-alphanumeric characters that follows it.  Returns
   a pointer to the start of the next word or to the terminating 0.
*/
static const charA *
nextwordA(const charA *s)
{ for( ; *s && iswalnum(*s); s++ )
    ;
  for( ; *s && !iswalnum(*s); s++ )
    ;

  return s;
}
/* Comparison key of a narrow character: the upper part of its sort key */
#define cmp_pointA(i) (sort_pointA(i)>>8)
/* Match the 0-terminated narrow search text `f` against the
   0-terminated narrow label `l`.  Characters are considered equal if
   their cmp_pointA() keys are equal.

   how: one of the STR_MATCH_* constants selecting the match type
   f:   text to find (a pattern for STR_MATCH_LIKE)
   l:   label to search in

   Returns TRUE on a match and FALSE otherwise.  An unknown `how` trips
   an assertion.  match_atoms() returns TRUE for an empty search text
   before calling this function.
*/
static int
matchA(int how, const charA *f, const charA *l)
{ switch(how)
{ case STR_MATCH_EXACT:
{ for( ; *l && *f; l++, f++ )
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
return FALSE;
}
/* both texts must end at the same position */
if ( *l == '\0' && *f == '\0' )
return TRUE;
return FALSE;
}
case STR_MATCH_PREFIX:
{ for( ; *l && *f; l++, f++ )
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
return FALSE;
}
/* the search text was consumed completely */
if ( *f == '\0' )
return TRUE;
return FALSE;
}
case STR_MATCH_SUBSTRING: /* naive scan; could use Boyer-Moore */
{ const charA *h;
const charA *f0 = f;
/* try to match f at each start position h of the label */
for(h=l; *h; h++)
{ for( l=h,f=f0; *l && *f; l++, f++ )
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
break;
}
if ( *f == '\0' )
return TRUE;
/* cannot be true here: the loop condition already tested *h */
if ( *h == '\0' )
return FALSE;
}
return FALSE;
}
case STR_MATCH_WORD:
{ const charA *h;
const charA *f0 = f;
/* try to match f at the label start and at each nextwordA() position */
for(h=l; *h; h = nextwordA(h))
{ for( l=h,f=f0; *l && *f; l++, f++ )
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
break;
}
if ( *f == '\0' )
{ /* the match must end at the label end or a non-alphanumeric char */
if ( *l == '\0' || !iswalnum(*l) )
return TRUE;
}
/* label exhausted: no later start can match */
if ( *l == '\0' )
return FALSE;
}
return FALSE;
}
case STR_MATCH_LIKE: /* SeRQL like: * --> wildcard */
{ typedef struct chp { const charA *pattern;
const charA *label; } chp;
/* stack of positions to resume from when a later part fails */
chp chps[MAX_LIKE_CHOICES];
int chn=0;
for( ; *l && *f; l++, f++ )
{ if ( *f == '*' )
{ f++;
if ( *f == '\0' ) /* foo* */
return TRUE;
/* also entered by goto from retry_like with f and l restored */
search_like:
/* find the next label char that matches the char after the '*' */
while ( *l && cmp_pointA(*l) != cmp_pointA(*f) )
l++;
if ( *l )
{ if ( chn >= MAX_LIKE_CHOICES )
{ Sdprintf("rdf_db: too many * in `like' expression (>%d)",
MAX_LIKE_CHOICES);
return FALSE;
}
/* remember where to continue searching if the rest fails */
chps[chn].pattern = f;
chps[chn].label = l+1;
chn++;
continue;
} else
goto retry_like;
}
if ( cmp_pointA(*l) != cmp_pointA(*f) )
goto retry_like;
}
/* label consumed; pattern consumed or only a trailing '*' is left */
if ( *l == '\0' && (*f == '\0' ||
(*f == '*' && f[1] == '\0')) )
return TRUE;
retry_like:
if ( chn > 0 )
{ chn--;
f = chps[chn].pattern;
l = chps[chn].label;
goto search_like;
}
return FALSE;
}
default:
assert(0);
return FALSE;
}
}
/* Index-based variant of nextwordA(): starting at index `i` of `txt`,
   skip the remainder of the current run of alphanumeric characters and
   then the following run of non-alphanumeric characters.  Returns the
   index of the next word start, or txt->length if there is none.
*/
static unsigned int
nextword(text *txt, unsigned int i)
{ for( ; i<txt->length && iswalnum(fetch(txt, i)); i++ )
    ;
  for( ; i<txt->length && !iswalnum(fetch(txt, i)); i++ )
    ;

  return i;
}
/* Comparison key of a character: the upper part of its sort key */
#define cmp_point(i) (sort_point(i)>>8)
/* Match the text of atom `search` against the text of atom `label`.
   Characters are considered equal if their cmp_point() keys are equal.

   how:    one of the STR_MATCH_* constants selecting the match type
   search: text to find (a pattern for STR_MATCH_LIKE)
   label:  text to search in

   Returns TRUE on a match.  Returns FALSE if there is no match or if
   either atom has no text.  An empty search text matches everything.
   If both texts are narrow the work is delegated to matchA(); the code
   below handles the remaining combinations through fetch().  An
   unknown `how` trips an assertion.
*/
int
match_atoms(int how, atom_t search, atom_t label)
{ text l, f;
if ( !get_atom_text(label, &l) ||
!get_atom_text(search, &f) )
return FALSE; /* error? */
if ( f.length == 0 )
return TRUE;
if ( f.a && l.a )
return matchA(how, f.a, l.a);
switch(how)
{ case STR_MATCH_EXACT:
{ if ( l.length == f.length )
{ unsigned int i;
for(i=0; i<l.length; i++ )
{ if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
return FALSE;
}
return TRUE;
}
return FALSE;
}
case STR_MATCH_PREFIX:
{ if ( f.length <= l.length )
{ unsigned int i;
for(i=0; i<f.length; i++ )
{ if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
return FALSE;
}
return TRUE;
}
return FALSE;
}
case STR_MATCH_SUBSTRING: /* naive scan; could use Boyer-Moore */
{ if ( f.length <= l.length )
{ unsigned int i, s;
/* try each start offset s at which f still fits in l */
for(s=0; s+f.length <= l.length; s++)
{ for(i=0; i<f.length; i++)
{ if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
goto snext;
}
return TRUE;
snext:;
}
}
return FALSE;
}
case STR_MATCH_WORD:
{ if ( f.length <= l.length )
{ unsigned int i, s;
/* try the label start and each nextword() position where f still fits */
for(s=0; s+f.length <= l.length; s = nextword(&l, s))
{ for(i=0; i<f.length; i++)
{ if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
goto wnext;
}
/* the match must end at the label end or a non-alphanumeric char */
if ( i+s == l.length || !iswalnum(fetch(&l,i+s)) )
return TRUE;
wnext:;
}
}
return FALSE;
}
case STR_MATCH_LIKE: /* SeRQL like: * --> wildcard */
{ unsigned int ip, il;
typedef struct chp { unsigned int ip;
unsigned int il;
} chp;
/* stack of positions to resume from when a later part fails */
chp chps[MAX_LIKE_CHOICES];
int chn=0;
for(ip=il=0; il < l.length && ip < f.length; ip++, il++ )
{ if ( fetch(&f, ip) == '*' )
{ ip++;
if ( ip == f.length ) /* foo* */
return TRUE;
/* also entered by goto from retry_like with ip and il restored */
search_like:
/* find the next label char that matches the char after the '*' */
while ( il < l.length &&
cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
il++;
if ( il < l.length )
{ if ( chn >= MAX_LIKE_CHOICES )
{ Sdprintf("rdf_db: too many * in `like' expression (>%d)",
MAX_LIKE_CHOICES);
return FALSE;
}
/* remember where to continue searching if the rest fails */
chps[chn].ip = ip;
chps[chn].il = il+1;
chn++;
continue;
} else
goto retry_like;
}
if ( cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
goto retry_like;
}
/* label consumed; pattern consumed or only a trailing '*' is left */
if ( il == l.length && (ip == f.length ||
(fetch(&f,ip) == '*' && ip+1 == f.length)) )
return TRUE;
retry_like:
if ( chn > 0 )
{ chn--;
ip = chps[chn].ip;
il = chps[chn].il;
goto search_like;
}
return FALSE;
}
default:
assert(0);
return FALSE;
}
}
/*******************************
* LANGUAGE MATCH *
*******************************/
/* A saved backtrack position, recorded by create_chp() and consumed by
   next_choice().
*/
typedef struct lang_choice
{ int langp; /* points after - */
int patp; /* points after *- */
} lang_choice;
#define MAX_CHOICES 10 /* Max number of stars */
/* State of one atom_lang_matches() run:
   il, ip:        current index into the language text `l` and the
                  pattern text `p`
   choicepoints:  stack of saved backtrack positions
   choice_count:  number of entries in use on that stack
*/
typedef struct
{ int il, ip;
text l, p;
lang_choice choicepoints[MAX_CHOICES];
int choice_count;
} lang_state;
/* Push a choice point for the wildcard at the current pattern position.
   The saved language position is the current one; the saved pattern
   position is two characters further (past the wildcard and the
   character following it).  Returns FALSE if the choice point stack
   already holds MAX_CHOICES entries, TRUE otherwise.
*/
static int
create_chp(lang_state *s)
{ lang_choice *slot;

  if ( s->choice_count >= MAX_CHOICES )
    return FALSE;

  slot = &s->choicepoints[s->choice_count++];
  slot->langp = s->il;
  slot->patp  = s->ip+2;

  return TRUE;
}
/* Backtrack to the most recent choice point that still has an
   alternative.  For the top choice point the language text is scanned
   from its saved position for the next '-'; if one is found, matching
   resumes just after it (which also becomes the new saved position) and
   at the saved pattern position.  A choice point without a further '-'
   is popped and the one below it is tried.  Returns TRUE if the state
   was repositioned and FALSE if no alternatives are left.
*/
static int
next_choice(lang_state *s)
{ while ( s->choice_count > 0 )
  { lang_choice *top = &s->choicepoints[s->choice_count-1];
    int pos;

    for(pos = top->langp; pos < s->l.length; pos++)
    { if ( fetch(&s->l, pos) == '-' )
      { top->langp = s->il = pos+1;
	s->ip = top->patp;
	return TRUE;
      }
    }

    s->choice_count--;			/* exhausted: pop it */
  }

  return FALSE;
}
/* Atoms '' and '*', created on the first call to atom_lang_matches() */
static atom_t ATOM_;
static atom_t ATOM_star;

/* Test whether language tag `lang` matches language pattern `pattern`.

   Rules, in the order they are applied:
     - identical atoms match;
     - the empty language matches nothing else;
     - the pattern '*' matches every non-empty language;
     - otherwise both texts are walked in parallel.  Characters match if
       they are identical or have the same upper sort key part
       (sort_point(c)>>8).  Reaching the end of the pattern is a match,
       so a pattern matches any language it is a prefix of.  A '*' that
       ends the pattern matches the rest of the language.  A '*' that
       starts the pattern or follows '-' and is itself followed by '-'
       creates a choice point: matching resumes after each subsequent
       '-' in the language until one alternative succeeds.

   Returns TRUE on a match.  Returns FALSE on no match, if either atom
   has no text, or if more than MAX_CHOICES choice points are needed.

   next_choice() leaves s.il and s.ip ON the first character that still
   has to be compared, so the loop only advances the indices after a
   successful character comparison and re-validates them after every
   backtrack.  Advancing unconditionally after a backtrack would skip
   one character on both sides (making e.g. '*-gb' accept 'en-xb') and
   could step s.ip past the end of the pattern.
*/
int
atom_lang_matches(atom_t lang, atom_t pattern)
{ lang_state s = {0};
  int cl, cp;

  if ( lang == pattern )		/* exact match */
    return TRUE;

  if ( !ATOM_ )
  { ATOM_ = PL_new_atom("");
    ATOM_star = PL_new_atom("*");
  }

  if ( lang == ATOM_ )			/* no language */
    return FALSE;
  if ( pattern == ATOM_star )		/* Everything matches "*" */
    return TRUE;

  if ( !get_atom_text(lang, &s.l) ||
       !get_atom_text(pattern, &s.p) )
    return FALSE;			/* exception? */

  s.il=0; s.ip=0;
  for(;;)
  { if ( (size_t)s.ip == s.p.length )	/* pattern consumed */
      return TRUE;
    if ( (size_t)s.il == s.l.length )	/* language consumed */
    { if ( fetch(&s.p, s.ip) == '*' )
	return TRUE;
      if ( !next_choice(&s) )
	return FALSE;
      continue;				/* re-validate restored positions */
    }

    cl = fetch(&s.l, s.il);
    cp = fetch(&s.p, s.ip);
    if ( cl == cp ||
	 (sort_point(cl)>>8) == (sort_point(cp)>>8) )
    { s.ip++;
      s.il++;
      continue;
    }

    if ( cp == '*' )
    { if ( (size_t)s.ip+1 == s.p.length )
	return TRUE;
      if ( (s.ip == 0 || fetch(&s.p, s.ip-1) == '-') &&
	   fetch(&s.p, s.ip+1) == '-' )
      { if ( !create_chp(&s) )
	  return FALSE;
      }
    }

    if ( !next_choice(&s) )
      return FALSE;
					/* resume at restored positions */
  }
}