732 lines
14 KiB
C
732 lines
14 KiB
C
/* $Id$
|
|
|
|
Part of the SWI-Prolog Semweb package
|
|
|
|
Author: Jan Wielemaker
|
|
E-mail: wielemak@science.uva.nl
|
|
WWW: http://www.swi-prolog.org
|
|
Copyright (C): 2006, University of Amsterdam
|
|
|
|
This program is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU General Public License
|
|
as published by the Free Software Foundation; either version 2
|
|
of the License, or (at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with this library; if not, write to the Free Software
|
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include <config.h>
|
|
#endif
|
|
|
|
#include <SWI-Stream.h>
|
|
#include <SWI-Prolog.h>
|
|
#include "atom.h"
|
|
#include "murmur.h"
|
|
#include <wchar.h>
|
|
#include <wctype.h>
|
|
#include <assert.h>
|
|
|
|
#ifdef __WINDOWS__
|
|
#define inline __inline
|
|
#endif
|
|
|
|
#include "unicode_map.c"
|
|
|
|
|
|
/*******************************
|
|
* TEXT HANDLING *
|
|
*******************************/
|
|
|
|
static inline int
|
|
get_atom_text(atom_t atom, text *txt)
|
|
{ if ( (txt->a = (const charA*)PL_atom_nchars(atom, &txt->length)) )
|
|
{ txt->w = NULL;
|
|
return TRUE;
|
|
}
|
|
if ( (txt->w = (const charW*)PL_atom_wchars(atom, &txt->length)) )
|
|
{ txt->a = NULL;
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
inline wint_t
|
|
fetch(const text *txt, int i)
|
|
{ return txt->a ? (wint_t)txt->a[i] : (wint_t)txt->w[i];
|
|
}
|
|
|
|
|
|
static int
|
|
fill_atom_info(atom_info *info)
|
|
{ if ( !info->resolved )
|
|
{ info->resolved = TRUE;
|
|
|
|
if ( !(info->rc=get_atom_text(info->handle, &info->text)) )
|
|
{ info->text.a = NULL;
|
|
info->text.w = NULL;
|
|
}
|
|
}
|
|
|
|
return info->rc;
|
|
}
|
|
|
|
|
|
/*******************************
|
|
* COMPARE *
|
|
*******************************/
|
|
|
|
/* Compare two char codes using sort_pointA().  The return value is the
   difference of the high parts (key>>8) of the two sort keys.  If the
   high parts are equal and *dl2 is still 0, the difference of the low
   8 bits is stored in *dl2 so the caller can use it as a tie-breaker.
   Identical codes compare equal without consulting the table. */

static inline int
cmpA(int c1, int c2, int *dl2)
{ int key1, key2, primary;

  if ( c1 == c2 )
    return 0;

  key1 = sort_pointA(c1);
  key2 = sort_pointA(c2);
  primary = (key1>>8) - (key2>>8);

  if ( primary == 0 && *dl2 == 0 )
    *dl2 = (key1&0xff) - (key2&0xff);

  return primary;
}
|
|
|
|
|
|
/* Wide character counterpart of cmpA(): compare two character codes
   using sort_point().  Returns the difference of the high parts
   (key>>8) of the sort keys; if these are equal and *dl2 is still 0,
   the difference of the low 8 bits is recorded in *dl2. */

static inline int
cmpW(int c1, int c2, int *dl2)
{ int key1, key2, primary;

  if ( c1 == c2 )
    return 0;

  key1 = sort_point(c1);
  key2 = sort_point(c2);
  primary = (key1>>8) - (key2>>8);

  if ( primary == 0 && *dl2 == 0 )
    *dl2 = (key1&0xff) - (key2&0xff);

  return primary;
}
|
|
|
|
|
|
int
|
|
cmp_atom_info(atom_info *info, atom_t a2)
|
|
{ text t2;
|
|
int i;
|
|
int dl2 = 0;
|
|
size_t n;
|
|
|
|
if ( info->handle == a2 )
|
|
return 0;
|
|
|
|
if ( !fill_atom_info(info) ||
|
|
!get_atom_text(a2, &t2) )
|
|
{ goto cmphandles; /* non-text atoms? */
|
|
}
|
|
|
|
if ( info->text.a && t2.a )
|
|
{ const charA *s1 = info->text.a;
|
|
const charA *s2 = t2.a;
|
|
int d;
|
|
|
|
while((d=cmpA(*s1, *s2, &dl2)) == 0)
|
|
{ if ( *s1 == 0 )
|
|
goto eq;
|
|
s1++, s2++;
|
|
}
|
|
return d;
|
|
}
|
|
|
|
n = (info->text.length < t2.length ? info->text.length : t2.length);
|
|
|
|
if ( info->text.w && t2.w )
|
|
{ const charW *s1 = info->text.w;
|
|
const charW *s2 = t2.w;
|
|
|
|
for(;;s1++, s2++)
|
|
{ if ( n-- == 0 )
|
|
{ if ( info->text.length == t2.length )
|
|
goto eq;
|
|
|
|
return info->text.length < t2.length ? -1 : 1;
|
|
} else
|
|
{ int d;
|
|
|
|
if ( (d=cmpW(*s1, *s2, &dl2)) != 0 )
|
|
return d;
|
|
}
|
|
}
|
|
}
|
|
|
|
for(i=0; ; i++)
|
|
{ if ( n-- == 0 )
|
|
{ if ( info->text.length == t2.length )
|
|
goto eq;
|
|
|
|
return info->text.length < t2.length ? -1 : 1;
|
|
} else
|
|
{ wint_t c1 = fetch(&info->text, i);
|
|
wint_t c2 = fetch(&t2, i);
|
|
int d;
|
|
|
|
if ( (d=cmpW(c1, c2, &dl2)) != 0 )
|
|
return d;
|
|
}
|
|
}
|
|
|
|
eq:
|
|
if ( dl2 )
|
|
return dl2;
|
|
|
|
cmphandles:
|
|
return info->handle < a2 ? -1 : 1; /* == already covered */
|
|
}
|
|
|
|
|
|
int
|
|
cmp_atoms(atom_t a1, atom_t a2)
|
|
{ atom_info info = {0};
|
|
|
|
if ( a1 == a2 )
|
|
return 0;
|
|
|
|
info.handle = a1;
|
|
|
|
return cmp_atom_info(&info, a2);
|
|
}
|
|
|
|
|
|
/*******************************
|
|
* HASH *
|
|
*******************************/
|
|
|
|
static unsigned int
|
|
string_hashA(const char *s, size_t len)
|
|
{ const unsigned char *t = (const unsigned char *)s;
|
|
unsigned int hash = 0;
|
|
|
|
while( len>0 )
|
|
{ unsigned char buf[256];
|
|
unsigned char *o = buf-1;
|
|
int cp = len > 256 ? 256 : (int)len;
|
|
const unsigned char *e = t+cp;
|
|
|
|
t--;
|
|
while(++t<e)
|
|
*++o = sort_pointA(*t)>>8;
|
|
hash ^= rdf_murmer_hash(buf, cp, MURMUR_SEED);
|
|
|
|
len -= cp;
|
|
}
|
|
|
|
return hash;
|
|
}
|
|
|
|
|
|
static unsigned int
|
|
string_hashW(const wchar_t *t, size_t len)
|
|
{ unsigned int hash = 0;
|
|
|
|
while( len>0 )
|
|
{ unsigned short buf[256];
|
|
unsigned short *o = buf;
|
|
int cp = len > 256 ? 256 : (int)len;
|
|
const wchar_t *e = t+cp;
|
|
|
|
while(t<e)
|
|
*o++ = (short)(sort_point(*t++)>>8);
|
|
hash ^= rdf_murmer_hash(buf, cp*sizeof(short), MURMUR_SEED);
|
|
|
|
len -= cp;
|
|
}
|
|
|
|
return hash;
|
|
}
|
|
|
|
|
|
unsigned int
|
|
atom_hash_case(atom_t a)
|
|
{ const char *s;
|
|
const wchar_t *w;
|
|
size_t len;
|
|
|
|
if ( (s = PL_atom_nchars(a, &len)) )
|
|
return string_hashA(s, len);
|
|
else if ( (w = PL_atom_wchars(a, &len)) )
|
|
return string_hashW(w, len);
|
|
else
|
|
{ assert(0);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
|
|
/*******************************
|
|
* FIND FIRST *
|
|
*******************************/
|
|
|
|
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
Given an atom, return a new one that has all its characters modified
|
|
such that it appears first in the set of atoms considered equal after
|
|
case canonisation and diacritics removal. This is required for prefix
|
|
search to find the first atom of the set.
|
|
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
|
|
|
atom_t
|
|
first_atom(atom_t a, int match)
|
|
{ text t;
|
|
|
|
if ( !get_atom_text(a, &t) )
|
|
{ return (atom_t)0; /* not a textual atom */
|
|
} else
|
|
{ size_t len = t.length;
|
|
wchar_t buf[256];
|
|
wchar_t *out, *s;
|
|
int i;
|
|
wint_t c;
|
|
atom_t rc;
|
|
|
|
if ( len <= 256 )
|
|
out = buf;
|
|
else
|
|
out = PL_malloc(len*sizeof(wchar_t));
|
|
|
|
for(s=out,i=0; (c=fetch(&t,i)); s++,i++)
|
|
{ if ( c == '*' && match == STR_MATCH_LIKE )
|
|
{ if ( i == 0 ) /* like '*...' */
|
|
return (atom_t)0;
|
|
len = i; /* only up to the first * */
|
|
}
|
|
*s = sort_point(c)>>8;
|
|
}
|
|
|
|
rc = PL_new_atom_wchars(len, out);
|
|
|
|
if ( out != buf )
|
|
PL_free(out);
|
|
|
|
return rc;
|
|
}
|
|
}
|
|
|
|
/*******************************
|
|
* MATCH *
|
|
*******************************/
|
|
|
|
/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
|
|
With the introduction of wide characters there are two versions of the
|
|
match() function, one using char* and one using a structure and index to
|
|
fetch characters. Overall performance of the first function is about
|
|
twice as good as the general one and as most data will be handled by
|
|
this function in practice I think it is worthwhile to have two
|
|
implementations. Both implementations are very similar in design and
|
|
likely to have the same bugs. If you find one, please fix it in both
|
|
branches!
|
|
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
|
|
|
|
/* Advance `s` to the start of the next word: skip the remaining
   alphanumeric characters of the current word and then all following
   non-alphanumeric characters.  Stops at the terminating 0. */

static const charA *
nextwordA(const charA *s)
{ for( ; *s && iswalnum(*s); s++ )
    ;
  for( ; *s && !iswalnum(*s); s++ )
    ;

  return s;
}
|
|
|
|
|
|
/* High part (key>>8) of the sort key of char code `i`.  matchA()
   treats two characters as equal if these values are equal. */
#define cmp_pointA(i) (sort_pointA(i)>>8)
|
|
|
|
|
|
static int
|
|
matchA(int how, const charA *f, const charA *l)
|
|
{ switch(how)
|
|
{ case STR_MATCH_EXACT:
|
|
{ for( ; *l && *f; l++, f++ )
|
|
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
|
|
return FALSE;
|
|
}
|
|
if ( *l == '\0' && *f == '\0' )
|
|
return TRUE;
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_PREFIX:
|
|
{ for( ; *l && *f; l++, f++ )
|
|
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
|
|
return FALSE;
|
|
}
|
|
if ( *f == '\0' )
|
|
return TRUE;
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_SUBSTRING: /* use Boyle-More! */
|
|
{ const charA *h;
|
|
const charA *f0 = f;
|
|
|
|
for(h=l; *h; h++)
|
|
{ for( l=h,f=f0; *l && *f; l++, f++ )
|
|
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
|
|
break;
|
|
}
|
|
if ( *f == '\0' )
|
|
return TRUE;
|
|
if ( *h == '\0' )
|
|
return FALSE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_WORD:
|
|
{ const charA *h;
|
|
const charA *f0 = f;
|
|
|
|
for(h=l; *h; h = nextwordA(h))
|
|
{ for( l=h,f=f0; *l && *f; l++, f++ )
|
|
{ if ( cmp_pointA(*l) != cmp_pointA(*f) )
|
|
break;
|
|
}
|
|
if ( *f == '\0' )
|
|
{ if ( *l == '\0' || !iswalnum(*l) )
|
|
return TRUE;
|
|
}
|
|
if ( *l == '\0' )
|
|
return FALSE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_LIKE: /* SeRQL like: * --> wildcart */
|
|
{ typedef struct chp { const charA *pattern;
|
|
const charA *label; } chp;
|
|
chp chps[MAX_LIKE_CHOICES];
|
|
int chn=0;
|
|
|
|
for( ; *l && *f; l++, f++ )
|
|
{ if ( *f == '*' )
|
|
{ f++;
|
|
|
|
if ( *f == '\0' ) /* foo* */
|
|
return TRUE;
|
|
|
|
search_like:
|
|
while ( *l && cmp_pointA(*l) != cmp_pointA(*f) )
|
|
l++;
|
|
|
|
if ( *l )
|
|
{ if ( chn >= MAX_LIKE_CHOICES )
|
|
{ Sdprintf("rdf_db: too many * in `like' expression (>%d)",
|
|
MAX_LIKE_CHOICES);
|
|
return FALSE;
|
|
}
|
|
chps[chn].pattern = f;
|
|
chps[chn].label = l+1;
|
|
chn++;
|
|
|
|
continue;
|
|
} else
|
|
goto retry_like;
|
|
}
|
|
|
|
if ( cmp_pointA(*l) != cmp_pointA(*f) )
|
|
goto retry_like;
|
|
}
|
|
if ( *l == '\0' && (*f == '\0' ||
|
|
(*f == '*' && f[1] == '\0')) )
|
|
return TRUE;
|
|
|
|
retry_like:
|
|
if ( chn > 0 )
|
|
{ chn--;
|
|
f = chps[chn].pattern;
|
|
l = chps[chn].label;
|
|
goto search_like;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
default:
|
|
assert(0);
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
|
|
static unsigned int
|
|
nextword(text *txt, unsigned int i)
|
|
{ while(i<txt->length && iswalnum(fetch(txt, i)))
|
|
i++;
|
|
while(i<txt->length && !iswalnum(fetch(txt, i)))
|
|
i++;
|
|
|
|
return i;
|
|
}
|
|
|
|
|
|
/* High part (key>>8) of the sort key of character code `i`.
   match_atoms() treats two characters as equal if these values are
   equal. */
#define cmp_point(i) (sort_point(i)>>8)
|
|
|
|
|
|
int
|
|
match_atoms(int how, atom_t search, atom_t label)
|
|
{ text l, f;
|
|
|
|
if ( !get_atom_text(label, &l) ||
|
|
!get_atom_text(search, &f) )
|
|
return FALSE; /* error? */
|
|
|
|
if ( f.length == 0 )
|
|
return TRUE;
|
|
|
|
if ( f.a && l.a )
|
|
return matchA(how, f.a, l.a);
|
|
|
|
switch(how)
|
|
{ case STR_MATCH_EXACT:
|
|
{ if ( l.length == f.length )
|
|
{ unsigned int i;
|
|
|
|
for(i=0; i<l.length; i++ )
|
|
{ if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_PREFIX:
|
|
{ if ( f.length <= l.length )
|
|
{ unsigned int i;
|
|
|
|
for(i=0; i<f.length; i++ )
|
|
{ if ( cmp_point(fetch(&l, i)) != cmp_point(fetch(&f, i)) )
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_SUBSTRING: /* use Boyle-More! */
|
|
{ if ( f.length <= l.length )
|
|
{ unsigned int i, s;
|
|
|
|
for(s=0; s+f.length <= l.length; s++)
|
|
{ for(i=0; i<f.length; i++)
|
|
{ if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
|
|
goto snext;
|
|
}
|
|
return TRUE;
|
|
|
|
snext:;
|
|
}
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_WORD:
|
|
{ if ( f.length <= l.length )
|
|
{ unsigned int i, s;
|
|
|
|
for(s=0; s+f.length <= l.length; s = nextword(&l, s))
|
|
{ for(i=0; i<f.length; i++)
|
|
{ if ( cmp_point(fetch(&l, i+s)) != cmp_point(fetch(&f, i)) )
|
|
goto wnext;
|
|
}
|
|
if ( i+s == l.length || !iswalnum(fetch(&l,i+s)) )
|
|
return TRUE;
|
|
|
|
wnext:;
|
|
}
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
case STR_MATCH_LIKE: /* SeRQL like: * --> wildcart */
|
|
{ unsigned int ip, il;
|
|
typedef struct chp { unsigned int ip;
|
|
unsigned int il;
|
|
} chp;
|
|
chp chps[MAX_LIKE_CHOICES];
|
|
int chn=0;
|
|
|
|
for(ip=il=0; il < l.length && ip < f.length; ip++, il++ )
|
|
{ if ( fetch(&f, ip) == '*' )
|
|
{ ip++;
|
|
|
|
if ( ip == f.length ) /* foo* */
|
|
return TRUE;
|
|
|
|
search_like:
|
|
while ( il < l.length &&
|
|
cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
|
|
il++;
|
|
|
|
if ( il < l.length )
|
|
{ if ( chn >= MAX_LIKE_CHOICES )
|
|
{ Sdprintf("rdf_db: too many * in `like' expression (>%d)",
|
|
MAX_LIKE_CHOICES);
|
|
return FALSE;
|
|
}
|
|
chps[chn].ip = ip;
|
|
chps[chn].il = il+1;
|
|
chn++;
|
|
|
|
continue;
|
|
} else
|
|
goto retry_like;
|
|
}
|
|
|
|
if ( cmp_point(fetch(&l, il)) != cmp_point(fetch(&f, ip)) )
|
|
goto retry_like;
|
|
}
|
|
if ( il == l.length && (ip == f.length ||
|
|
(fetch(&f,ip) == '*' && ip+1 == f.length)) )
|
|
return TRUE;
|
|
|
|
retry_like:
|
|
if ( chn > 0 )
|
|
{ chn--;
|
|
ip = chps[chn].ip;
|
|
il = chps[chn].il;
|
|
goto search_like;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
default:
|
|
assert(0);
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
|
|
/*******************************
|
|
* LANGUAGE MATCH *
|
|
*******************************/
|
|
|
|
/* Choice point for language matching: where to resume in the language
   text and in the pattern after a "*-" in the pattern. */

typedef struct lang_choice
{ int langp;				/* points after - */
  int patp;				/* points after *- */
} lang_choice;
|
|
|
|
/* Size of the choice point stack in lang_state; create_chp() fails
   once this many choice points are in use. */
#define MAX_CHOICES 10 /* Max number of stars */
|
|
|
|
typedef struct
|
|
{ int il, ip;
|
|
text l, p;
|
|
lang_choice choicepoints[MAX_CHOICES];
|
|
int choice_count;
|
|
} lang_state;
|
|
|
|
|
|
static int
|
|
create_chp(lang_state *s)
|
|
{ if ( s->choice_count < MAX_CHOICES )
|
|
{ lang_choice *cp = &s->choicepoints[s->choice_count];
|
|
|
|
cp->langp = s->il;
|
|
cp->patp = s->ip+2;
|
|
s->choice_count++;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
static int
|
|
next_choice(lang_state *s)
|
|
{ for ( ; s->choice_count > 0; s->choice_count-- )
|
|
{ lang_choice *cp = &s->choicepoints[s->choice_count-1];
|
|
int il = cp->langp;
|
|
|
|
for(; il<s->l.length; il++)
|
|
{ if ( fetch(&s->l, il) == '-' )
|
|
{ cp->langp = s->il = il+1;
|
|
s->ip = cp->patp;
|
|
return TRUE;
|
|
}
|
|
}
|
|
}
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
|
|
/* Atoms for "" and "*", created by atom_lang_matches() on first use.
   A zero ATOM_ means they have not been created yet. */
static atom_t ATOM_ = 0;
static atom_t ATOM_star = 0;
|
|
|
|
int
|
|
atom_lang_matches(atom_t lang, atom_t pattern)
|
|
{ lang_state s = {0};
|
|
int cl, cp;
|
|
|
|
if ( lang == pattern ) /* exact match */
|
|
return TRUE;
|
|
|
|
if ( !ATOM_ )
|
|
{ ATOM_ = PL_new_atom("");
|
|
ATOM_star = PL_new_atom("*");
|
|
}
|
|
|
|
if ( lang == ATOM_ ) /* no language */
|
|
return FALSE;
|
|
if ( pattern == ATOM_star ) /* Everything matches "*" */
|
|
return TRUE;
|
|
|
|
if ( !get_atom_text(lang, &s.l) ||
|
|
!get_atom_text(pattern, &s.p) )
|
|
return FALSE; /* exception? */
|
|
|
|
s.il=0; s.ip=0;
|
|
for(;; s.ip++, s.il++)
|
|
{ if ( s.ip == s.p.length )
|
|
return TRUE;
|
|
if ( s.il == s.l.length )
|
|
{ if ( fetch(&s.p, s.ip) == '*' )
|
|
return TRUE;
|
|
if ( !next_choice(&s) )
|
|
return FALSE;
|
|
}
|
|
|
|
cl = fetch(&s.l, s.il);
|
|
cp = fetch(&s.p, s.ip);
|
|
if ( cl == cp )
|
|
continue;
|
|
if ( sort_point(cl)>>8 == sort_point(cp)>>8 )
|
|
continue;
|
|
|
|
if ( cp == '*' )
|
|
{ if ( s.ip+1 == s.p.length )
|
|
return TRUE;
|
|
if ( (s.ip == 0 || fetch(&s.p, s.ip-1) == '-') &&
|
|
fetch(&s.p, s.ip+1) == '-' )
|
|
{ if ( !create_chp(&s) )
|
|
return FALSE;
|
|
}
|
|
}
|
|
|
|
if ( !next_choice(&s) )
|
|
return FALSE;
|
|
}
|
|
}
|