1599 lines
36 KiB
C
1599 lines
36 KiB
C
|
/* $Id$
|
||
|
|
||
|
Part of SWI-Prolog
|
||
|
|
||
|
Author: Jan Wielemaker
|
||
|
E-mail: J.Wielemaker@cs.vu.nl
|
||
|
WWW: http://www.swi-prolog.org
|
||
|
Copyright (C): 2009, VU University Amsterdam
|
||
|
|
||
|
This library is free software; you can redistribute it and/or
|
||
|
modify it under the terms of the GNU Lesser General Public
|
||
|
License as published by the Free Software Foundation; either
|
||
|
version 2.1 of the License, or (at your option) any later version.
|
||
|
|
||
|
This library is distributed in the hope that it will be useful,
|
||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
|
Lesser General Public License for more details.
|
||
|
|
||
|
You should have received a copy of the GNU Lesser General Public
|
||
|
License along with this library; if not, write to the Free Software
|
||
|
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||
|
*/
|
||
|
|
||
|
#ifdef HAVE_CONFIG_H
|
||
|
#include <config.h>
|
||
|
#endif
|
||
|
#ifdef __WINDOWS__
|
||
|
#define inline __inline
|
||
|
#endif
|
||
|
|
||
|
#include <SWI-Prolog.h>
|
||
|
#include <string.h>
|
||
|
#include <stdio.h>
|
||
|
#include <wchar.h>
|
||
|
#include <wctype.h>
|
||
|
#include <assert.h>
|
||
|
|
||
|
static size_t removed_dot_segments(size_t len, const pl_wchar_t *in,
|
||
|
pl_wchar_t *out);
|
||
|
static pl_wchar_t *remove_last_segment(const pl_wchar_t *base,
|
||
|
const pl_wchar_t *o);
|
||
|
static char *_utf8_put_char(char *out, int chr);
|
||
|
|
||
|
/* ISUTF8_MB(c) is true when c, taken as an unsigned value, lies in
   0xc0..0xfd.  get_encoded_utf8() uses it to recognise the first byte
   of a multi-byte sequence.

   utf8_put_char(out, chr) stores chr at out and evaluates to the
   position just after what was written.  Values below 0x80 are stored
   as a single char; anything else is handed to _utf8_put_char().

   Both macros evaluate their arguments more than once.
*/

#define ISUTF8_MB(c) ((unsigned)(c) >= 0xc0 && (unsigned)(c) <= 0xfd)

#define utf8_put_char(out, chr) \
        ( (chr) < 0x80 ? ((out)[0] = (char)(chr), (out)+1) \
                       : _utf8_put_char((out), (chr)) )
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* ERRORS *
|
||
|
*******************************/
|
||
|
|
||
|
static atom_t ATOM_query_value;
|
||
|
static atom_t ATOM_fragment;
|
||
|
static atom_t ATOM_path;
|
||
|
|
||
|
static functor_t FUNCTOR_equal2; /* =/2 */
|
||
|
static functor_t FUNCTOR_pair2; /* -/2 */
|
||
|
static functor_t FUNCTOR_uri_components5;
|
||
|
static functor_t FUNCTOR_uri_authority4;
|
||
|
static functor_t FUNCTOR_error2;
|
||
|
static functor_t FUNCTOR_syntax_error1;
|
||
|
static functor_t FUNCTOR_type_error2;
|
||
|
static functor_t FUNCTOR_domain_error2;
|
||
|
|
||
|
|
||
|
static int
|
||
|
syntax_error(const char *culprit)
|
||
|
{ term_t ex;
|
||
|
|
||
|
if ( (ex=PL_new_term_ref()) &&
|
||
|
PL_unify_term(ex,
|
||
|
PL_FUNCTOR, FUNCTOR_error2,
|
||
|
PL_FUNCTOR, FUNCTOR_syntax_error1,
|
||
|
PL_CHARS, culprit,
|
||
|
PL_VARIABLE) )
|
||
|
return PL_raise_exception(ex);
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
type_error(const char *expected, term_t found)
|
||
|
{ term_t ex;
|
||
|
|
||
|
if ( (ex=PL_new_term_ref()) &&
|
||
|
PL_unify_term(ex,
|
||
|
PL_FUNCTOR, FUNCTOR_error2,
|
||
|
PL_FUNCTOR, FUNCTOR_type_error2,
|
||
|
PL_CHARS, expected,
|
||
|
PL_TERM, found,
|
||
|
PL_VARIABLE) )
|
||
|
return PL_raise_exception(ex);
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
domain_error(const char *expected, term_t found)
|
||
|
{ term_t ex;
|
||
|
|
||
|
if ( (ex=PL_new_term_ref()) &&
|
||
|
PL_unify_term(ex,
|
||
|
PL_FUNCTOR, FUNCTOR_error2,
|
||
|
PL_FUNCTOR, FUNCTOR_domain_error2,
|
||
|
PL_CHARS, expected,
|
||
|
PL_TERM, found,
|
||
|
PL_VARIABLE) )
|
||
|
return PL_raise_exception(ex);
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* ESCAPING *
|
||
|
*******************************/
|
||
|
|
||
|
/* Character-class bits.  fill_flags() stores an OR of these in
   charflags[] for each ASCII character.
*/

#define CH_ALPHA        0x0001
#define CH_DIGIT        0x0002
#define CH_EX_UNRES     0x0004
#define CH_GENDELIM     0x0008
#define CH_SUBDELIM     0x0010
#define CH_URL          0x0020
#define CH_EX_PCHAR     0x0040
#define CH_EX_QF        0x0080  /* Extra query and fragment chars */
#define CH_EX_SCHEME    0x0100
#define CH_QSUBDELIM    0x0200
#define CH_EX_PATH      0x0400

/* Composite classes */

#define CH_SCHEME       (CH_ALPHA|CH_DIGIT|CH_EX_SCHEME)
#define CH_UNRESERVED   (CH_ALPHA|CH_DIGIT|CH_EX_UNRES)
#define CH_PCHAR        (CH_UNRESERVED|CH_SUBDELIM|CH_EX_PCHAR)

/* Per-component masks: the classes of characters that may appear
   unescaped in that component.
*/

#define ESC_SCHEME      (CH_SCHEME)
#define ESC_AUTH        (CH_PCHAR)
#define ESC_USER        (CH_PCHAR)
#define ESC_PASSWD      (CH_PCHAR)
#define ESC_HOST        (CH_UNRESERVED|CH_SUBDELIM)
#define ESC_PORT        (CH_DIGIT)
#define ESC_PATH        (CH_PCHAR|CH_EX_PATH)
#define ESC_QUERY       (CH_PCHAR|CH_EX_QF)
#define ESC_QNAME       (CH_PCHAR)
#define ESC_QVALUE      (CH_UNRESERVED|CH_QSUBDELIM|CH_EX_PCHAR|CH_EX_QF)
#define ESC_FRAGMENT    (CH_PCHAR|CH_EX_QF)
|
||
|
|
||
|
/* flags_done becomes non-zero once fill_flags() has populated
   charflags[], which holds the CH_* class bits for character codes
   0..127.
*/

static int flags_done = 0;
static int charflags[128] = {0};
|
||
|
|
||
|
static void
|
||
|
set_flags(const char *from, int flag)
|
||
|
{ for(; *from; from++)
|
||
|
charflags[from[0]&0xff] |= flag;
|
||
|
}
|
||
|
|
||
|
static void
|
||
|
fill_flags()
|
||
|
{ if ( !flags_done )
|
||
|
{ int c;
|
||
|
|
||
|
for(c='a'; c<='z'; c++)
|
||
|
charflags[c] |= CH_ALPHA;
|
||
|
for(c='A'; c<='Z'; c++)
|
||
|
charflags[c] |= CH_ALPHA;
|
||
|
for(c='0'; c<='9'; c++)
|
||
|
charflags[c] |= CH_DIGIT;
|
||
|
|
||
|
set_flags("-._~", CH_EX_UNRES);
|
||
|
set_flags(":/?#[]@", CH_GENDELIM);
|
||
|
set_flags("!$&'()+*,;=", CH_SUBDELIM);
|
||
|
set_flags("!$'()*,;", CH_QSUBDELIM); /* = CH_SUBDELIM - "&=+" */
|
||
|
set_flags(":@", CH_EX_PCHAR);
|
||
|
set_flags("/", CH_EX_PATH);
|
||
|
set_flags("/?", CH_EX_QF);
|
||
|
set_flags("+-.", CH_EX_SCHEME);
|
||
|
|
||
|
set_flags("/:?#&=", CH_URL);
|
||
|
|
||
|
flags_done = TRUE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* no_escape(c, f)     is true if c is ASCII (0..127) and charflags[c]
                       has one of the bits in f set.
   iri_no_escape(c, f) is additionally true for every c outside 0..127.

   c is compared as an unsigned value so that neither 128 nor a negative
   value can index outside the 128-entry charflags[] table.  The
   original tested `c > 128', which let c == 128 read charflags[128].
   Both macros evaluate c more than once.
*/

#define no_escape(c, f) \
        ((unsigned)(c) <  128 && (charflags[(unsigned)(c)] & (f)))
#define iri_no_escape(c, f) \
        ((unsigned)(c) >= 128 || (charflags[(unsigned)(c)] & (f)))
|
||
|
|
||
|
|
||
|
/* hex(const pl_wchar_t *in, int digits, int *value)
|
||
|
|
||
|
Get <digits> characters from in and interpret them as a hexadecimal
|
||
|
integer. Returns pointer to the end on success or NULL if error.
|
||
|
*/
|
||
|
|
||
|
static const pl_wchar_t *
|
||
|
hex(const pl_wchar_t *in, int digits, int *value)
|
||
|
{ int v = 0;
|
||
|
|
||
|
while(digits-- > 0)
|
||
|
{ int c = *in++;
|
||
|
|
||
|
if ( c >= '0' && c <= '9' )
|
||
|
v = (v<<4) + c - '0';
|
||
|
else if ( c >= 'A' && c <= 'F' )
|
||
|
v = (v<<4) + c + 10 - 'A';
|
||
|
else if ( c >= 'a' && c <= 'f' )
|
||
|
v = (v<<4) + c + 10 - 'a';
|
||
|
else
|
||
|
return NULL;
|
||
|
}
|
||
|
|
||
|
*value = v;
|
||
|
return in;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* get_encoded_utf8_cont_1(in, val)

   Read one %XX escape at in and require it to be a UTF-8 continuation
   byte (top two bits 10).  On success the low six bits go into *val and
   in+3 is returned.  Otherwise NULL is returned and *val is unchanged.
*/

static const pl_wchar_t *
get_encoded_utf8_cont_1(const pl_wchar_t *in, int *val)
{ int byte;

  if ( in[0] != '%' || !hex(in+1, 2, &byte) )
    return NULL;
  if ( (byte&0xc0) != 0x80 )
    return NULL;

  *val = (byte&0x3f);
  return in+3;
}
|
||
|
|
||
|
|
||
|
/* get_encoded_utf8_cont(in, cnt, val)

   Read cnt %-encoded continuation bytes starting at in and merge their
   six payload bits each into *val.  On entry *val holds the payload
   bits of the lead byte; it is first shifted left by cnt*6 bits to make
   room.  Returns the position after the last continuation byte, or
   NULL as soon as one is missing or malformed.  In the NULL case *val
   has already been modified.
*/

static const pl_wchar_t *
get_encoded_utf8_cont(const pl_wchar_t *in, int cnt, int *val)
{ int shift;

  *val <<= cnt*6;

  for(shift = (cnt-1)*6; cnt > 0; cnt--, shift -= 6)
  { int bits;

    in = get_encoded_utf8_cont_1(in, &bits);
    if ( !in )
      return NULL;

    *val |= (bits<<shift);
  }

  return in;
}
|
||
|
|
||
|
|
||
|
/* get_encoded_utf8(in, chr)

   Decode one character that is written at in as %XX escapes holding
   UTF-8 bytes.  If the first escaped byte is not a multi-byte lead
   byte according to ISUTF8_MB(), its value is stored in *chr as is.
   Otherwise the lead byte selects how many continuation escapes are
   read (1 to 5) and the combined code goes into *chr.

   Returns the position after the consumed escapes, or NULL if in does
   not start with a valid %XX escape or a continuation byte is missing
   or invalid.
*/

static const pl_wchar_t *
get_encoded_utf8(const pl_wchar_t *in, int *chr)
{ int lead;
  int mask;                             /* payload bits of the lead byte */
  int cont;                             /* # continuation bytes */

  if ( in[0] != '%' || !hex(in+1, 2, &lead) )
    return NULL;
  in += 3;

  if ( !ISUTF8_MB(lead) )               /* single escaped byte */
  { *chr = lead;
    return in;
  }

  if ( (lead&0xe0) == 0xc0 )            /* 2-byte */
  { mask = 0x1f; cont = 1;
  } else if ( (lead&0xf0) == 0xe0 )     /* 3-byte */
  { mask = 0xf;  cont = 2;
  } else if ( (lead&0xf8) == 0xf0 )     /* 4-byte */
  { mask = 0x7;  cont = 3;
  } else if ( (lead&0xfc) == 0xf8 )     /* 5-byte */
  { mask = 0x3;  cont = 4;
  } else if ( (lead&0xfe) == 0xfc )     /* 6-byte */
  { mask = 0x1;  cont = 5;
  } else
  { return NULL;
  }

  *chr = (lead&mask);
  return get_encoded_utf8_cont(in, cont, chr);
}
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* RANGES *
|
||
|
*******************************/
|
||
|
|
||
|
typedef struct range
|
||
|
{ const pl_wchar_t *start;
|
||
|
const pl_wchar_t *end;
|
||
|
} range;
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* CHARACTER BUFFER *
|
||
|
*******************************/
|
||
|
|
||
|
typedef struct charbuf
|
||
|
{ pl_wchar_t *base;
|
||
|
pl_wchar_t *here;
|
||
|
pl_wchar_t *end;
|
||
|
pl_wchar_t tmp[256];
|
||
|
} charbuf;
|
||
|
|
||
|
|
||
|
static void
|
||
|
init_charbuf(charbuf *cb)
|
||
|
{ cb->base = cb->here = cb->tmp;
|
||
|
cb->end = &cb->tmp[sizeof(cb->tmp)/sizeof(pl_wchar_t)];
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
init_charbuf_at_size(charbuf *cb, size_t size)
|
||
|
{ size++;
|
||
|
|
||
|
if ( size < sizeof(cb->tmp)/sizeof(pl_wchar_t) )
|
||
|
cb->base = cb->here = cb->tmp;
|
||
|
else
|
||
|
cb->base = cb->here = PL_malloc(size*sizeof(pl_wchar_t));
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_charbuf(charbuf *cb, int c)
|
||
|
{ if ( cb->here < cb->end )
|
||
|
{ *cb->here++ = c;
|
||
|
} else
|
||
|
{ size_t len = (cb->end-cb->base);
|
||
|
|
||
|
if ( cb->base == cb->tmp )
|
||
|
{ pl_wchar_t *n = PL_malloc(len*2*sizeof(pl_wchar_t));
|
||
|
memcpy(n, cb->base, sizeof(cb->tmp));
|
||
|
cb->base = n;
|
||
|
} else
|
||
|
{ cb->base = PL_realloc(cb->base, len*2*sizeof(pl_wchar_t));
|
||
|
}
|
||
|
cb->here = &cb->base[len];
|
||
|
cb->end = &cb->base[len*2];
|
||
|
*cb->here++ = c;
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* hexdigit(val)

   Return the character for hexadecimal digit val: '0'..'9' for values
   below 10 and an upper case letter counted from 'A' for 10 and up.
   The caller must pass a value in 0..15 to get a valid digit.
*/

static inline int
hexdigit(int val)
{ return val < 10 ? '0'+val : 'A'-10+val;
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_encoded_charbuf(charbuf *cb, int c, int flags)
|
||
|
{ if ( no_escape(c, flags) )
|
||
|
{ add_charbuf(cb, c);
|
||
|
} else
|
||
|
{ char tmp[6];
|
||
|
const char *end = utf8_put_char(tmp, c);
|
||
|
const char *s;
|
||
|
|
||
|
for(s=tmp; s<end; s++)
|
||
|
{ int b = s[0]&0xff;
|
||
|
|
||
|
add_charbuf(cb, '%');
|
||
|
add_charbuf(cb, hexdigit(b>>4));
|
||
|
add_charbuf(cb, hexdigit(b&0xf));
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
iri_add_encoded_charbuf(charbuf *cb, int c, int flags)
|
||
|
{ if ( iri_no_escape(c, flags) )
|
||
|
{ add_charbuf(cb, c);
|
||
|
} else
|
||
|
{ assert(c < 128);
|
||
|
add_charbuf(cb, '%');
|
||
|
add_charbuf(cb, hexdigit(c>>4));
|
||
|
add_charbuf(cb, hexdigit(c&0xf));
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_nchars_charbuf(charbuf *cb, size_t len, const pl_wchar_t *s)
|
||
|
{ if ( cb->here+len <= cb->end )
|
||
|
{ wcsncpy(cb->here, s, len);
|
||
|
cb->here += len;
|
||
|
} else
|
||
|
{ size_t n;
|
||
|
|
||
|
for(n=0; n<len; n++)
|
||
|
add_charbuf(cb, s[n]);
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
range_has_escape(const range *r, int flags)
|
||
|
{ const pl_wchar_t *s = r->start;
|
||
|
|
||
|
for(; s<r->end; s++)
|
||
|
{ if ( s[0] == '%' || (s[0] == '+' && flags == ESC_QVALUE) )
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
range_is_unreserved(const range *r, int iri, int flags)
|
||
|
{ const pl_wchar_t *s = r->start;
|
||
|
|
||
|
if ( iri )
|
||
|
{ for(; s<r->end; s++)
|
||
|
{ if ( !iri_no_escape(s[0], flags) )
|
||
|
return FALSE;
|
||
|
}
|
||
|
} else
|
||
|
{ for(; s<r->end; s++)
|
||
|
{ if ( !no_escape(s[0], flags) )
|
||
|
return FALSE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_verb_range_charbuf(charbuf *cb, const range *r)
|
||
|
{ return add_nchars_charbuf(cb, r->end-r->start, r->start);
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_decoded_range_charbuf(charbuf *cb, const range *r, int flags)
|
||
|
{ const pl_wchar_t *s = r->start;
|
||
|
|
||
|
while(s<r->end)
|
||
|
{ int c;
|
||
|
|
||
|
if ( *s == '%' )
|
||
|
{ const pl_wchar_t *e;
|
||
|
|
||
|
if ( (e=get_encoded_utf8(s, &c)) )
|
||
|
{ s = e;
|
||
|
} else if (hex(s+1, 2, &c) )
|
||
|
{ s += 3;
|
||
|
} else
|
||
|
{ c = *s++;
|
||
|
}
|
||
|
} else if ( *s == '+' && flags == ESC_QVALUE )
|
||
|
{ s++;
|
||
|
c = ' ';
|
||
|
} else
|
||
|
{ c = *s++;
|
||
|
}
|
||
|
|
||
|
add_charbuf(cb, c);
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_normalized_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
|
||
|
{ const pl_wchar_t *s = r->start;
|
||
|
while(s<r->end)
|
||
|
{ int c;
|
||
|
|
||
|
if ( *s == '%' )
|
||
|
{ const pl_wchar_t *e;
|
||
|
|
||
|
if ( (e=get_encoded_utf8(s, &c)) )
|
||
|
{ s = e;
|
||
|
} else if (hex(s+1, 2, &c) )
|
||
|
{ s += 3;
|
||
|
} else
|
||
|
{ c = *s++;
|
||
|
}
|
||
|
} else if ( *s == '+' && flags == ESC_QVALUE )
|
||
|
{ s++;
|
||
|
c = ' ';
|
||
|
} else
|
||
|
{ c = *s++;
|
||
|
}
|
||
|
|
||
|
if ( iri )
|
||
|
{ iri_add_encoded_charbuf(cb, c, flags);
|
||
|
} else
|
||
|
{ add_encoded_charbuf(cb, c, flags);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* add_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
|
||
|
|
||
|
Add a range of characters while normalizing %-encoding. This
|
||
|
implies not to use encoding if it is not needed and upcase
|
||
|
%xx to %XX otherwise.
|
||
|
|
||
|
If iri == TRUE, values >= 128 are not escaped. Otherwise they
|
||
|
use %-encoded UTF-8
|
||
|
*/
|
||
|
|
||
|
static int
|
||
|
add_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
|
||
|
{ if ( range_has_escape(r, flags) )
|
||
|
{ return add_normalized_range_charbuf(cb, r, iri, flags);
|
||
|
} else if ( range_is_unreserved(r, iri, flags) )
|
||
|
{ add_nchars_charbuf(cb, r->end-r->start, r->start);
|
||
|
} else
|
||
|
{ const pl_wchar_t *s = r->start;
|
||
|
|
||
|
if ( iri )
|
||
|
{ while(s<r->end)
|
||
|
iri_add_encoded_charbuf(cb, *s++, flags);
|
||
|
} else
|
||
|
{ while(s<r->end)
|
||
|
add_encoded_charbuf(cb, *s++, flags);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* add_lwr_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
|
||
|
|
||
|
Add a range of characters while normalizing %-encoding and
|
||
|
mapping all characters to lowercase.
|
||
|
|
||
|
FIXME: encoding and decoding compatible to add_range_charbuf();
|
||
|
*/
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_lwr_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
|
||
|
{ const pl_wchar_t *s = r->start;
|
||
|
|
||
|
while(s<r->end)
|
||
|
{ int c;
|
||
|
|
||
|
if ( *s == '%' )
|
||
|
{ const pl_wchar_t *e;
|
||
|
|
||
|
if ( (e=get_encoded_utf8(s, &c)) )
|
||
|
{ s = e;
|
||
|
} else if (hex(s+1, 2, &c) )
|
||
|
{ s += 3;
|
||
|
} else
|
||
|
{ c = *s++;
|
||
|
}
|
||
|
} else
|
||
|
{ c = *s++;
|
||
|
}
|
||
|
|
||
|
if ( iri )
|
||
|
iri_add_encoded_charbuf(cb, towlower((wint_t)c), flags);
|
||
|
else
|
||
|
add_encoded_charbuf(cb, towlower((wint_t)c), flags);
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static void
|
||
|
free_charbuf(charbuf *cb)
|
||
|
{ if ( cb->base != cb->tmp )
|
||
|
PL_free(cb->base);
|
||
|
}
|
||
|
|
||
|
|
||
|
#define TXT_EX_TEXT (CVT_ATOM|CVT_STRING|CVT_EXCEPTION)
|
||
|
|
||
|
static int
|
||
|
get_text_arg(term_t term, int pos, size_t *len, pl_wchar_t **s, int flags)
|
||
|
{ term_t tmp = PL_new_term_ref();
|
||
|
|
||
|
_PL_get_arg(pos, term, tmp);
|
||
|
if ( PL_is_variable(tmp) )
|
||
|
return FALSE;
|
||
|
if ( !PL_get_wchars(tmp, len, s, flags) )
|
||
|
return -1;
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_components(+URI, -Components)
|
||
|
|
||
|
Based on RFC-3986 regular expression:
|
||
|
|
||
|
==
|
||
|
^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
|
||
|
12 3 4 5 6 7 8 9
|
||
|
==
|
||
|
*/
|
||
|
|
||
|
typedef struct uri_component_ranges
|
||
|
{ range scheme;
|
||
|
range authority;
|
||
|
range path;
|
||
|
range query;
|
||
|
range fragment;
|
||
|
} uri_component_ranges;
|
||
|
|
||
|
|
||
|
/* skip_not(in, end, chars)

   Return the position of the first character in [in, end) that is a
   member of the 0-terminated string chars, or end if there is none.
   A chars string of one character is handled by direct comparison,
   longer ones with wcschr().
*/

static const pl_wchar_t *
skip_not(const pl_wchar_t *in, const pl_wchar_t *end, const pl_wchar_t *chars)
{ const pl_wchar_t *p = in;

  if ( !chars[1] )                      /* single stop character */
  { const pl_wchar_t stop = chars[0];

    while( p < end && p[0] != stop )
      p++;
  } else
  { while( p < end && !wcschr(chars, p[0]) )
      p++;
  }

  return p;
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
unify_range(term_t t, const range *r)
|
||
|
{ if ( r->start )
|
||
|
return PL_unify_wchars(t, PL_ATOM, r->end - r->start, r->start);
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
parse_uri(uri_component_ranges *ranges, size_t len, const pl_wchar_t *s)
|
||
|
{ const pl_wchar_t *end = &s[len];
|
||
|
const pl_wchar_t *here = s;
|
||
|
const pl_wchar_t *e;
|
||
|
|
||
|
memset(ranges, 0, sizeof(*ranges));
|
||
|
|
||
|
e = skip_not(here, end, L":/?#");
|
||
|
if ( e > s && e[0] == ':' ) /* 1&2 */
|
||
|
{ ranges->scheme.start = s;
|
||
|
ranges->scheme.end = e;
|
||
|
here = e+1;
|
||
|
}
|
||
|
|
||
|
if ( here[0] == '/' && here[1] == '/' ) /* 3 */
|
||
|
{ here += 2; /* 4 */
|
||
|
e = skip_not(here, end, L"/?#");
|
||
|
ranges->authority.start = here;
|
||
|
ranges->authority.end = e;
|
||
|
here = e; /* 5 */
|
||
|
}
|
||
|
|
||
|
e = skip_not(here, end, L"?#");
|
||
|
ranges->path.start = here;
|
||
|
ranges->path.end = e;
|
||
|
here = e; /* 6 */
|
||
|
|
||
|
if ( here[0] == '?' )
|
||
|
{ here++; /* 7 */
|
||
|
e = skip_not(here, end, L"#");
|
||
|
ranges->query.start = here;
|
||
|
ranges->query.end = e;
|
||
|
here = e; /* 8 */
|
||
|
}
|
||
|
|
||
|
if ( here[0] == '#' )
|
||
|
{ here++; /* 9 */
|
||
|
ranges->fragment.start = here;
|
||
|
ranges->fragment.end = end;
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static foreign_t
|
||
|
uri_components(term_t URI, term_t components)
|
||
|
{ pl_wchar_t *s;
|
||
|
size_t len;
|
||
|
|
||
|
if ( PL_get_wchars(URI, &len, &s, CVT_ATOM|CVT_STRING|CVT_LIST) )
|
||
|
{ uri_component_ranges ranges;
|
||
|
term_t rt = PL_new_term_refs(6);
|
||
|
term_t av = rt+1;
|
||
|
|
||
|
parse_uri(&ranges, len, s);
|
||
|
|
||
|
unify_range(av+0, &ranges.scheme);
|
||
|
unify_range(av+1, &ranges.authority);
|
||
|
unify_range(av+2, &ranges.path);
|
||
|
unify_range(av+3, &ranges.query);
|
||
|
unify_range(av+4, &ranges.fragment);
|
||
|
|
||
|
return (PL_cons_functor_v(rt, FUNCTOR_uri_components5, av) &&
|
||
|
PL_unify(components, rt));
|
||
|
} else if ( PL_is_functor(components, FUNCTOR_uri_components5) )
|
||
|
{ charbuf b;
|
||
|
int rc;
|
||
|
|
||
|
init_charbuf(&b);
|
||
|
/* schema */
|
||
|
if ( (rc=get_text_arg(components, 1, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ add_nchars_charbuf(&b, len, s);
|
||
|
add_charbuf(&b, ':');
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
/* authority */
|
||
|
if ( (rc=get_text_arg(components, 2, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ add_charbuf(&b, '/');
|
||
|
add_charbuf(&b, '/');
|
||
|
add_nchars_charbuf(&b, len, s);
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
/* path */
|
||
|
if ( (rc=get_text_arg(components, 3, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ add_nchars_charbuf(&b, len, s);
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
/* query */
|
||
|
if ( (rc=get_text_arg(components, 4, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ if ( len > 0 )
|
||
|
{ add_charbuf(&b, '?');
|
||
|
add_nchars_charbuf(&b, len, s);
|
||
|
}
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
/* fragment */
|
||
|
if ( (rc=get_text_arg(components, 5, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ add_charbuf(&b, '#');
|
||
|
add_nchars_charbuf(&b, len, s);
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
rc = PL_unify_wchars(URI, PL_ATOM, b.here-b.base, b.base);
|
||
|
free_charbuf(&b);
|
||
|
|
||
|
return rc;
|
||
|
} else /* generate an error */
|
||
|
{ return PL_get_wchars(URI, &len, &s,
|
||
|
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_is_global(+URI) is semidet.
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_is_global(term_t URI)
|
||
|
{ pl_wchar_t *s;
|
||
|
size_t len;
|
||
|
|
||
|
if ( PL_get_wchars(URI, &len, &s,
|
||
|
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION) )
|
||
|
{ const pl_wchar_t *e;
|
||
|
const pl_wchar_t *end = &s[len];
|
||
|
range r;
|
||
|
|
||
|
e = skip_not(s, end, L":/?#");
|
||
|
if ( e > s && e[0] == ':' )
|
||
|
{ r.start = s;
|
||
|
r.end = e;
|
||
|
if ( range_is_unreserved(&r, TRUE, CH_SCHEME) )
|
||
|
return TRUE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* QUERY-STRING *
|
||
|
*******************************/
|
||
|
|
||
|
static int
|
||
|
unify_decoded_atom(term_t t, range *r, int flags)
|
||
|
{ if ( range_has_escape(r, flags) )
|
||
|
{ charbuf b;
|
||
|
int rc;
|
||
|
|
||
|
init_charbuf(&b);
|
||
|
add_decoded_range_charbuf(&b, r, flags);
|
||
|
rc = PL_unify_wchars(t, PL_ATOM, b.here - b.base, b.base);
|
||
|
free_charbuf(&b);
|
||
|
return rc;
|
||
|
} else
|
||
|
{ return unify_range(t, r);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
unify_query_string_components(term_t list, size_t len, const pl_wchar_t *qs)
|
||
|
{ if ( len == 0 )
|
||
|
{ return PL_unify_nil(list);
|
||
|
} else
|
||
|
{ term_t tail = PL_copy_term_ref(list);
|
||
|
term_t head = PL_new_term_ref();
|
||
|
term_t eq = PL_new_term_refs(3);
|
||
|
term_t nv = eq+1;
|
||
|
const pl_wchar_t *end = &qs[len];
|
||
|
|
||
|
while(qs < end)
|
||
|
{ range name, value;
|
||
|
|
||
|
name.start = qs;
|
||
|
name.end = skip_not(qs, end, L"=");
|
||
|
if ( name.end < end )
|
||
|
{ value.start = name.end+1;
|
||
|
value.end = skip_not(value.start, end, L"&");
|
||
|
|
||
|
qs = value.end+1;
|
||
|
} else
|
||
|
{ return syntax_error("illegal_uri_query");
|
||
|
}
|
||
|
|
||
|
PL_put_variable(nv+0);
|
||
|
PL_put_variable(nv+1);
|
||
|
unify_decoded_atom(nv+0, &name, ESC_QNAME);
|
||
|
unify_decoded_atom(nv+1, &value, ESC_QVALUE);
|
||
|
|
||
|
if ( !PL_cons_functor_v(eq, FUNCTOR_equal2, nv) ||
|
||
|
!PL_unify_list(tail, head, tail) ||
|
||
|
!PL_unify(head, eq) )
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
return PL_unify_nil(tail);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
add_encoded_term_charbuf(charbuf *cb, term_t value, int flags)
|
||
|
{ pl_wchar_t *s;
|
||
|
range r;
|
||
|
size_t len;
|
||
|
|
||
|
if ( !PL_get_wchars(value, &len, &s, CVT_ATOMIC|CVT_EXCEPTION) )
|
||
|
return FALSE;
|
||
|
|
||
|
r.start = s;
|
||
|
r.end = r.start+len;
|
||
|
if ( range_is_unreserved(&r, TRUE, flags) )
|
||
|
{ add_nchars_charbuf(cb, r.end-r.start, r.start);
|
||
|
} else
|
||
|
{ const pl_wchar_t *s = r.start;
|
||
|
|
||
|
while(s<r.end)
|
||
|
add_encoded_charbuf(cb, *s++, flags);
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_query_components(+QueryString, -ValueList) is det.
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_query_components(term_t string, term_t list)
|
||
|
{ pl_wchar_t *s;
|
||
|
size_t len;
|
||
|
|
||
|
if ( PL_get_wchars(string, &len, &s, CVT_ATOM|CVT_STRING|CVT_LIST) )
|
||
|
{ return unify_query_string_components(list, len, s);
|
||
|
} else if ( PL_is_list(list) )
|
||
|
{ term_t tail = PL_copy_term_ref(list);
|
||
|
term_t head = PL_new_term_ref();
|
||
|
term_t nv = PL_new_term_refs(2);
|
||
|
charbuf out;
|
||
|
int rc;
|
||
|
|
||
|
fill_flags();
|
||
|
init_charbuf(&out);
|
||
|
while( PL_get_list(tail, head, tail) )
|
||
|
{ atom_t fname;
|
||
|
int arity;
|
||
|
|
||
|
if ( PL_is_functor(head, FUNCTOR_equal2) ||
|
||
|
PL_is_functor(head, FUNCTOR_pair2) )
|
||
|
{ _PL_get_arg(1, head, nv+0);
|
||
|
_PL_get_arg(2, head, nv+1);
|
||
|
} else if ( PL_get_name_arity(head, &fname, &arity) && arity == 1 )
|
||
|
{ PL_put_atom(nv+0, fname);
|
||
|
_PL_get_arg(1, head, nv+1);
|
||
|
} else
|
||
|
{ free_charbuf(&out);
|
||
|
return type_error("name_value", head);
|
||
|
}
|
||
|
|
||
|
if ( out.here != out.base )
|
||
|
add_charbuf(&out, '&');
|
||
|
if ( !add_encoded_term_charbuf(&out, nv+0, ESC_QNAME) )
|
||
|
{ free_charbuf(&out);
|
||
|
return FALSE;
|
||
|
}
|
||
|
add_charbuf(&out, '=');
|
||
|
if ( !add_encoded_term_charbuf(&out, nv+1, ESC_QVALUE) )
|
||
|
{ free_charbuf(&out);
|
||
|
return FALSE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
rc = PL_unify_wchars(string, PL_ATOM, out.here-out.base, out.base);
|
||
|
free_charbuf(&out);
|
||
|
return rc;
|
||
|
} else
|
||
|
{ return PL_get_wchars(string, &len, &s,
|
||
|
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION);
|
||
|
}
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_encoded(+What, +String, -Encoded)
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_encoded(term_t what, term_t qv, term_t enc)
|
||
|
{ pl_wchar_t *s;
|
||
|
size_t len;
|
||
|
atom_t w;
|
||
|
int flags;
|
||
|
|
||
|
if ( !PL_get_atom(what, &w) )
|
||
|
return type_error("atom", what);
|
||
|
if ( w == ATOM_query_value )
|
||
|
flags = ESC_QVALUE;
|
||
|
else if ( w == ATOM_fragment )
|
||
|
flags = ESC_FRAGMENT;
|
||
|
else if ( w == ATOM_path )
|
||
|
flags = ESC_PATH;
|
||
|
else
|
||
|
return domain_error("uri_component", what);
|
||
|
|
||
|
fill_flags();
|
||
|
|
||
|
if ( !PL_is_variable(qv) )
|
||
|
{ charbuf out;
|
||
|
int rc;
|
||
|
|
||
|
init_charbuf(&out);
|
||
|
if ( !add_encoded_term_charbuf(&out, qv, flags) )
|
||
|
{ free_charbuf(&out);
|
||
|
return FALSE;
|
||
|
}
|
||
|
rc = PL_unify_wchars(enc, PL_ATOM, out.here-out.base, out.base);
|
||
|
free_charbuf(&out);
|
||
|
return rc;
|
||
|
} else if ( PL_get_wchars(enc, &len, &s, CVT_ATOM|CVT_STRING|CVT_EXCEPTION) )
|
||
|
{ range r;
|
||
|
|
||
|
r.start = s;
|
||
|
r.end = s+len;
|
||
|
|
||
|
return unify_decoded_atom(qv, &r, flags);
|
||
|
} else
|
||
|
{ return FALSE;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* AUTHORITY *
|
||
|
*******************************/
|
||
|
|
||
|
static int
|
||
|
unify_uri_authority_components(term_t components,
|
||
|
size_t len, const pl_wchar_t *s)
|
||
|
{ const pl_wchar_t *end = &s[len];
|
||
|
const pl_wchar_t *e;
|
||
|
range user = {0};
|
||
|
range passwd = {0};
|
||
|
range host = {0};
|
||
|
range port = {0};
|
||
|
term_t t = PL_new_term_refs(5);
|
||
|
term_t av = t+1;
|
||
|
|
||
|
if ( (e=skip_not(s, end, L"@")) && e<end )
|
||
|
{ user.start = s;
|
||
|
user.end = e;
|
||
|
s = e+1;
|
||
|
if ( (e=skip_not(user.start, user.end, L":")) && e<user.end )
|
||
|
{ passwd.start = e+1;
|
||
|
passwd.end = user.end;
|
||
|
user.end = e;
|
||
|
}
|
||
|
}
|
||
|
host.start = s;
|
||
|
host.end = skip_not(s, end, L":");
|
||
|
if ( host.end < end )
|
||
|
{ port.start = host.end+1;
|
||
|
port.end = end;
|
||
|
}
|
||
|
|
||
|
if ( user.start )
|
||
|
unify_decoded_atom(av+0, &user, ESC_USER);
|
||
|
if ( passwd.start )
|
||
|
unify_decoded_atom(av+1, &passwd, ESC_PASSWD);
|
||
|
unify_decoded_atom(av+2, &host, ESC_HOST);
|
||
|
if ( port.start )
|
||
|
{ wchar_t *ep;
|
||
|
long pn = wcstol(port.start, &ep, 10);
|
||
|
|
||
|
if ( ep == port.end )
|
||
|
{ if ( !PL_put_integer(av+3, pn) )
|
||
|
return FALSE;
|
||
|
} else
|
||
|
{ unify_decoded_atom(av+3, &port, ESC_PORT);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return (PL_cons_functor_v(t, FUNCTOR_uri_authority4, av) &&
|
||
|
PL_unify(components, t));
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_authority_components(+Authority, -Components) is det.
|
||
|
uri_authority_components(-Authority, +Components) is det.
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_authority_components(term_t Authority, term_t components)
|
||
|
{ pl_wchar_t *s;
|
||
|
size_t len;
|
||
|
|
||
|
if ( PL_get_wchars(Authority, &len, &s, CVT_ATOM|CVT_STRING|CVT_LIST) )
|
||
|
{ return unify_uri_authority_components(components, len, s);
|
||
|
} else if ( PL_is_functor(components, FUNCTOR_uri_authority4) )
|
||
|
{ charbuf b;
|
||
|
int rc;
|
||
|
|
||
|
init_charbuf(&b);
|
||
|
if ( (rc=get_text_arg(components, 1, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ add_nchars_charbuf(&b, len, s);
|
||
|
if ( (rc=get_text_arg(components, 2, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ add_charbuf(&b, ':');
|
||
|
add_nchars_charbuf(&b, len, s);
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
add_charbuf(&b, '@');
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
if ( (rc=get_text_arg(components, 3, &len, &s, TXT_EX_TEXT)) == TRUE )
|
||
|
{ add_nchars_charbuf(&b, len, s);
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
if ( (rc=get_text_arg(components, 4, &len, &s,
|
||
|
TXT_EX_TEXT|CVT_INTEGER)) == TRUE )
|
||
|
{ add_charbuf(&b, ':');
|
||
|
add_nchars_charbuf(&b, len, s);
|
||
|
} else if ( rc == -1 )
|
||
|
{ free_charbuf(&b);
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
rc = PL_unify_wchars(Authority, PL_ATOM, b.here-b.base, b.base);
|
||
|
free_charbuf(&b);
|
||
|
|
||
|
return rc;
|
||
|
} else
|
||
|
{ return PL_get_wchars(Authority, &len, &s,
|
||
|
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/*******************************
|
||
|
* NORMALIZATION *
|
||
|
*******************************/
|
||
|
|
||
|
static int
|
||
|
normalize_in_charbuf(charbuf *cb, uri_component_ranges *ranges, int iri)
|
||
|
{ fill_flags();
|
||
|
|
||
|
if ( ranges->scheme.start )
|
||
|
{ add_lwr_range_charbuf(cb, &ranges->scheme, iri, ESC_SCHEME);
|
||
|
add_charbuf(cb, ':');
|
||
|
}
|
||
|
if ( ranges->authority.start )
|
||
|
{ add_charbuf(cb, '/');
|
||
|
add_charbuf(cb, '/');
|
||
|
add_lwr_range_charbuf(cb, &ranges->authority, iri, ESC_AUTH);
|
||
|
}
|
||
|
if ( ranges->path.end > ranges->path.start )
|
||
|
{ charbuf pb;
|
||
|
charbuf path;
|
||
|
size_t len;
|
||
|
|
||
|
init_charbuf(&pb);
|
||
|
add_range_charbuf(&pb, &ranges->path, iri, ESC_PATH);
|
||
|
init_charbuf_at_size(&path, pb.here-pb.base);
|
||
|
len = removed_dot_segments(pb.here-pb.base, pb.base, path.base);
|
||
|
add_nchars_charbuf(cb, len, path.base);
|
||
|
free_charbuf(&path);
|
||
|
free_charbuf(&pb);
|
||
|
}
|
||
|
if ( ranges->query.start )
|
||
|
{ add_charbuf(cb, '?');
|
||
|
add_range_charbuf(cb, &ranges->query, iri, ESC_QUERY);
|
||
|
}
|
||
|
if ( ranges->fragment.start )
|
||
|
{ add_charbuf(cb, '#');
|
||
|
add_range_charbuf(cb, &ranges->fragment, iri, ESC_FRAGMENT);
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
static foreign_t
|
||
|
normalized(term_t URI, term_t CannonicalURI, int iri)
|
||
|
{ pl_wchar_t *s;
|
||
|
size_t len;
|
||
|
|
||
|
if ( PL_get_wchars(URI, &len, &s,
|
||
|
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION) )
|
||
|
{ uri_component_ranges ranges;
|
||
|
charbuf b;
|
||
|
int rc;
|
||
|
|
||
|
parse_uri(&ranges, len, s);
|
||
|
init_charbuf(&b);
|
||
|
normalize_in_charbuf(&b, &ranges, iri);
|
||
|
|
||
|
rc = PL_unify_wchars(CannonicalURI, PL_ATOM, b.here-b.base, b.base);
|
||
|
free_charbuf(&b);
|
||
|
|
||
|
return rc;
|
||
|
}
|
||
|
|
||
|
return FALSE;
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_normalized(+URI, -CannonicalURI)
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_normalized(term_t URI, term_t CannonicalURI)
|
||
|
{ return normalized(URI, CannonicalURI, FALSE);
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_normalized_iri(+URI, -CannonicalIRI)
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_normalized_iri(term_t URI, term_t CannonicalURI)
|
||
|
{ return normalized(URI, CannonicalURI, TRUE);
|
||
|
}
|
||
|
|
||
|
|
||
|
static int
|
||
|
ranges_in_charbuf(charbuf *cb, uri_component_ranges *ranges)
|
||
|
{ if ( ranges->scheme.start )
|
||
|
{ add_verb_range_charbuf(cb, &ranges->scheme);
|
||
|
add_charbuf(cb, ':');
|
||
|
}
|
||
|
if ( ranges->authority.start )
|
||
|
{ add_charbuf(cb, '/');
|
||
|
add_charbuf(cb, '/');
|
||
|
add_verb_range_charbuf(cb, &ranges->authority);
|
||
|
}
|
||
|
add_verb_range_charbuf(cb, &ranges->path);
|
||
|
if ( ranges->query.start )
|
||
|
{ add_charbuf(cb, '?');
|
||
|
add_verb_range_charbuf(cb, &ranges->query);
|
||
|
}
|
||
|
if ( ranges->fragment.start )
|
||
|
{ add_charbuf(cb, '#');
|
||
|
add_verb_range_charbuf(cb, &ranges->fragment);
|
||
|
}
|
||
|
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
typedef struct
|
||
|
{ atom_t atom;
|
||
|
pl_wchar_t *text;
|
||
|
uri_component_ranges ranges;
|
||
|
} base_cache;
|
||
|
|
||
|
#ifdef _REENTRANT
|
||
|
#include <pthread.h>
|
||
|
static pthread_key_t base_key;
|
||
|
|
||
|
static void
|
||
|
free_base_cache(void *cache)
|
||
|
{ base_cache *base = cache;
|
||
|
|
||
|
if ( PL_query(PL_QUERY_HALTING) )
|
||
|
return;
|
||
|
|
||
|
if ( base->atom )
|
||
|
{ PL_unregister_atom(base->atom);
|
||
|
PL_free(base->text);
|
||
|
}
|
||
|
|
||
|
PL_free(base);
|
||
|
}
|
||
|
|
||
|
static base_cache *
|
||
|
myBase()
|
||
|
{ base_cache *base;
|
||
|
|
||
|
if ( (base=pthread_getspecific(base_key)) )
|
||
|
return base;
|
||
|
base = PL_malloc(sizeof(*base));
|
||
|
memset(base, 0, sizeof(*base));
|
||
|
|
||
|
pthread_setspecific(base_key, base);
|
||
|
return base;
|
||
|
}
|
||
|
|
||
|
#else
|
||
|
/* Single-threaded build: one statically allocated (hence zero-filled)
   cache.  The macro expands to a parenthesized expression without a
   trailing semicolon so that myBase() can be used wherever the function
   variant can, including inside larger expressions.
*/

static base_cache base_store;
#define myBase() (&base_store)
|
||
|
#endif
|
||
|
|
||
|
|
||
|
static const uri_component_ranges *
|
||
|
base_ranges(term_t t)
|
||
|
{ atom_t a;
|
||
|
|
||
|
if ( PL_get_atom(t, &a) )
|
||
|
{ base_cache *base = myBase();
|
||
|
|
||
|
if ( base->atom != a )
|
||
|
{ size_t len;
|
||
|
pl_wchar_t *s;
|
||
|
|
||
|
if ( base->atom )
|
||
|
{ PL_unregister_atom(base->atom);
|
||
|
PL_free(base->text);
|
||
|
}
|
||
|
if ( !PL_get_wchars(t, &len, &s, CVT_ATOM|BUF_MALLOC) )
|
||
|
return NULL;
|
||
|
base->atom = a;
|
||
|
PL_register_atom(a);
|
||
|
base->text = s;
|
||
|
parse_uri(&base->ranges, len, s);
|
||
|
}
|
||
|
|
||
|
return &base->ranges;
|
||
|
} else
|
||
|
{ type_error("atom", t);
|
||
|
return NULL;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
|
||
|
static foreign_t
|
||
|
resolve(term_t Rel, term_t Base, term_t URI, int normalize, int iri)
|
||
|
{ pl_wchar_t *s;
|
||
|
size_t slen;
|
||
|
uri_component_ranges s_ranges, t_ranges;
|
||
|
int rc;
|
||
|
size_t len;
|
||
|
charbuf out, pb, path;
|
||
|
|
||
|
init_charbuf(&pb); /* path-buffer */
|
||
|
|
||
|
if ( PL_get_wchars(Rel, &slen, &s,
|
||
|
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION) )
|
||
|
{ parse_uri(&s_ranges, slen, s);
|
||
|
if ( s_ranges.scheme.start )
|
||
|
{ t_ranges = s_ranges;
|
||
|
} else
|
||
|
{ const uri_component_ranges *b_ranges;
|
||
|
|
||
|
if ( !(b_ranges = base_ranges(Base)) )
|
||
|
return FALSE;
|
||
|
|
||
|
memset(&t_ranges, 0, sizeof(t_ranges));
|
||
|
if ( s_ranges.authority.start )
|
||
|
{ t_ranges.authority = s_ranges.authority;
|
||
|
t_ranges.path = s_ranges.path;
|
||
|
t_ranges.query = s_ranges.query;
|
||
|
} else
|
||
|
{ if ( s_ranges.path.start == s_ranges.path.end )
|
||
|
{ t_ranges.path = b_ranges->path;
|
||
|
if ( s_ranges.query.start )
|
||
|
t_ranges.query = s_ranges.query;
|
||
|
else
|
||
|
t_ranges.query = b_ranges->query;
|
||
|
} else
|
||
|
{ if ( s_ranges.path.start[0] == '/' )
|
||
|
{ t_ranges.path = s_ranges.path;
|
||
|
} else
|
||
|
{ if ( b_ranges->authority.start &&
|
||
|
b_ranges->path.start == b_ranges->path.end )
|
||
|
{ add_charbuf(&pb, '/');
|
||
|
add_verb_range_charbuf(&pb, &s_ranges.path);
|
||
|
} else
|
||
|
{ range path = b_ranges->path;
|
||
|
|
||
|
path.end = remove_last_segment(path.start, path.end);
|
||
|
add_verb_range_charbuf(&pb, &path);
|
||
|
add_verb_range_charbuf(&pb, &s_ranges.path);
|
||
|
t_ranges.path.start = pb.base;
|
||
|
t_ranges.path.end = pb.here;
|
||
|
}
|
||
|
}
|
||
|
t_ranges.query = s_ranges.query;
|
||
|
}
|
||
|
t_ranges.authority = b_ranges->authority;
|
||
|
}
|
||
|
t_ranges.scheme = b_ranges->scheme;
|
||
|
t_ranges.fragment = s_ranges.fragment;
|
||
|
}
|
||
|
} else
|
||
|
return FALSE;
|
||
|
|
||
|
init_charbuf(&out); /* output buffer */
|
||
|
|
||
|
if ( normalize )
|
||
|
{ normalize_in_charbuf(&out, &t_ranges, iri);
|
||
|
} else
|
||
|
{ init_charbuf_at_size(&path, t_ranges.path.end - t_ranges.path.start);
|
||
|
len = removed_dot_segments(t_ranges.path.end - t_ranges.path.start,
|
||
|
t_ranges.path.start,
|
||
|
path.base);
|
||
|
t_ranges.path.start = path.base;
|
||
|
t_ranges.path.end = path.base+len;
|
||
|
free_charbuf(&pb);
|
||
|
|
||
|
ranges_in_charbuf(&out, &t_ranges);
|
||
|
}
|
||
|
|
||
|
rc = PL_unify_wchars(URI, PL_ATOM, out.here-out.base, out.base);
|
||
|
free_charbuf(&out);
|
||
|
|
||
|
return rc;
|
||
|
}
|
||
|
|
||
|
/** uri_resolve(+Relative, +Base, -Absolute) is det.
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_resolve(term_t Rel, term_t Base, term_t URI)
|
||
|
{ return resolve(Rel, Base, URI, FALSE, FALSE);
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_normalized(+Relative, +Base, -Absolute) is det.
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_normalized3(term_t Rel, term_t Base, term_t URI)
|
||
|
{ return resolve(Rel, Base, URI, TRUE, FALSE);
|
||
|
}
|
||
|
|
||
|
|
||
|
/** uri_normalized_iri(+Relative, +Base, -Absolute) is det.
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_normalized_iri3(term_t Rel, term_t Base, term_t IRI)
|
||
|
{ return resolve(Rel, Base, IRI, TRUE, TRUE);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*******************************
*          PATH LOGIC          *
*******************************/
|
||
|
|
||
|
/* Removal of "." and ".." path segments; see RFC 3986, section 5.2.4
   ("Remove Dot Segments").
*/
|
||
|
|
||
|
static pl_wchar_t *
|
||
|
remove_last_segment(const pl_wchar_t *base, const pl_wchar_t *o)
|
||
|
{ while(o>base && o[-1] != '/' )
|
||
|
o--;
|
||
|
|
||
|
return (pl_wchar_t*) o;
|
||
|
}
|
||
|
|
||
|
|
||
|
static inline int
|
||
|
fetch(const pl_wchar_t *in, const pl_wchar_t *end, int at)
|
||
|
{ if ( in+at>=end )
|
||
|
return 0;
|
||
|
return in[at];
|
||
|
}
|
||
|
|
||
|
static size_t
|
||
|
removed_dot_segments(size_t len, const pl_wchar_t *in, pl_wchar_t *out)
|
||
|
{ const pl_wchar_t *end = &in[len];
|
||
|
pl_wchar_t *o = out;
|
||
|
|
||
|
while(in<end)
|
||
|
{ if ( in[0] == '.' )
|
||
|
{ if ( fetch(in, end, 1) == '/' ||
|
||
|
(fetch(in, end, 1) == '.' && fetch(in, end, 2) == '/') )
|
||
|
{ in += 2; /* 2A */
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
if ( in[0] == '/' && fetch(in, end, 1) == '.' )
|
||
|
{ if ( fetch(in, end, 2) == '/' )
|
||
|
{ in += 2; /* 2B "/./" --> "/" */
|
||
|
continue;
|
||
|
}
|
||
|
if ( !fetch(in, end, 2) )
|
||
|
{ *o++ = '/'; /* 2B "/." --> "/" (and close) */
|
||
|
in += 2;
|
||
|
continue;
|
||
|
}
|
||
|
if ( fetch(in, end, 2) == '.' )
|
||
|
{ if ( fetch(in, end, 3) == '/' )
|
||
|
{ in += 3; /* 2C "/../" --> "/" */
|
||
|
o = remove_last_segment(out, o);
|
||
|
if ( o>out ) o--; /* delete / */
|
||
|
continue;
|
||
|
}
|
||
|
if ( !fetch(in, end, 3) )
|
||
|
{ o = remove_last_segment(out, o);
|
||
|
if ( o>out ) o--; /* delete / */
|
||
|
*o++ = '/';
|
||
|
in += 3;
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
if ( in[0] == '.' )
|
||
|
{ if ( !fetch(in, end, 1) )
|
||
|
{ in++; /* 3D */
|
||
|
continue;
|
||
|
}
|
||
|
if ( fetch(in, end, 1) == '.' && !fetch(in, end, 2) )
|
||
|
{ in += 2; /* 3D */
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
if ( in[0] == '/' )
|
||
|
*o++ = *in++;
|
||
|
while( in < end && in[0] != '/' )
|
||
|
*o++ = *in++;
|
||
|
}
|
||
|
|
||
|
return o-out;
|
||
|
}
|
||
|
|
||
|
|
||
|
/*******************************
*         IRI HANDLING         *
*******************************/
|
||
|
|
||
|
/* utf8_put_char() stores code points below 0x80 inline and defers to
   _utf8_put_char() for everything else; it evaluates to the position
   after the bytes written.  The same macro is defined near the top of
   this file, so its replacement text must stay token-identical.
*/

#define utf8_put_char(out, chr) \
        ((chr) < 0x80 ? out[0]=(char)(chr), out+1 \
                      : _utf8_put_char(out, (chr)))


/* Write `chr` at `out` in UTF-8 and return the position after the last
   byte written.  Values below 0x80 take one byte; larger values use the
   2..6 byte forms, which cover everything up to 0x7fffffff.  The caller
   must provide room for up to 6 bytes.
*/

static char *
_utf8_put_char(char *out, int chr)
{ /* indexed by the number of continuation bytes (1..5) */
  static const int lead_bits[] = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
  static const int lead_mask[] = { 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
  int extra;                            /* #continuation bytes */
  int shift;

  if ( chr < 0x80 )
  { *out++ = chr;
    return out;
  }

  if ( chr < 0x800 )
    extra = 1;
  else if ( chr < 0x10000 )
    extra = 2;
  else if ( chr < 0x200000 )
    extra = 3;
  else if ( chr < 0x4000000 )
    extra = 4;
  else if ( (unsigned)chr < 0x80000000 )
    extra = 5;
  else
    return out;                         /* not encodable: write nothing */

  /* lead byte: length marker plus the most significant payload bits */
  *out++ = lead_bits[extra]|((chr>>(6*extra))&lead_mask[extra]);

  /* continuation bytes: 6 payload bits each, most significant first */
  for(shift = 6*(extra-1); shift >= 0; shift -= 6)
    *out++ = 0x80|((chr>>shift)&0x3f);

  return out;
}
|
||
|
|
||
|
|
||
|
/** uri_iri(+URI, -IRI) is det.
|
||
|
uri_iri(-URI, +IRI) is det.
|
||
|
|
||
|
Perform %- and UTF-8 encoding/decoding to translate between a URI and
|
||
|
IRI
|
||
|
*/
|
||
|
|
||
|
static foreign_t
|
||
|
uri_iri(term_t URI, term_t IRI)
|
||
|
{ if ( !PL_is_variable(URI) )
|
||
|
return uri_normalized_iri(URI, IRI);
|
||
|
else
|
||
|
return uri_normalized(IRI, URI);
|
||
|
}
|
||
|
|
||
|
|
||
|
/*******************************
*         REGISTRATION         *
*******************************/
|
||
|
|
||
|
#define MKATOM(n) \
|
||
|
ATOM_ ## n = PL_new_atom(#n)
|
||
|
#define MKFUNCTOR(n,a) \
|
||
|
FUNCTOR_ ## n ## a = PL_new_functor(PL_new_atom(#n), a)
|
||
|
|
||
|
install_t
|
||
|
install_uri()
|
||
|
{ MKATOM(query_value);
|
||
|
MKATOM(fragment);
|
||
|
MKATOM(path);
|
||
|
|
||
|
MKFUNCTOR(uri_components, 5);
|
||
|
MKFUNCTOR(uri_authority, 4);
|
||
|
MKFUNCTOR(error, 2);
|
||
|
MKFUNCTOR(syntax_error, 1);
|
||
|
MKFUNCTOR(type_error, 2);
|
||
|
MKFUNCTOR(domain_error, 2);
|
||
|
FUNCTOR_equal2 = PL_new_functor(PL_new_atom("="), 2);
|
||
|
FUNCTOR_pair2 = PL_new_functor(PL_new_atom("-"), 2);
|
||
|
|
||
|
#ifdef _REENTRANT
|
||
|
pthread_key_create(&base_key, free_base_cache);
|
||
|
#endif
|
||
|
|
||
|
PL_register_foreign("uri_components", 2, uri_components, 0);
|
||
|
PL_register_foreign("uri_is_global", 1, uri_is_global, 0);
|
||
|
PL_register_foreign("uri_normalized", 2, uri_normalized, 0);
|
||
|
PL_register_foreign("uri_normalized_iri", 2, uri_normalized_iri, 0);
|
||
|
PL_register_foreign("uri_resolve", 3, uri_resolve, 0);
|
||
|
PL_register_foreign("uri_normalized", 3, uri_normalized3, 0);
|
||
|
PL_register_foreign("uri_normalized_iri", 3, uri_normalized_iri3, 0);
|
||
|
PL_register_foreign("uri_query_components", 2, uri_query_components, 0);
|
||
|
PL_register_foreign("uri_authority_components",
|
||
|
2, uri_authority_components, 0);
|
||
|
PL_register_foreign("uri_encoded", 3, uri_encoded, 0);
|
||
|
PL_register_foreign("uri_iri", 2, uri_iri, 0);
|
||
|
}
|