This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/clib/uri.c
Vitor Santos Costa 8d3b9ad16c RDF package
2010-07-28 11:34:41 +01:00

1601 lines
36 KiB
C

/* $Id$
Part of SWI-Prolog
Author: Jan Wielemaker
E-mail: J.Wielemaker@cs.vu.nl
WWW: http://www.swi-prolog.org
Copyright (C): 2009, VU University Amsterdam
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
#ifdef __WINDOWS__
#define inline __inline
#endif
#include <SWI-Prolog.h>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include <assert.h>
static size_t removed_dot_segments(size_t len, const pl_wchar_t *in,
pl_wchar_t *out);
static pl_wchar_t *remove_last_segment(const pl_wchar_t *base,
const pl_wchar_t *o);
static char *_utf8_put_char(char *out, int chr);
#define ISUTF8_MB(c) ((unsigned)(c) >= 0xc0 && (unsigned)(c) <= 0xfd)
#define utf8_put_char(out, chr) \
((chr) < 0x80 ? out[0]=(char)(chr), out+1 \
: _utf8_put_char(out, (chr)))
/*******************************
* ERRORS *
*******************************/
static atom_t ATOM_query_value;
static atom_t ATOM_fragment;
static atom_t ATOM_path;
static functor_t FUNCTOR_equal2; /* =/2 */
static functor_t FUNCTOR_pair2; /* -/2 */
static functor_t FUNCTOR_uri_components5;
static functor_t FUNCTOR_uri_authority4;
static functor_t FUNCTOR_error2;
static functor_t FUNCTOR_syntax_error1;
static functor_t FUNCTOR_type_error2;
static functor_t FUNCTOR_domain_error2;
static int
syntax_error(const char *culprit)
{ term_t ex;
if ( (ex=PL_new_term_ref()) &&
PL_unify_term(ex,
PL_FUNCTOR, FUNCTOR_error2,
PL_FUNCTOR, FUNCTOR_syntax_error1,
PL_CHARS, culprit,
PL_VARIABLE) )
return PL_raise_exception(ex);
return FALSE;
}
static int
type_error(const char *expected, term_t found)
{ term_t ex;
if ( (ex=PL_new_term_ref()) &&
PL_unify_term(ex,
PL_FUNCTOR, FUNCTOR_error2,
PL_FUNCTOR, FUNCTOR_type_error2,
PL_CHARS, expected,
PL_TERM, found,
PL_VARIABLE) )
return PL_raise_exception(ex);
return FALSE;
}
static int
domain_error(const char *expected, term_t found)
{ term_t ex;
if ( (ex=PL_new_term_ref()) &&
PL_unify_term(ex,
PL_FUNCTOR, FUNCTOR_error2,
PL_FUNCTOR, FUNCTOR_domain_error2,
PL_CHARS, expected,
PL_TERM, found,
PL_VARIABLE) )
return PL_raise_exception(ex);
return FALSE;
}
/*******************************
* ESCAPING *
*******************************/
#define ESC_PATH (CH_PCHAR|CH_EX_PATH)
#define ESC_QUERY (CH_PCHAR|CH_EX_QF)
#define ESC_QVALUE (CH_UNRESERVED|CH_QSUBDELIM|CH_EX_PCHAR|CH_EX_QF)
#define ESC_QNAME (CH_PCHAR)
#define ESC_FRAGMENT (CH_PCHAR|CH_EX_QF)
#define ESC_AUTH (CH_PCHAR)
#define ESC_PASSWD (CH_PCHAR)
#define ESC_USER (CH_PCHAR)
#define ESC_SCHEME (CH_SCHEME)
#define ESC_PORT (CH_DIGIT)
#define ESC_HOST (CH_UNRESERVED|CH_SUBDELIM)
#define CH_ALPHA 0x0001
#define CH_DIGIT 0x0002
#define CH_EX_UNRES 0x0004
#define CH_GENDELIM 0x0008
#define CH_SUBDELIM 0x0010
#define CH_URL 0x0020
#define CH_EX_PCHAR 0x0040
#define CH_EX_QF 0x0080 /* Extra query and fragment chars */
#define CH_EX_SCHEME 0x0100
#define CH_QSUBDELIM 0x0200
#define CH_EX_PATH 0x0400
#define CH_SCHEME (CH_ALPHA|CH_DIGIT|CH_EX_SCHEME)
#define CH_UNRESERVED (CH_ALPHA|CH_DIGIT|CH_EX_UNRES)
#define CH_PCHAR (CH_UNRESERVED|CH_SUBDELIM|CH_EX_PCHAR)
static int charflags[128] = {0};
static int flags_done = 0;
static void
set_flags(const char *from, int flag)
{ for(; *from; from++)
charflags[from[0]&0xff] |= flag;
}
static void
fill_flags()
{ if ( !flags_done )
{ int c;
for(c='a'; c<='z'; c++)
charflags[c] |= CH_ALPHA;
for(c='A'; c<='Z'; c++)
charflags[c] |= CH_ALPHA;
for(c='0'; c<='9'; c++)
charflags[c] |= CH_DIGIT;
set_flags("-._~", CH_EX_UNRES);
set_flags(":/?#[]@", CH_GENDELIM);
set_flags("!$&'()+*,;=", CH_SUBDELIM);
set_flags("!$'()*,;", CH_QSUBDELIM); /* = CH_SUBDELIM - "&=+" */
set_flags(":@", CH_EX_PCHAR);
set_flags("/", CH_EX_PATH);
set_flags("/?", CH_EX_QF);
set_flags("+-.", CH_EX_SCHEME);
set_flags("/:?#&=", CH_URL);
flags_done = TRUE;
}
}
#define no_escape(c, f) ((c < 128) && (charflags[(int)c] & (f)))
#define iri_no_escape(c, f) ((c > 128) || (charflags[(int)c] & (f)))
/* hex(const pl_wchar_t *in, int digits, int *value)
Get <digits> characters from in and interpret them as a hexadecimal
integer. Returns pointer to the end on success or NULL if error.
*/
static const pl_wchar_t *
hex(const pl_wchar_t *in, int digits, int *value)
{ int v = 0;
while(digits-- > 0)
{ int c = *in++;
if ( c >= '0' && c <= '9' )
v = (v<<4) + c - '0';
else if ( c >= 'A' && c <= 'F' )
v = (v<<4) + c + 10 - 'A';
else if ( c >= 'a' && c <= 'f' )
v = (v<<4) + c + 10 - 'a';
else
return NULL;
}
*value = v;
return in;
}
static const pl_wchar_t *
get_encoded_utf8_cont_1(const pl_wchar_t *in, int *val)
{ int c;
if ( in[0] == '%' && hex(in+1, 2, &c) )
{ if ( (c&0xc0) == 0x80 )
{ *val = (c&0x3f);
return in+3;
}
}
return NULL;
}
static const pl_wchar_t *
get_encoded_utf8_cont(const pl_wchar_t *in, int cnt, int *val)
{ int shift = cnt*6;
*val <<= shift;
shift -= 6;
while(cnt-->0)
{ int v0;
if ( (in = get_encoded_utf8_cont_1(in, &v0)) )
{ *val |= (v0<<shift);
shift -= 6;
} else
return NULL;
}
return in;
}
static const pl_wchar_t *
get_encoded_utf8(const pl_wchar_t *in, int *chr)
{ int c1;
if ( in[0] == '%' && hex(in+1, 2, &c1) )
{ in += 3;
if ( ISUTF8_MB(c1) )
{ if ( (c1&0xe0) == 0xc0 ) /* 2-byte */
{ *chr = (c1&0x1f);
return get_encoded_utf8_cont(in, 1, chr);
} else if ( (c1&0xf0) == 0xe0 ) /* 3-byte */
{ *chr = (c1&0xf);
return get_encoded_utf8_cont(in, 2, chr);
} else if ( (c1&0xf8) == 0xf0 ) /* 4-byte */
{ *chr = (c1&0x7);
return get_encoded_utf8_cont(in, 3, chr);
} else if ( (c1&0xfc) == 0xf8 ) /* 5-byte */
{ *chr = (c1&0x3);
return get_encoded_utf8_cont(in, 4, chr);
} else if ( (c1&0xfe) == 0xfc ) /* 6-byte */
{ *chr = (c1&0x1);
return get_encoded_utf8_cont(in, 5, chr);
} else
return NULL;
} else
{ *chr = c1;
return in; /* Encoded ASCII character */
}
}
return NULL;
}
/*******************************
* RANGES *
*******************************/
typedef struct range
{ const pl_wchar_t *start;
const pl_wchar_t *end;
} range;
/*******************************
* CHARACTER BUFFER *
*******************************/
typedef struct charbuf
{ pl_wchar_t *base;
pl_wchar_t *here;
pl_wchar_t *end;
pl_wchar_t tmp[256];
} charbuf;
static void
init_charbuf(charbuf *cb)
{ cb->base = cb->here = cb->tmp;
cb->end = &cb->tmp[sizeof(cb->tmp)/sizeof(pl_wchar_t)];
}
static int
init_charbuf_at_size(charbuf *cb, size_t size)
{ size++;
if ( size < sizeof(cb->tmp)/sizeof(pl_wchar_t) )
cb->base = cb->here = cb->tmp;
else
cb->base = cb->here = PL_malloc(size*sizeof(pl_wchar_t));
return TRUE;
}
static int
add_charbuf(charbuf *cb, int c)
{ if ( cb->here < cb->end )
{ *cb->here++ = c;
} else
{ size_t len = (cb->end-cb->base);
if ( cb->base == cb->tmp )
{ pl_wchar_t *n = PL_malloc(len*2*sizeof(pl_wchar_t));
memcpy(n, cb->base, sizeof(cb->tmp));
cb->base = n;
} else
{ cb->base = PL_realloc(cb->base, len*2*sizeof(pl_wchar_t));
}
cb->here = &cb->base[len];
cb->end = &cb->base[len*2];
*cb->here++ = c;
}
return TRUE;
}
static inline int
hexdigit(int val)
{ if ( val < 10 )
return '0'+val;
return 'A'-10+val;
}
static int
add_encoded_charbuf(charbuf *cb, int c, int flags)
{ if ( no_escape(c, flags) )
{ add_charbuf(cb, c);
} else
{ char tmp[6];
const char *end = utf8_put_char(tmp, c);
const char *s;
for(s=tmp; s<end; s++)
{ int b = s[0]&0xff;
add_charbuf(cb, '%');
add_charbuf(cb, hexdigit(b>>4));
add_charbuf(cb, hexdigit(b&0xf));
}
}
return TRUE;
}
static int
iri_add_encoded_charbuf(charbuf *cb, int c, int flags)
{ if ( iri_no_escape(c, flags) )
{ add_charbuf(cb, c);
} else
{ assert(c < 128);
add_charbuf(cb, '%');
add_charbuf(cb, hexdigit(c>>4));
add_charbuf(cb, hexdigit(c&0xf));
}
return TRUE;
}
static int
add_nchars_charbuf(charbuf *cb, size_t len, const pl_wchar_t *s)
{ if ( cb->here+len <= cb->end )
{ wcsncpy(cb->here, s, len);
cb->here += len;
} else
{ size_t n;
for(n=0; n<len; n++)
add_charbuf(cb, s[n]);
}
return TRUE;
}
static int
range_has_escape(const range *r, int flags)
{ const pl_wchar_t *s = r->start;
for(; s<r->end; s++)
{ if ( s[0] == '%' || (s[0] == '+' && flags == ESC_QVALUE) )
return TRUE;
}
return FALSE;
}
static int
range_is_unreserved(const range *r, int iri, int flags)
{ const pl_wchar_t *s = r->start;
if ( iri )
{ for(; s<r->end; s++)
{ if ( !iri_no_escape(s[0], flags) )
return FALSE;
}
} else
{ for(; s<r->end; s++)
{ if ( !no_escape(s[0], flags) )
return FALSE;
}
}
return TRUE;
}
static int
add_verb_range_charbuf(charbuf *cb, const range *r)
{ return add_nchars_charbuf(cb, r->end-r->start, r->start);
}
static int
add_decoded_range_charbuf(charbuf *cb, const range *r, int flags)
{ const pl_wchar_t *s = r->start;
while(s<r->end)
{ int c;
if ( *s == '%' )
{ const pl_wchar_t *e;
if ( (e=get_encoded_utf8(s, &c)) )
{ s = e;
} else if (hex(s+1, 2, &c) )
{ s += 3;
} else
{ c = *s++;
}
} else if ( *s == '+' && flags == ESC_QVALUE )
{ s++;
c = ' ';
} else
{ c = *s++;
}
add_charbuf(cb, c);
}
return TRUE;
}
static int
add_normalized_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
{ const pl_wchar_t *s = r->start;
fprintf(stderr,"s=%s r->start=%p r->end=%p\n",s,r->start,r->end);
jmp_deb(1);
while(s<r->end)
{ int c;
if ( *s == '%' )
{ const pl_wchar_t *e;
if ( (e=get_encoded_utf8(s, &c)) )
{ s = e;
} else if (hex(s+1, 2, &c) )
{ s += 3;
} else
{ c = *s++;
}
} else if ( *s == '+' && flags == ESC_QVALUE )
{ s++;
c = ' ';
} else
{ c = *s++;
}
if ( iri )
{ iri_add_encoded_charbuf(cb, c, flags);
} else
{ add_encoded_charbuf(cb, c, flags);
}
}
return TRUE;
}
/* add_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
Add a range of characters while normalizing %-encoding. This
implies not to use encoding if it is not needed and upcase
%xx to %XX otherwise.
If iri == TRUE, values >= 128 are not escaped. Otherwise they
use %-encoded UTF-8
*/
static int
add_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
{ if ( range_has_escape(r, flags) )
{ return add_normalized_range_charbuf(cb, r, iri, flags);
} else if ( range_is_unreserved(r, iri, flags) )
{ add_nchars_charbuf(cb, r->end-r->start, r->start);
} else
{ const pl_wchar_t *s = r->start;
if ( iri )
{ while(s<r->end)
iri_add_encoded_charbuf(cb, *s++, flags);
} else
{ while(s<r->end)
add_encoded_charbuf(cb, *s++, flags);
}
}
return TRUE;
}
/* add_lwr_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
Add a range of characters while normalizing %-encoding and
mapping all characters to lowercase.
FIXME: encoding and decoding compatible to add_range_charbuf();
*/
static int
add_lwr_range_charbuf(charbuf *cb, const range *r, int iri, int flags)
{ const pl_wchar_t *s = r->start;
while(s<r->end)
{ int c;
if ( *s == '%' )
{ const pl_wchar_t *e;
if ( (e=get_encoded_utf8(s, &c)) )
{ s = e;
} else if (hex(s+1, 2, &c) )
{ s += 3;
} else
{ c = *s++;
}
} else
{ c = *s++;
}
if ( iri )
iri_add_encoded_charbuf(cb, towlower((wint_t)c), flags);
else
add_encoded_charbuf(cb, towlower((wint_t)c), flags);
}
return TRUE;
}
static void
free_charbuf(charbuf *cb)
{ if ( cb->base != cb->tmp )
PL_free(cb->base);
}
#define TXT_EX_TEXT (CVT_ATOM|CVT_STRING|CVT_EXCEPTION)
static int
get_text_arg(term_t term, int pos, size_t *len, pl_wchar_t **s, int flags)
{ term_t tmp = PL_new_term_ref();
_PL_get_arg(pos, term, tmp);
if ( PL_is_variable(tmp) )
return FALSE;
if ( !PL_get_wchars(tmp, len, s, flags) )
return -1;
return TRUE;
}
/** uri_components(+URI, -Components)
Based on RFC-3986 regular expression:
==
^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
12 3 4 5 6 7 8 9
==
*/
typedef struct uri_component_ranges
{ range scheme;
range authority;
range path;
range query;
range fragment;
} uri_component_ranges;
static const pl_wchar_t *
skip_not(const pl_wchar_t *in, const pl_wchar_t *end, const pl_wchar_t *chars)
{ if ( !chars[1] )
{ for(; in < end; in++)
{ if ( chars[0] == in[0] )
return in;
}
} else
{ for(; in < end; in++)
{ if ( wcschr(chars, in[0]) )
return in;
}
}
return in;
}
static int
unify_range(term_t t, const range *r)
{ if ( r->start )
return PL_unify_wchars(t, PL_ATOM, r->end - r->start, r->start);
return TRUE;
}
static int
parse_uri(uri_component_ranges *ranges, size_t len, const pl_wchar_t *s)
{ const pl_wchar_t *end = &s[len];
const pl_wchar_t *here = s;
const pl_wchar_t *e;
memset(ranges, 0, sizeof(*ranges));
e = skip_not(here, end, L":/?#");
if ( e > s && e[0] == ':' ) /* 1&2 */
{ ranges->scheme.start = s;
ranges->scheme.end = e;
here = e+1;
}
if ( here[0] == '/' && here[1] == '/' ) /* 3 */
{ here += 2; /* 4 */
e = skip_not(here, end, L"/?#");
ranges->authority.start = here;
ranges->authority.end = e;
here = e; /* 5 */
}
e = skip_not(here, end, L"?#");
ranges->path.start = here;
ranges->path.end = e;
here = e; /* 6 */
if ( here[0] == '?' )
{ here++; /* 7 */
e = skip_not(here, end, L"#");
ranges->query.start = here;
ranges->query.end = e;
here = e; /* 8 */
}
if ( here[0] == '#' )
{ here++; /* 9 */
ranges->fragment.start = here;
ranges->fragment.end = end;
}
return TRUE;
}
static foreign_t
uri_components(term_t URI, term_t components)
{ pl_wchar_t *s;
size_t len;
if ( PL_get_wchars(URI, &len, &s, CVT_ATOM|CVT_STRING|CVT_LIST) )
{ uri_component_ranges ranges;
term_t rt = PL_new_term_refs(6);
term_t av = rt+1;
parse_uri(&ranges, len, s);
unify_range(av+0, &ranges.scheme);
unify_range(av+1, &ranges.authority);
unify_range(av+2, &ranges.path);
unify_range(av+3, &ranges.query);
unify_range(av+4, &ranges.fragment);
return (PL_cons_functor_v(rt, FUNCTOR_uri_components5, av) &&
PL_unify(components, rt));
} else if ( PL_is_functor(components, FUNCTOR_uri_components5) )
{ charbuf b;
int rc;
init_charbuf(&b);
/* schema */
if ( (rc=get_text_arg(components, 1, &len, &s, TXT_EX_TEXT)) == TRUE )
{ add_nchars_charbuf(&b, len, s);
add_charbuf(&b, ':');
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
/* authority */
if ( (rc=get_text_arg(components, 2, &len, &s, TXT_EX_TEXT)) == TRUE )
{ add_charbuf(&b, '/');
add_charbuf(&b, '/');
add_nchars_charbuf(&b, len, s);
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
/* path */
if ( (rc=get_text_arg(components, 3, &len, &s, TXT_EX_TEXT)) == TRUE )
{ add_nchars_charbuf(&b, len, s);
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
/* query */
if ( (rc=get_text_arg(components, 4, &len, &s, TXT_EX_TEXT)) == TRUE )
{ if ( len > 0 )
{ add_charbuf(&b, '?');
add_nchars_charbuf(&b, len, s);
}
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
/* fragment */
if ( (rc=get_text_arg(components, 5, &len, &s, TXT_EX_TEXT)) == TRUE )
{ add_charbuf(&b, '#');
add_nchars_charbuf(&b, len, s);
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
rc = PL_unify_wchars(URI, PL_ATOM, b.here-b.base, b.base);
free_charbuf(&b);
return rc;
} else /* generate an error */
{ return PL_get_wchars(URI, &len, &s,
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION);
}
}
/** uri_is_global(+URI) is semidet.
*/
static foreign_t
uri_is_global(term_t URI)
{ pl_wchar_t *s;
size_t len;
if ( PL_get_wchars(URI, &len, &s,
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION) )
{ const pl_wchar_t *e;
const pl_wchar_t *end = &s[len];
range r;
e = skip_not(s, end, L":/?#");
if ( e > s && e[0] == ':' )
{ r.start = s;
r.end = e;
if ( range_is_unreserved(&r, TRUE, CH_SCHEME) )
return TRUE;
}
}
return FALSE;
}
/*******************************
* QUERY-STRING *
*******************************/
static int
unify_decoded_atom(term_t t, range *r, int flags)
{ if ( range_has_escape(r, flags) )
{ charbuf b;
int rc;
init_charbuf(&b);
add_decoded_range_charbuf(&b, r, flags);
rc = PL_unify_wchars(t, PL_ATOM, b.here - b.base, b.base);
free_charbuf(&b);
return rc;
} else
{ return unify_range(t, r);
}
}
static int
unify_query_string_components(term_t list, size_t len, const pl_wchar_t *qs)
{ if ( len == 0 )
{ return PL_unify_nil(list);
} else
{ term_t tail = PL_copy_term_ref(list);
term_t head = PL_new_term_ref();
term_t eq = PL_new_term_refs(3);
term_t nv = eq+1;
const pl_wchar_t *end = &qs[len];
while(qs < end)
{ range name, value;
name.start = qs;
name.end = skip_not(qs, end, L"=");
if ( name.end < end )
{ value.start = name.end+1;
value.end = skip_not(value.start, end, L"&");
qs = value.end+1;
} else
{ return syntax_error("illegal_uri_query");
}
PL_put_variable(nv+0);
PL_put_variable(nv+1);
unify_decoded_atom(nv+0, &name, ESC_QNAME);
unify_decoded_atom(nv+1, &value, ESC_QVALUE);
if ( !PL_cons_functor_v(eq, FUNCTOR_equal2, nv) ||
!PL_unify_list(tail, head, tail) ||
!PL_unify(head, eq) )
return FALSE;
}
return PL_unify_nil(tail);
}
}
static int
add_encoded_term_charbuf(charbuf *cb, term_t value, int flags)
{ pl_wchar_t *s;
range r;
size_t len;
if ( !PL_get_wchars(value, &len, &s, CVT_ATOMIC|CVT_EXCEPTION) )
return FALSE;
r.start = s;
r.end = r.start+len;
if ( range_is_unreserved(&r, TRUE, flags) )
{ add_nchars_charbuf(cb, r.end-r.start, r.start);
} else
{ const pl_wchar_t *s = r.start;
while(s<r.end)
add_encoded_charbuf(cb, *s++, flags);
}
return TRUE;
}
/** uri_query_components(+QueryString, -ValueList) is det.
*/
static foreign_t
uri_query_components(term_t string, term_t list)
{ pl_wchar_t *s;
size_t len;
if ( PL_get_wchars(string, &len, &s, CVT_ATOM|CVT_STRING|CVT_LIST) )
{ return unify_query_string_components(list, len, s);
} else if ( PL_is_list(list) )
{ term_t tail = PL_copy_term_ref(list);
term_t head = PL_new_term_ref();
term_t nv = PL_new_term_refs(2);
charbuf out;
int rc;
fill_flags();
init_charbuf(&out);
while( PL_get_list(tail, head, tail) )
{ atom_t fname;
int arity;
if ( PL_is_functor(head, FUNCTOR_equal2) ||
PL_is_functor(head, FUNCTOR_pair2) )
{ _PL_get_arg(1, head, nv+0);
_PL_get_arg(2, head, nv+1);
} else if ( PL_get_name_arity(head, &fname, &arity) && arity == 1 )
{ PL_put_atom(nv+0, fname);
_PL_get_arg(1, head, nv+1);
} else
{ free_charbuf(&out);
return type_error("name_value", head);
}
if ( out.here != out.base )
add_charbuf(&out, '&');
if ( !add_encoded_term_charbuf(&out, nv+0, ESC_QNAME) )
{ free_charbuf(&out);
return FALSE;
}
add_charbuf(&out, '=');
if ( !add_encoded_term_charbuf(&out, nv+1, ESC_QVALUE) )
{ free_charbuf(&out);
return FALSE;
}
}
rc = PL_unify_wchars(string, PL_ATOM, out.here-out.base, out.base);
free_charbuf(&out);
return rc;
} else
{ return PL_get_wchars(string, &len, &s,
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION);
}
return FALSE;
}
/** uri_encoded(+What, +String, -Encoded)
*/
static foreign_t
uri_encoded(term_t what, term_t qv, term_t enc)
{ pl_wchar_t *s;
size_t len;
atom_t w;
int flags;
if ( !PL_get_atom(what, &w) )
return type_error("atom", what);
if ( w == ATOM_query_value )
flags = ESC_QVALUE;
else if ( w == ATOM_fragment )
flags = ESC_FRAGMENT;
else if ( w == ATOM_path )
flags = ESC_PATH;
else
return domain_error("uri_component", what);
fill_flags();
if ( !PL_is_variable(qv) )
{ charbuf out;
int rc;
init_charbuf(&out);
if ( !add_encoded_term_charbuf(&out, qv, flags) )
{ free_charbuf(&out);
return FALSE;
}
rc = PL_unify_wchars(enc, PL_ATOM, out.here-out.base, out.base);
free_charbuf(&out);
return rc;
} else if ( PL_get_wchars(enc, &len, &s, CVT_ATOM|CVT_STRING|CVT_EXCEPTION) )
{ range r;
r.start = s;
r.end = s+len;
return unify_decoded_atom(qv, &r, flags);
} else
{ return FALSE;
}
}
/*******************************
* AUTHORITY *
*******************************/
static int
unify_uri_authority_components(term_t components,
size_t len, const pl_wchar_t *s)
{ const pl_wchar_t *end = &s[len];
const pl_wchar_t *e;
range user = {0};
range passwd = {0};
range host = {0};
range port = {0};
term_t t = PL_new_term_refs(5);
term_t av = t+1;
if ( (e=skip_not(s, end, L"@")) && e<end )
{ user.start = s;
user.end = e;
s = e+1;
if ( (e=skip_not(user.start, user.end, L":")) && e<user.end )
{ passwd.start = e+1;
passwd.end = user.end;
user.end = e;
}
}
host.start = s;
host.end = skip_not(s, end, L":");
if ( host.end < end )
{ port.start = host.end+1;
port.end = end;
}
if ( user.start )
unify_decoded_atom(av+0, &user, ESC_USER);
if ( passwd.start )
unify_decoded_atom(av+1, &passwd, ESC_PASSWD);
unify_decoded_atom(av+2, &host, ESC_HOST);
if ( port.start )
{ wchar_t *ep;
long pn = wcstol(port.start, &ep, 10);
if ( ep == port.end )
{ if ( !PL_put_integer(av+3, pn) )
return FALSE;
} else
{ unify_decoded_atom(av+3, &port, ESC_PORT);
}
}
return (PL_cons_functor_v(t, FUNCTOR_uri_authority4, av) &&
PL_unify(components, t));
}
/** uri_authority_components(+Authority, -Components) is det.
uri_authority_components(-Authority, +Components) is det.
*/
static foreign_t
uri_authority_components(term_t Authority, term_t components)
{ pl_wchar_t *s;
size_t len;
if ( PL_get_wchars(Authority, &len, &s, CVT_ATOM|CVT_STRING|CVT_LIST) )
{ return unify_uri_authority_components(components, len, s);
} else if ( PL_is_functor(components, FUNCTOR_uri_authority4) )
{ charbuf b;
int rc;
init_charbuf(&b);
if ( (rc=get_text_arg(components, 1, &len, &s, TXT_EX_TEXT)) == TRUE )
{ add_nchars_charbuf(&b, len, s);
if ( (rc=get_text_arg(components, 2, &len, &s, TXT_EX_TEXT)) == TRUE )
{ add_charbuf(&b, ':');
add_nchars_charbuf(&b, len, s);
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
add_charbuf(&b, '@');
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
if ( (rc=get_text_arg(components, 3, &len, &s, TXT_EX_TEXT)) == TRUE )
{ add_nchars_charbuf(&b, len, s);
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
if ( (rc=get_text_arg(components, 4, &len, &s,
TXT_EX_TEXT|CVT_INTEGER)) == TRUE )
{ add_charbuf(&b, ':');
add_nchars_charbuf(&b, len, s);
} else if ( rc == -1 )
{ free_charbuf(&b);
return FALSE;
}
rc = PL_unify_wchars(Authority, PL_ATOM, b.here-b.base, b.base);
free_charbuf(&b);
return rc;
} else
{ return PL_get_wchars(Authority, &len, &s,
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION);
}
}
/*******************************
* NORMALIZATION *
*******************************/
static int
normalize_in_charbuf(charbuf *cb, uri_component_ranges *ranges, int iri)
{ fill_flags();
if ( ranges->scheme.start )
{ add_lwr_range_charbuf(cb, &ranges->scheme, iri, ESC_SCHEME);
add_charbuf(cb, ':');
}
if ( ranges->authority.start )
{ add_charbuf(cb, '/');
add_charbuf(cb, '/');
add_lwr_range_charbuf(cb, &ranges->authority, iri, ESC_AUTH);
}
if ( ranges->path.end > ranges->path.start )
{ charbuf pb;
charbuf path;
size_t len;
init_charbuf(&pb);
add_range_charbuf(&pb, &ranges->path, iri, ESC_PATH);
init_charbuf_at_size(&path, pb.here-pb.base);
len = removed_dot_segments(pb.here-pb.base, pb.base, path.base);
add_nchars_charbuf(cb, len, path.base);
free_charbuf(&path);
free_charbuf(&pb);
}
if ( ranges->query.start )
{ add_charbuf(cb, '?');
add_range_charbuf(cb, &ranges->query, iri, ESC_QUERY);
}
if ( ranges->fragment.start )
{ add_charbuf(cb, '#');
add_range_charbuf(cb, &ranges->fragment, iri, ESC_FRAGMENT);
}
return TRUE;
}
static foreign_t
normalized(term_t URI, term_t CannonicalURI, int iri)
{ pl_wchar_t *s;
size_t len;
if ( PL_get_wchars(URI, &len, &s,
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION) )
{ uri_component_ranges ranges;
charbuf b;
int rc;
parse_uri(&ranges, len, s);
init_charbuf(&b);
normalize_in_charbuf(&b, &ranges, iri);
rc = PL_unify_wchars(CannonicalURI, PL_ATOM, b.here-b.base, b.base);
free_charbuf(&b);
return rc;
}
return FALSE;
}
/** uri_normalized(+URI, -CannonicalURI)
*/
static foreign_t
uri_normalized(term_t URI, term_t CannonicalURI)
{ return normalized(URI, CannonicalURI, FALSE);
}
/** uri_normalized_iri(+URI, -CannonicalIRI)
*/
static foreign_t
uri_normalized_iri(term_t URI, term_t CannonicalURI)
{ return normalized(URI, CannonicalURI, TRUE);
}
static int
ranges_in_charbuf(charbuf *cb, uri_component_ranges *ranges)
{ if ( ranges->scheme.start )
{ add_verb_range_charbuf(cb, &ranges->scheme);
add_charbuf(cb, ':');
}
if ( ranges->authority.start )
{ add_charbuf(cb, '/');
add_charbuf(cb, '/');
add_verb_range_charbuf(cb, &ranges->authority);
}
add_verb_range_charbuf(cb, &ranges->path);
if ( ranges->query.start )
{ add_charbuf(cb, '?');
add_verb_range_charbuf(cb, &ranges->query);
}
if ( ranges->fragment.start )
{ add_charbuf(cb, '#');
add_verb_range_charbuf(cb, &ranges->fragment);
}
return TRUE;
}
typedef struct
{ atom_t atom;
pl_wchar_t *text;
uri_component_ranges ranges;
} base_cache;
#ifdef _REENTRANT
#include <pthread.h>
static pthread_key_t base_key;
static void
free_base_cache(void *cache)
{ base_cache *base = cache;
if ( PL_query(PL_QUERY_HALTING) )
return;
if ( base->atom )
{ PL_unregister_atom(base->atom);
PL_free(base->text);
}
PL_free(base);
}
static base_cache *
myBase()
{ base_cache *base;
if ( (base=pthread_getspecific(base_key)) )
return base;
base = PL_malloc(sizeof(*base));
memset(base, 0, sizeof(*base));
pthread_setspecific(base_key, base);
return base;
}
#else
static base_cache base_store;
#define myBase() &base_store;
#endif
static const uri_component_ranges *
base_ranges(term_t t)
{ atom_t a;
if ( PL_get_atom(t, &a) )
{ base_cache *base = myBase();
if ( base->atom != a )
{ size_t len;
pl_wchar_t *s;
if ( base->atom )
{ PL_unregister_atom(base->atom);
PL_free(base->text);
}
if ( !PL_get_wchars(t, &len, &s, CVT_ATOM|BUF_MALLOC) )
return NULL;
base->atom = a;
PL_register_atom(a);
base->text = s;
parse_uri(&base->ranges, len, s);
}
return &base->ranges;
} else
{ type_error("atom", t);
return NULL;
}
}
static foreign_t
resolve(term_t Rel, term_t Base, term_t URI, int normalize, int iri)
{ pl_wchar_t *s;
size_t slen;
uri_component_ranges s_ranges, t_ranges;
int rc;
size_t len;
charbuf out, pb, path;
init_charbuf(&pb); /* path-buffer */
if ( PL_get_wchars(Rel, &slen, &s,
CVT_ATOM|CVT_STRING|CVT_LIST|CVT_EXCEPTION) )
{ parse_uri(&s_ranges, slen, s);
if ( s_ranges.scheme.start )
{ t_ranges = s_ranges;
} else
{ const uri_component_ranges *b_ranges;
if ( !(b_ranges = base_ranges(Base)) )
return FALSE;
memset(&t_ranges, 0, sizeof(t_ranges));
if ( s_ranges.authority.start )
{ t_ranges.authority = s_ranges.authority;
t_ranges.path = s_ranges.path;
t_ranges.query = s_ranges.query;
} else
{ if ( s_ranges.path.start == s_ranges.path.end )
{ t_ranges.path = b_ranges->path;
if ( s_ranges.query.start )
t_ranges.query = s_ranges.query;
else
t_ranges.query = b_ranges->query;
} else
{ if ( s_ranges.path.start[0] == '/' )
{ t_ranges.path = s_ranges.path;
} else
{ if ( b_ranges->authority.start &&
b_ranges->path.start == b_ranges->path.end )
{ add_charbuf(&pb, '/');
add_verb_range_charbuf(&pb, &s_ranges.path);
} else
{ range path = b_ranges->path;
path.end = remove_last_segment(path.start, path.end);
add_verb_range_charbuf(&pb, &path);
add_verb_range_charbuf(&pb, &s_ranges.path);
t_ranges.path.start = pb.base;
t_ranges.path.end = pb.here;
}
}
t_ranges.query = s_ranges.query;
}
t_ranges.authority = b_ranges->authority;
}
t_ranges.scheme = b_ranges->scheme;
t_ranges.fragment = s_ranges.fragment;
}
} else
return FALSE;
init_charbuf(&out); /* output buffer */
if ( normalize )
{ normalize_in_charbuf(&out, &t_ranges, iri);
} else
{ init_charbuf_at_size(&path, t_ranges.path.end - t_ranges.path.start);
len = removed_dot_segments(t_ranges.path.end - t_ranges.path.start,
t_ranges.path.start,
path.base);
t_ranges.path.start = path.base;
t_ranges.path.end = path.base+len;
free_charbuf(&pb);
ranges_in_charbuf(&out, &t_ranges);
}
rc = PL_unify_wchars(URI, PL_ATOM, out.here-out.base, out.base);
free_charbuf(&out);
return rc;
}
/** uri_resolve(+Relative, +Base, -Absolute) is det.
*/
static foreign_t
uri_resolve(term_t Rel, term_t Base, term_t URI)
{ return resolve(Rel, Base, URI, FALSE, FALSE);
}
/** uri_normalized(+Relative, +Base, -Absolute) is det.
*/
static foreign_t
uri_normalized3(term_t Rel, term_t Base, term_t URI)
{ return resolve(Rel, Base, URI, TRUE, FALSE);
}
/** uri_normalized_iri(+Relative, +Base, -Absolute) is det.
*/
static foreign_t
uri_normalized_iri3(term_t Rel, term_t Base, term_t IRI)
{ return resolve(Rel, Base, IRI, TRUE, TRUE);
}
/*******************************
* PATH LOGIC *
*******************************/
/* http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-dot-segments
*/
static pl_wchar_t *
remove_last_segment(const pl_wchar_t *base, const pl_wchar_t *o)
{ while(o>base && o[-1] != '/' )
o--;
return (pl_wchar_t*) o;
}
static inline int
fetch(const pl_wchar_t *in, const pl_wchar_t *end, int at)
{ if ( in+at>=end )
return 0;
return in[at];
}
static size_t
removed_dot_segments(size_t len, const pl_wchar_t *in, pl_wchar_t *out)
{ const pl_wchar_t *end = &in[len];
pl_wchar_t *o = out;
while(in<end)
{ if ( in[0] == '.' )
{ if ( fetch(in, end, 1) == '/' ||
(fetch(in, end, 1) == '.' && fetch(in, end, 2) == '/') )
{ in += 2; /* 2A */
continue;
}
}
if ( in[0] == '/' && fetch(in, end, 1) == '.' )
{ if ( fetch(in, end, 2) == '/' )
{ in += 2; /* 2B "/./" --> "/" */
continue;
}
if ( !fetch(in, end, 2) )
{ *o++ = '/'; /* 2B "/." --> "/" (and close) */
in += 2;
continue;
}
if ( fetch(in, end, 2) == '.' )
{ if ( fetch(in, end, 3) == '/' )
{ in += 3; /* 2C "/../" --> "/" */
o = remove_last_segment(out, o);
if ( o>out ) o--; /* delete / */
continue;
}
if ( !fetch(in, end, 3) )
{ o = remove_last_segment(out, o);
if ( o>out ) o--; /* delete / */
*o++ = '/';
in += 3;
continue;
}
}
}
if ( in[0] == '.' )
{ if ( !fetch(in, end, 1) )
{ in++; /* 3D */
continue;
}
if ( fetch(in, end, 1) == '.' && !fetch(in, end, 2) )
{ in += 2; /* 3D */
continue;
}
}
if ( in[0] == '/' )
*o++ = *in++;
while( in < end && in[0] != '/' )
*o++ = *in++;
}
return o-out;
}
/*******************************
* IRI HANDLING *
*******************************/
#define utf8_put_char(out, chr) \
((chr) < 0x80 ? out[0]=(char)(chr), out+1 \
: _utf8_put_char(out, (chr)))
static char *
_utf8_put_char(char *out, int chr)
{ if ( chr < 0x80 )
{ *out++ = chr;
} else if ( chr < 0x800 )
{ *out++ = 0xc0|((chr>>6)&0x1f);
*out++ = 0x80|(chr&0x3f);
} else if ( chr < 0x10000 )
{ *out++ = 0xe0|((chr>>12)&0x0f);
*out++ = 0x80|((chr>>6)&0x3f);
*out++ = 0x80|(chr&0x3f);
} else if ( chr < 0x200000 )
{ *out++ = 0xf0|((chr>>18)&0x07);
*out++ = 0x80|((chr>>12)&0x3f);
*out++ = 0x80|((chr>>6)&0x3f);
*out++ = 0x80|(chr&0x3f);
} else if ( chr < 0x4000000 )
{ *out++ = 0xf8|((chr>>24)&0x03);
*out++ = 0x80|((chr>>18)&0x3f);
*out++ = 0x80|((chr>>12)&0x3f);
*out++ = 0x80|((chr>>6)&0x3f);
*out++ = 0x80|(chr&0x3f);
} else if ( (unsigned)chr < 0x80000000 )
{ *out++ = 0xfc|((chr>>30)&0x01);
*out++ = 0x80|((chr>>24)&0x3f);
*out++ = 0x80|((chr>>18)&0x3f);
*out++ = 0x80|((chr>>12)&0x3f);
*out++ = 0x80|((chr>>6)&0x3f);
*out++ = 0x80|(chr&0x3f);
}
return out;
}
/** uri_iri(+URI, -IRI) is det.
uri_iri(-URI, +IRI) is det.
Perform %- and UTF-8 encoding/decoding to translate between a URI and
IRI
*/
static foreign_t
uri_iri(term_t URI, term_t IRI)
{ if ( !PL_is_variable(URI) )
return uri_normalized_iri(URI, IRI);
else
return uri_normalized(IRI, URI);
}
/*******************************
* REGISTRATION *
*******************************/
#define MKATOM(n) \
ATOM_ ## n = PL_new_atom(#n)
#define MKFUNCTOR(n,a) \
FUNCTOR_ ## n ## a = PL_new_functor(PL_new_atom(#n), a)
install_t
install_uri()
{ MKATOM(query_value);
MKATOM(fragment);
MKATOM(path);
MKFUNCTOR(uri_components, 5);
MKFUNCTOR(uri_authority, 4);
MKFUNCTOR(error, 2);
MKFUNCTOR(syntax_error, 1);
MKFUNCTOR(type_error, 2);
MKFUNCTOR(domain_error, 2);
FUNCTOR_equal2 = PL_new_functor(PL_new_atom("="), 2);
FUNCTOR_pair2 = PL_new_functor(PL_new_atom("-"), 2);
#ifdef _REENTRANT
pthread_key_create(&base_key, free_base_cache);
#endif
PL_register_foreign("uri_components", 2, uri_components, 0);
PL_register_foreign("uri_is_global", 1, uri_is_global, 0);
PL_register_foreign("uri_normalized", 2, uri_normalized, 0);
PL_register_foreign("uri_normalized_iri", 2, uri_normalized_iri, 0);
PL_register_foreign("uri_resolve", 3, uri_resolve, 0);
PL_register_foreign("uri_normalized", 3, uri_normalized3, 0);
PL_register_foreign("uri_normalized_iri", 3, uri_normalized_iri3, 0);
PL_register_foreign("uri_query_components", 2, uri_query_components, 0);
PL_register_foreign("uri_authority_components",
2, uri_authority_components, 0);
PL_register_foreign("uri_encoded", 3, uri_encoded, 0);
PL_register_foreign("uri_iri", 2, uri_iri, 0);
}