286 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Prolog
		
	
	
	
	
	
			
		
		
	
	
			286 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Prolog
		
	
	
	
	
	
/*  Part of SWI-Prolog
 | 
						|
 | 
						|
    Author:        Jan Wielemaker
 | 
						|
    E-mail:        J.Wielemaker@cs.vu.nl
 | 
						|
    WWW:           http://www.swi-prolog.org
 | 
						|
    Copyright (C): 2009, VU University Amsterdam
 | 
						|
 | 
						|
    This program is free software; you can redistribute it and/or
 | 
						|
    modify it under the terms of the GNU General Public License
 | 
						|
    as published by the Free Software Foundation; either version 2
 | 
						|
    of the License, or (at your option) any later version.
 | 
						|
 | 
						|
    This program is distributed in the hope that it will be useful,
 | 
						|
    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
						|
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | 
						|
    GNU General Public License for more details.
 | 
						|
 | 
						|
    You should have received a copy of the GNU General Public
 | 
						|
    License along with this library; if not, write to the Free Software
 | 
						|
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | 
						|
 | 
						|
    As a special exception, if you link this library with other files,
 | 
						|
    compiled with a Free Software compiler, to produce an executable, this
 | 
						|
    library does not by itself cause the resulting executable to be covered
 | 
						|
    by the GNU General Public License. This exception does not however
 | 
						|
    invalidate any other reasons why the executable file might be covered by
 | 
						|
    the GNU General Public License.
 | 
						|
*/
 | 
						|
 | 
						|
:- module(uri,
 | 
						|
	  [ uri_components/2,		% ?URI, ?Components
 | 
						|
	    uri_data/3,			% ?Field, +Components, ?Data
 | 
						|
	    uri_data/4,			% +Field, +Components, +Data, -New
 | 
						|
 | 
						|
	    uri_normalized/2,		% +URI, -NormalizedURI
 | 
						|
	    uri_normalized_iri/2,	% +URI, -NormalizedIRI
 | 
						|
	    uri_normalized/3,		% +URI, +Base, -NormalizedURI
 | 
						|
	    uri_normalized_iri/3,	% +URI, +Base, -NormalizedIRI
 | 
						|
	    uri_resolve/3,		% +URI, +Base, -AbsURI
 | 
						|
	    uri_is_global/1,		% +URI
 | 
						|
	    uri_query_components/2,	% ?QueryString, ?NameValueList
 | 
						|
	    uri_authority_components/2,	% ?Authority, ?Components
 | 
						|
	    uri_authority_data/3,	% ?Field, ?Components, ?Data
 | 
						|
					% Encoding
 | 
						|
	    uri_encoded/3,		% +Component, ?Value, ?Encoded
 | 
						|
	    uri_file_name/2,		% ?URI, ?Path
 | 
						|
	    uri_iri/2			% ?URI, ?IRI
 | 
						|
	  ]).
 | 
						|
 | 
						|
:- use_module(library(shlib)).
 | 
						|
 | 
						|
:- use_foreign_library(foreign(uri)).
 | 
						|
 | 
						|
/** <module> Process URIs
 | 
						|
 | 
						|
This  library  provides   high-performance    C-based   primitives   for
 | 
						|
manipulating URIs. We decided for a  C-based implementation for the much
 | 
						|
better performance on raw character  manipulation. Notably, URI handling
 | 
						|
primitives are used in  time-critical  parts   of  RDF  processing. This
 | 
						|
implementation is based on RFC-3986:
 | 
						|
 | 
						|
	http://labs.apache.org/webarch/uri/rfc/rfc3986.html
 | 
						|
 | 
						|
The URI processing in this library is  rather liberal. That is, we break
 | 
						|
URIs according to the rules, but we  do not validate that the components
 | 
						|
are valid. Also, percent-decoding for IRIs   is  liberal. It first tries
 | 
						|
UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.
 | 
						|
 | 
						|
Earlier experience has shown that strict   enforcement of the URI syntax
 | 
						|
results in many errors that  are   accepted  by  many other web-document
 | 
						|
processing tools.
 | 
						|
*/
 | 
						|
 | 
						|
%%	uri_components(+URI, -Components) is det.
 | 
						|
%%	uri_components(-URI, +Components) is det.
 | 
						|
%
 | 
						|
%	Break a URI  into  its  5   basic  components  according  to the
 | 
						|
%	RFC-3986 regular expression:
 | 
						|
%
 | 
						|
%	    ==
 | 
						|
%	    ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 | 
						|
%	     12            3  4          5       6  7        8 9
 | 
						|
%	    ==
 | 
						|
%
 | 
						|
%	@param Components is a term uri_components(Scheme, Authority,
 | 
						|
%	Path, Search, Fragment).  See uri_data/3 for accessing this
 | 
						|
%	structure.
 | 
						|
 | 
						|
%%	uri_data(?Field, +Components, ?Data) is semidet.
%
%	Provide access to the uri_components/5 structure. True when Data
%	is the argument of Components that is stored under Field. The
%	defined field-names are: =scheme=, =authority=, =path=, =search=
%	and =fragment=. If Field is unbound, the fields are enumerated
%	in this order on backtracking.

uri_data(scheme, Components, Scheme) :-
	Components = uri_components(Scheme, _, _, _, _).
uri_data(authority, Components, Authority) :-
	Components = uri_components(_, Authority, _, _, _).
uri_data(path, Components, Path) :-
	Components = uri_components(_, _, Path, _, _).
uri_data(search, Components, Search) :-
	Components = uri_components(_, _, _, Search, _).
uri_data(fragment, Components, Fragment) :-
	Components = uri_components(_, _, _, _, Fragment).
 | 
						|
 | 
						|
%%	uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
%
%	NewComponents is a copy of the uri_components/5 term Components
%	in which the argument named by Field is replaced by Data. All
%	other arguments are shared with Components. Field is one of
%	=scheme=, =authority=, =path=, =search= or =fragment=.

uri_data(scheme, Old, Scheme, New) :-
	Old = uri_components(_, Authority, Path, Search, Fragment),
	New = uri_components(Scheme, Authority, Path, Search, Fragment).
uri_data(authority, Old, Authority, New) :-
	Old = uri_components(Scheme, _, Path, Search, Fragment),
	New = uri_components(Scheme, Authority, Path, Search, Fragment).
uri_data(path, Old, Path, New) :-
	Old = uri_components(Scheme, Authority, _, Search, Fragment),
	New = uri_components(Scheme, Authority, Path, Search, Fragment).
uri_data(search, Old, Search, New) :-
	Old = uri_components(Scheme, Authority, Path, _, Fragment),
	New = uri_components(Scheme, Authority, Path, Search, Fragment).
uri_data(fragment, Old, Fragment, New) :-
	Old = uri_components(Scheme, Authority, Path, Search, _),
	New = uri_components(Scheme, Authority, Path, Search, Fragment).
 | 
						|
 | 
						|
%%	uri_normalized(+URI, -NormalizedURI) is det.
 | 
						|
%
 | 
						|
%	NormalizedURI is the normalized form   of  URI. Normalization is
 | 
						|
%	syntactic and involves the following steps:
 | 
						|
%
 | 
						|
%	    * 6.2.2.1. Case Normalization
 | 
						|
%	    * 6.2.2.2. Percent-Encoding Normalization
 | 
						|
%	    * 6.2.2.3. Path Segment Normalization
 | 
						|
 | 
						|
%%	uri_normalized_iri(+URI, -NormalizedIRI) is det.
 | 
						|
%
 | 
						|
%	As uri_normalized/2, but percent-encoding is translated into IRI
 | 
						|
%	Unicode characters. The translation  is   liberal:  valid  UTF-8
 | 
						|
%	sequences  of  %-encoded  bytes  are    mapped  to  the  Unicode
 | 
						|
%	character. Other %XX-sequences are mapped   to the corresponding
 | 
						|
%	ISO-Latin-1 character and sole % characters are left untouched.
 | 
						|
%
 | 
						|
%	@see uri_iri/2.
 | 
						|
 | 
						|
 | 
						|
%%	uri_is_global(+URI) is semidet.
 | 
						|
%
 | 
						|
%	True if URI has a scheme. The semantics  is the same as the code
 | 
						|
%	below, but the implementation is more   efficient as it does not
 | 
						|
%	need to parse the  other  components,   nor  needs  to  bind the
 | 
						|
%	scheme.
 | 
						|
%
 | 
						|
%	==
 | 
						|
%	uri_is_global(URI) :-
 | 
						|
%		uri_components(URI, Components),
 | 
						|
%		uri_data(scheme, Components, Scheme),
 | 
						|
%		nonvar(Scheme).
 | 
						|
%	==
 | 
						|
 | 
						|
%%	uri_resolve(+URI, +Base, -GlobalURI) is det.
 | 
						|
%
 | 
						|
%	Resolve a possibly local URI relative   to Base. This implements
 | 
						|
%	http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
 | 
						|
 | 
						|
%%	uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
 | 
						|
%
 | 
						|
%	NormalizedGlobalURI is the normalized global version of URI.
 | 
						|
%	Behaves as if defined by:
 | 
						|
%
 | 
						|
%	==
 | 
						|
%	uri_normalized(URI, Base, NormalizedGlobalURI) :-
 | 
						|
%		uri_resolve(URI, Base, GlobalURI),
 | 
						|
%		uri_normalized(GlobalURI, NormalizedGlobalURI).
 | 
						|
%	==
 | 
						|
 | 
						|
%%	uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
 | 
						|
%
 | 
						|
%	NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
 | 
						|
%	as if defined by:
 | 
						|
%
 | 
						|
%	==
 | 
						|
%	uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
 | 
						|
%		uri_resolve(URI, Base, GlobalURI),
 | 
						|
%		uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
 | 
						|
%	==
 | 
						|
 | 
						|
%%	uri_query_components(+String, -Query) is det.
 | 
						|
%%	uri_query_components(-String, +Query) is det.
 | 
						|
%
 | 
						|
%	Perform encoding and decoding of a URI query string.  Query is a
 | 
						|
%	list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
 | 
						|
%	query elements of the forms Name(Value)  and Name-Value are also
 | 
						|
%	accepted to enhance interoperability with   the option and pairs
 | 
						|
%	libraries.  E.g.
 | 
						|
%
 | 
						|
%	==
 | 
						|
%	?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
 | 
						|
%	QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
 | 
						|
%
 | 
						|
%	?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
 | 
						|
%	Q = [a=b, c='d+w', n='VU Amsterdam'].
 | 
						|
%	==
 | 
						|
 | 
						|
 | 
						|
%%	uri_authority_components(+Authority, -Components) is det.
 | 
						|
%%	uri_authority_components(-Authority, +Components) is det.
 | 
						|
%
 | 
						|
%	Break down the authority component of a URI.  The fields of the
 | 
						|
%	structure Components can be accessed using uri_authority_data/3.
 | 
						|
 | 
						|
%%	uri_authority_data(+Field, ?Components, ?Data) is semidet.
%
%	Provide access to the uri_authority/4 structure. True when Data
%	is the argument of Components that is stored under Field. The
%	defined field-names are: =user=, =password=, =host= and =port=.

uri_authority_data(user, Components, User) :-
	Components = uri_authority(User, _, _, _).
uri_authority_data(password, Components, Password) :-
	Components = uri_authority(_, Password, _, _).
uri_authority_data(host, Components, Host) :-
	Components = uri_authority(_, _, Host, _).
uri_authority_data(port, Components, Port) :-
	Components = uri_authority(_, _, _, Port).
 | 
						|
 | 
						|
 | 
						|
%%	uri_encoded(+Component, +Value, -Encoded) is det.
 | 
						|
%%	uri_encoded(+Component, -Value, +Encoded) is det.
 | 
						|
%
 | 
						|
%	Encoded  is  the  URI   encoding    for   Value.  When  encoding
 | 
						|
%	(Value->Encoded), Component specifies the   URI  component where
 | 
						|
%	the value is used. It  is   one  of =query_value=, =fragment= or
 | 
						|
%	=path=.  Besides  alphanumerical  characters,    the   following
 | 
						|
%	characters are passed verbatim (the  set   is  split  in logical
 | 
						|
%	groups according to RFC3986).
 | 
						|
%
 | 
						|
%	    $ query_value, fragment :
 | 
						|
%	    "-._~" | "!$'()*,;" | ":@" | "/?"
 | 
						|
%	    $ path :
 | 
						|
%	    "-._~" | "!$&'()*,;=" | ":@" | "/"
 | 
						|
 | 
						|
 | 
						|
%%	uri_iri(+URI, -IRI) is det.
 | 
						|
%%	uri_iri(-URI, +IRI) is det.
 | 
						|
%
 | 
						|
%	Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
 | 
						|
%	a fully expanded  Unicode  string.   Unicode  strings  are first
 | 
						|
%	encoded into UTF-8, after which %-encoding takes place.
 | 
						|
%
 | 
						|
%	@error syntax_error(Culprit) in mode (+,-) if URI is not a
 | 
						|
%	legally percent-encoded UTF-8 string.
 | 
						|
 | 
						|
 | 
						|
%%	uri_file_name(+URI, -FileName) is semidet.
%%	uri_file_name(-URI, +FileName) is det.
%
%	Convert between a URI and a   local  file name. This protocol is
%	covered by RFC 1738. Please note   that file-URIs use _absolute_
%	paths.
%
%	In mode (+,-) the predicate succeeds only if URI explicitly
%	carries the scheme =file= and its authority is absent, empty or
%	=localhost=. A reference without a scheme (e.g. =|/foo/bar|=)
%	is _not_ a file-URI and makes the predicate fail. The path is
%	percent-decoded and, on Windows, the leading / before a drive
%	letter is removed.
%
%	In mode (-,+) a possibly relative FileName is first translated
%	into an absolute one using absolute_file_name/2. The result gets
%	a leading / if needed, is percent-encoded and combined with the
%	scheme =file= and an empty authority.
%
%	The predicate fails silently if both arguments are unbound.

uri_file_name(URI, FileName) :-
	nonvar(URI), !,
	uri_components(URI, Components),
	uri_data(scheme, Components, Scheme),
	Scheme == file,			% ==/2: a missing scheme is left unbound
					% and must not be unified with `file'
	(   uri_data(authority, Components, '')
	->  true			% also accepts an absent authority
	;   uri_data(authority, Components, localhost)
	),
	uri_data(path, Components, FileNameEnc),
	uri_encoded(path, FileName0, FileNameEnc),
	delete_leading_slash(FileName0, FileName).
uri_file_name(URI, FileName) :-
	nonvar(FileName), !,
	absolute_file_name(FileName, Path0),
	ensure_leading_slash(Path0, Path),
	uri_encoded(path, Path, PathEnc),
	uri_data(scheme, Components, file),	% build the components term
	uri_data(authority, Components, ''),
	uri_data(path, Components, PathEnc),
	uri_components(URI, Components).
 | 
						|
 | 
						|
%%	ensure_leading_slash(+WinPath, -Path).
 | 
						|
%%	delete_leading_slash(+Path, -WinPath).
 | 
						|
%
 | 
						|
%	Deal with the fact that absolute paths   in Windows start with a
 | 
						|
%	drive letter rather than a  /.  For   URIs  we  need a path that
 | 
						|
%	starts with a /.
 | 
						|
 | 
						|
ensure_leading_slash(Path, SlashPath) :-
	% Path already starts with a /: pass it on unchanged.
	sub_atom(Path, 0, 1, _, '/'),
	!,
	SlashPath = Path.
ensure_leading_slash(Path, SlashPath) :-
	% E.g. a Windows path starting with a drive letter: prepend /.
	atom_concat('/', Path, SlashPath).
 | 
						|
 | 
						|
:- if(current_prolog_flag(windows, true)).
delete_leading_slash(Path, WinPath) :-
	% On Windows, strip the leading / only if what remains is an
	% absolute file name (e.g. starts with a drive letter).
	% Otherwise the path is passed on unchanged.
	(   atom_concat('/', WinPath, Path),
	    is_absolute_file_name(WinPath)
	->  true
	;   WinPath = Path
	).
:- else.
% On other systems the path is used as is.
delete_leading_slash(Path, Path).
:- endif.
 |