286 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Prolog
		
	
	
	
	
	
			
		
		
	
	
			286 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Prolog
		
	
	
	
	
	
| /*  Part of SWI-Prolog
 | |
| 
 | |
|     Author:        Jan Wielemaker
 | |
|     E-mail:        J.Wielemaker@cs.vu.nl
 | |
|     WWW:           http://www.swi-prolog.org
 | |
|     Copyright (C): 2009, VU University Amsterdam
 | |
| 
 | |
|     This program is free software; you can redistribute it and/or
 | |
|     modify it under the terms of the GNU General Public License
 | |
|     as published by the Free Software Foundation; either version 2
 | |
|     of the License, or (at your option) any later version.
 | |
| 
 | |
|     This program is distributed in the hope that it will be useful,
 | |
|     but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
|     GNU General Public License for more details.
 | |
| 
 | |
|     You should have received a copy of the GNU General Public
 | |
|     License along with this library; if not, write to the Free Software
 | |
|     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | |
| 
 | |
|     As a special exception, if you link this library with other files,
 | |
|     compiled with a Free Software compiler, to produce an executable, this
 | |
|     library does not by itself cause the resulting executable to be covered
 | |
|     by the GNU General Public License. This exception does not however
 | |
|     invalidate any other reasons why the executable file might be covered by
 | |
|     the GNU General Public License.
 | |
| */
 | |
| 
 | |
| :- module(uri,
 | |
| 	  [ uri_components/2,		% ?URI, ?Components
 | |
| 	    uri_data/3,			% ?Field, +Components, ?Data
 | |
| 	    uri_data/4,			% +Field, +Components, -Data, -New
 | |
| 
 | |
| 	    uri_normalized/2,		% +URI, -NormalizedURI
 | |
| 	    uri_normalized_iri/2,	% +URI, -NormalizedIRI
 | |
| 	    uri_normalized/3,		% +URI, +Base, -NormalizedURI
 | |
| 	    uri_normalized_iri/3,	% +URI, +Base, -NormalizedIRI
 | |
| 	    uri_resolve/3,		% +URI, +Base, -AbsURI
 | |
| 	    uri_is_global/1,		% +URI
 | |
| 	    uri_query_components/2,	% ?QueryString, ?NameValueList
 | |
| 	    uri_authority_components/2,	% ?Authority, ?Components
 | |
| 	    uri_authority_data/3,	% ?Field, ?Components, ?Data
 | |
| 					% Encoding
 | |
| 	    uri_encoded/3,		% +Component, ?Value, ?Encoded
 | |
| 	    uri_file_name/2,		% ?URI, ?Path
 | |
| 	    uri_iri/2			% ?URI, ?IRI
 | |
| 	  ]).
 | |
| 
 | |
| :- use_module(library(shlib)).
 | |
| 
 | |
| :- use_foreign_library(foreign(uri)).
 | |
| 
 | |
| /** <module> Process URIs
 | |
| 
 | |
| This  library  provides   high-performance    C-based   primitives   for
 | |
| manipulating URIs. We decided for a  C-based implementation for the much
 | |
| better performance on raw character  manipulation. Notably, URI handling
 | |
| primitives are used in  time-critical  parts   of  RDF  processing. This
 | |
| implementation is based on RFC-3986:
 | |
| 
 | |
| 	http://labs.apache.org/webarch/uri/rfc/rfc3986.html
 | |
| 
 | |
| The URI processing in this library is  rather liberal. That is, we break
 | |
| URIs according to the rules, but we  do not validate that the components
 | |
| are valid. Also, percent-decoding for IRIs   is  liberal. It first tries
 | |
| UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.
 | |
| 
 | |
| Earlier experience has shown that strict   enforcement of the URI syntax
 | |
| results in many errors that  are   accepted  by  many other web-document
 | |
| processing tools.
 | |
| */
 | |
| 
 | |
| %%	uri_components(+URI, -Components) is det.
 | |
| %%	uri_components(-URI, +Components) is det.
 | |
| %
 | |
| %	Break a URI  into  its  5   basic  components  according  to the
 | |
| %	RFC-3986 regular expression:
 | |
| %
 | |
| %	    ==
 | |
| %	    ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 | |
| %	     12            3  4          5       6  7        8 9
 | |
| %	    ==
 | |
| %
 | |
| %	@param Components is a term uri_components(Scheme, Authority,
 | |
| %	Path, Search, Fragment).  See uri_data/3 for accessing this
 | |
| %	structure.
 | |
| 
 | |
%%	uri_data(?Field, +Components, ?Data) is semidet.
%
%	Provide access to the uri_components/5 structure: Data is the
%	value stored in Field of Components. Defined field-names are:
%	=scheme=, =authority=, =path=, =search= and =fragment=. If
%	Field is unbound, the fields are enumerated on backtracking in
%	this order.

uri_data(scheme,    uri_components(Scheme, _, _, _, _),    Scheme).
uri_data(authority, uri_components(_, Authority, _, _, _), Authority).
uri_data(path,	    uri_components(_, _, Path, _, _),	   Path).
uri_data(search,    uri_components(_, _, _, Search, _),    Search).
uri_data(fragment,  uri_components(_, _, _, _, Fragment),  Fragment).
 | |
| 
 | |
%%	uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
%
%	NewComponents is a copy of Components in which the value of
%	Field is replaced by Data. All other fields are shared with
%	Components. Fails if Field is not one of =scheme=, =authority=,
%	=path=, =search= or =fragment=.

uri_data(scheme,
	 uri_components(_, Authority, Path, Search, Fragment),
	 Scheme,
	 uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(authority,
	 uri_components(Scheme, _, Path, Search, Fragment),
	 Authority,
	 uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(path,
	 uri_components(Scheme, Authority, _, Search, Fragment),
	 Path,
	 uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(search,
	 uri_components(Scheme, Authority, Path, _, Fragment),
	 Search,
	 uri_components(Scheme, Authority, Path, Search, Fragment)).
uri_data(fragment,
	 uri_components(Scheme, Authority, Path, Search, _),
	 Fragment,
	 uri_components(Scheme, Authority, Path, Search, Fragment)).
 | |
| 
 | |
| %%	uri_normalized(+URI, -NormalizedURI) is det.
 | |
| %
 | |
| %	NormalizedURI is the normalized form   of  URI. Normalization is
 | |
| %	syntactic and involves the following steps:
 | |
| %
 | |
| %	    * 6.2.2.1. Case Normalization
 | |
| %	    * 6.2.2.2. Percent-Encoding Normalization
 | |
| %	    * 6.2.2.3. Path Segment Normalization
 | |
| 
 | |
| %%	uri_normalized_iri(+URI, -NormalizedIRI) is det.
 | |
| %
 | |
| %	As uri_normalized/2, but percent-encoding is translated into IRI
 | |
| %	Unicode characters. The translation  is   liberal:  valid  UTF-8
 | |
| %	sequences  of  %-encoded  bytes  are    mapped  to  the  Unicode
 | |
| %	character. Other %XX-sequences are mapped   to the corresponding
 | |
| %	ISO-Latin-1 character and sole % characters are left untouched.
 | |
| %
 | |
| %	@see uri_iri/2.
 | |
| 
 | |
| 
 | |
| %%	uri_is_global(+URI) is semidet.
 | |
| %
 | |
| %	True if URI has a scheme. The semantics  is the same as the code
 | |
| %	below, but the implementation is more   efficient as it does not
 | |
| %	need to parse the  other  components,   nor  needs  to  bind the
 | |
| %	scheme.
 | |
| %
 | |
| %	==
 | |
| %	uri_is_global(URI) :-
 | |
| %		uri_components(URI, Components),
 | |
| %		uri_data(scheme, Components, Scheme),
 | |
| %		nonvar(Scheme).
 | |
| %	==
 | |
| 
 | |
| %%	uri_resolve(+URI, +Base, -GlobalURI) is det.
 | |
| %
 | |
| %	Resolve a possibly local URI relative   to Base. This implements
 | |
| %	http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
 | |
| 
 | |
| %%	uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
 | |
| %
 | |
| %	NormalizedGlobalURI is the normalized global version of URI.
 | |
| %	Behaves as if defined by:
 | |
| %
 | |
| %	==
 | |
| %	uri_normalized(URI, Base, NormalizedGlobalURI) :-
 | |
| %		uri_resolve(URI, Base, GlobalURI),
 | |
| %		uri_normalized(GlobalURI, NormalizedGlobalURI).
 | |
| %	==
 | |
| 
 | |
| %%	uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
 | |
| %
 | |
| %	NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
 | |
| %	as if defined by:
 | |
| %
 | |
| %	==
 | |
| %	uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
 | |
| %		uri_resolve(URI, Base, GlobalURI),
 | |
| %		uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
 | |
| %	==
 | |
| 
 | |
| %%	uri_query_components(+String, -Query) is det.
 | |
| %%	uri_query_components(-String, +Query) is det.
 | |
| %
 | |
| %	Perform encoding and decoding of a URI query string. Query is a
 | |
| %	list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
 | |
| %	query elements of the forms Name(Value)  and Name-Value are also
 | |
| %	accepted to enhance interoperability with   the option and pairs
 | |
| %	libraries.  E.g.
 | |
| %
 | |
| %	==
 | |
| %	?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
 | |
| %	QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
 | |
| %
 | |
| %	?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
 | |
| %	Q = [a=b, c='d+w', n='VU Amsterdam'].
 | |
| %	==
 | |
| 
 | |
| 
 | |
| %%	uri_authority_components(+Authority, -Components) is det.
 | |
| %%	uri_authority_components(-Authority, +Components) is det.
 | |
| %
 | |
| %	Break-down the authority component of a   URI. The fields of the
 | |
| %	structure Components can be accessed using uri_authority_data/3.
 | |
| 
 | |
%%	uri_authority_data(+Field, ?Components, ?Data) is semidet.
%
%	Provide access to the uri_authority/4 structure: Data is the
%	value stored in Field of Components. Defined field-names are:
%	=user=, =password=, =host= and =port=.

uri_authority_data(user,     uri_authority(User, _, _, _),     User).
uri_authority_data(password, uri_authority(_, Password, _, _), Password).
uri_authority_data(host,     uri_authority(_, _, Host, _),     Host).
uri_authority_data(port,     uri_authority(_, _, _, Port),     Port).
 | |
| 
 | |
| 
 | |
| %%	uri_encoded(+Component, +Value, -Encoded) is det.
 | |
| %%	uri_encoded(+Component, -Value, +Encoded) is det.
 | |
| %
 | |
| %	Encoded  is  the  URI   encoding    for   Value.  When  encoding
 | |
| %	(Value->Encoded), Component specifies the   URI  component where
 | |
| %	the value is used. It  is   one  of =query_value=, =fragment= or
 | |
| %	=path=.  Besides  alphanumerical  characters,    the   following
 | |
| %	characters are passed verbatim (the  set   is  split  in logical
 | |
| %	groups according to RFC3986).
 | |
| %
 | |
| %	    $ query_value, fragment :
 | |
| %	    "-._~" | "!$'()*,;" | ":@" | "/?"
 | |
| %	    $ path :
 | |
| %	    "-._~" | "!$&'()*,;=" | ":@" | "/"
 | |
| 
 | |
| 
 | |
| %%	uri_iri(+URI, -IRI) is det.
 | |
| %%	uri_iri(-URI, +IRI) is det.
 | |
| %
 | |
| %	Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
 | |
| %	a fully expanded  Unicode  string.   Unicode  strings  are first
 | |
| %	encoded into UTF-8, after which %-encoding takes place.
 | |
| %
 | |
| %	@error syntax_error(Culprit) in mode (+,-) if URI is not a
 | |
| %	legally percent-encoded UTF-8 string.
 | |
| 
 | |
| 
 | |
%%	uri_file_name(+URI, -FileName) is semidet.
%%	uri_file_name(-URI, +FileName) is det.
%
%	Convert between a URI and a   local  file_name. This protocol is
%	covered by RFC 1738. Please note   that file-URIs use _absolute_
%	paths. The mode (-, +) translates  a possible relative path into
%	an absolute one.
%
%	In mode (+,-) the predicate fails  unless   the  scheme  of  URI
%	is =file= and its authority is either   not  bound, the empty
%	atom or =localhost=. A URI that has   no scheme at all (e.g.,  a
%	relative reference) is _not_ considered a file URI.   The
%	predicate fails silently if both arguments are unbound.

uri_file_name(URI, FileName) :-
	nonvar(URI), !,
	uri_components(URI, Components),
	uri_data(scheme, Components, Scheme),
	Scheme == file,			% ==/2: a missing (unbound) scheme
					% must not be unified with `file'
	(   uri_data(authority, Components, '')
	->  true			% also accepts an unbound authority
	;   uri_data(authority, Components, localhost)
	),
	uri_data(path, Components, FileNameEnc),
	uri_encoded(path, FileName0, FileNameEnc),
	delete_leading_slash(FileName0, FileName).
uri_file_name(URI, FileName) :-
	nonvar(FileName), !,
	absolute_file_name(FileName, Path0),
	ensure_leading_slash(Path0, Path),
	uri_encoded(path, Path, PathEnc),
	% Build the component term field by field; Components is
	% unbound here, so uri_data/3 instantiates its skeleton.
	uri_data(scheme, Components, file),
	uri_data(authority, Components, ''),
	uri_data(path, Components, PathEnc),
	uri_components(URI, Components).
 | |
| 
 | |
| %%	ensure_leading_slash(+WinPath, -Path).
 | |
| %%	delete_leading_slash(+Path, -WinPath).
 | |
| %
 | |
| %	Deal with the fact that absolute paths   in Windows start with a
 | |
| %	drive letter rather than a  /.  For   URIs  we  need a path that
 | |
| %	starts with a /.
 | |
| 
 | |
% ensure_leading_slash(+Path, -SlashPath): SlashPath is Path itself if
% Path already starts with a `/'; otherwise it is Path with a `/'
% prepended.

ensure_leading_slash(Path, SlashPath) :-
	sub_atom(Path, 0, _, _, '/'), !,
	SlashPath = Path.
ensure_leading_slash(Path, SlashPath) :-
	atom_concat('/', Path, SlashPath).
 | |
| 
 | |
% delete_leading_slash(+SlashPath, -NativePath): on Windows, strip the
% leading `/' from SlashPath if what remains is an absolute file name
% (the clause is only compiled when the `windows' flag is true). In
% all other cases NativePath is SlashPath unchanged.

:- if(current_prolog_flag(windows, true)).
delete_leading_slash(SlashPath, NativePath) :-
	atom_concat('/', NativePath, SlashPath),
	is_absolute_file_name(NativePath),
	!.
:- endif.
delete_leading_slash(SamePath, SamePath).
 |