/*  $Id$

    Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@uva.nl
    WWW:           http://www.swi-prolog.org
    Copyright (C): 1985-2009, University of Amsterdam

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation; either version 2
    of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU Lesser General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

    As a special exception, if you link this library with other files,
    compiled with a Free Software compiler, to produce an executable, this
    library does not by itself cause the resulting executable to be covered
    by the GNU General Public License. This exception does not however
    invalidate any other reasons why the executable file might be covered by
    the GNU General Public License.
*/

:- module(url,
	  [ parse_url/2,		% +URL, -Parts | -URL +Parts
	    parse_url/3,		% +URL|URI, +BaseURL, -Parts
	    				% -URL, +BaseURL, +Parts
	    is_absolute_url/1,		% +URL
	    global_url/3,		% +Local, +Base, -Global
	    http_location/2,		% ?Parts, ?Location
	    www_form_encode/2,		% Value <-> Encoded
	    parse_url_search/2,		% Form-data <-> Form fields

	    url_iri/2,			% ?URL, ?IRI

	    file_name_to_url/2,		% ?FileName, ?URL

	    set_url_encoding/2		% ?Old, +New
	  ]).
:- use_module(library(lists)).
:- use_module(library(error)).
:- use_module(library(utf8)).

/** <module> Analysing and constructing URL

This library deals with the analysis and construction of a URL,
Universal Resource Locator. URL is the basis for communicating locations
of resources (data) on the web. A URL consists of a protocol identifier
(e.g. HTTP, FTP, and a protocol-specific syntax further defining the
location. URLs are standardized in RFC-1738.

The implementation in this library covers only a small portion of the
defined protocols.  Though the initial implementation followed RFC-1738
strictly, the current is more relaxed to deal with frequent violations
of the standard encountered in practical use.

@author	Jan Wielemaker
@author Lukas Faulstich
@deprecated New code should use library(uri), provided by the =clib=
 	    package.
*/

		 /*******************************
		 *	      GLOBALISE		*
		 *******************************/

%%	global_url(+URL, +Base, -Global) is det.
%
%	Translate a possibly relative URL  into   an  absolute  one.
%
%	@error syntax_error(illegal_url) if URL is not legal.

global_url(URL, BaseURL, Global) :-
	(   is_absolute_url(URL),
	    \+ sub_atom(URL, _, _, _, '%')	% may have escape, use general
	->  Global = URL
	;   sub_atom(URL, 0, _, _, '//')
	->  parse_url(BaseURL, [], Attributes),
	    memberchk(protocol(Proto), Attributes),
	    atomic_list_concat([Proto, :, URL], Global)
	;   sub_atom(URL, 0, _, _, #)
	->  (   sub_atom(BaseURL, _, _, 0, #)
	    ->	sub_atom(URL, 1, _, 0, NoHash),
		atom_concat(BaseURL, NoHash, Global)
	    ;	atom_concat(BaseURL, URL, Global)
	    )
	;   parse_url(URL, BaseURL, Attributes)
	->  phrase(curl(Attributes), Chars),
	    atom_codes(Global, Chars)
	;   throw(error(syntax_error(illegal_url), URL))
	).

%%	is_absolute_url(+URL)
%
%	True if URL is an absolute URL. That  is, a URL that starts with
%	a protocol identifier.

is_absolute_url(URL) :-
	sub_atom(URL, 0, _, _, 'http://'), !.
is_absolute_url(URL) :-
	sub_atom(URL, 0, _, _, 'https://'), !.
is_absolute_url(URL) :-
	sub_atom(URL, 0, _, _, 'ftp://'), !.
is_absolute_url(URL) :-
	sub_atom(URL, 0, _, _, 'file://'), !.
is_absolute_url(URL) :-
	atom_codes(URL, Codes),
	phrase(absolute_url, Codes, _), !.


		 /*******************************
		 *	  CREATE URL/URI	*
		 *******************************/

%%	http_location(?Parts, ?Location)
%
%	Construct or analyze an  HTTP  location.   This  is  similar  to
%	parse_url/2, but only deals with the   location  part of an HTTP
%	URL. That is, the path, search   and fragment specifiers. In the
%	HTTP protocol, the first line of a message is
%
%	    ==
%	    <Action> <Location> HTTP/<version>
%	    ==
%
%	@param Location	Atom or list of character codes.

http_location(Parts, Location) :-	% Parts --> Location
	nonvar(Parts), !,
	phrase(curi(Parts), String), !,
	atom_codes(Location, String).
http_location(Parts, Location) :-	% Location --> Parts
	atom(Location), !,
	atom_codes(Location, Codes),
	phrase(http_location(Parts), Codes).
http_location(Parts, Codes) :-		% LocationCodes --> Parts
	is_list(Codes),
	phrase(http_location(Parts), Codes).


curl(A) -->
	{ memberchk(protocol(Protocol), A)
	}, !,
	catomic(Protocol),
	":",
	curl(Protocol, A).
curl(A) -->
	curl(http, A).

curl(file, A) --> !,
	(   "//"
	->  cpath(A)
	;   cpath(A)
	).
curl(_, A) -->
	"//",
	cuser(A),
	chost(A),
	cport(A),
	cpath(A),
	csearch(A),
	cfragment(A).

curi(A) -->
	cpath(A),
	csearch(A).

cpath(A) -->
	(   { memberchk(path(Path), A) }
	->  { atom_codes(Path, Codes) },
	    www_encode(Codes, "/+:,")
	;   ""
	).

cuser(A) -->
	(   { memberchk(user(User), A) }
	->  { atom_codes(User, Codes) },
	    www_encode(Codes, ":"),
	    "@"
	;   ""
	).

chost(A) -->
	(   { memberchk(host(Host), A) }
	->  { atom_codes(Host, Codes) },
	    www_encode(Codes, "")
	;   ""
	).

cport(A) -->
	(   { memberchk(port(Port), A), Port \== 80 }
	->  { number_codes(Port, Codes) },
	    ":",
	    www_encode(Codes, "")
	;   ""
	).


catomic(A, In, Out) :-
	atom_codes(A, Codes),
	append(Codes, Out, In).

%%	csearch(+Attributes)//

csearch(A)-->
	(   { memberchk(search(Parameters), A) }
	->  csearch(Parameters, "?")
	;   []
	).

csearch([], _) -->
	[].
csearch([Parameter|Parameters], Sep) --> !,
	codes(Sep),
	cparam(Parameter),
	csearch(Parameters, "&").

cparam(Name=Value) --> !,
	cname(Name),
	"=",
	cvalue(Value).
cparam(NameValue) -->			% allow to feed Name(Value)
	{ compound(NameValue), !,
	  NameValue =.. [Name,Value]
	},
	cname(Name),
	"=",
	cvalue(Value).
cparam(Name)-->
	cname(Name).

codes([]) --> [].
codes([H|T]) --> [H], codes(T).

cname(Atom) -->
	{ atom_codes(Atom, Codes) },
	www_encode(Codes, "").

%%	cvalue(+Value)// is det.
%
%	Construct a string from  Value.  Value   is  either  atomic or a
%	code-list.

cvalue(Value) -->
	{ atomic(Value), !,
	  atom_codes(Value, Codes)
	},
	www_encode(Codes, "").
cvalue(Codes) -->
	{ must_be(codes, Codes)
	},
	www_encode(Codes, "").


%%	cfragment(+Attributes)//

cfragment(A) -->
	{ memberchk(fragment(Frag), A), !,
	  atom_codes(Frag, Codes)
	},
	"#",
	www_encode(Codes, "").
cfragment(_) -->
	"".


		 /*******************************
		 *	      PARSING		*
		 *******************************/

%%	parse_url(+URL, -Attributes) is det.
%
%	Construct or analyse a URL. URL is an   atom  holding a URL or a
%	variable. Parts is a list of   components.  Each component is of
%	the format Name(Value). Defined components are:
%
%	    * protocol(Protocol)
%	    The used protocol. This is, after  the optional =|url:|=, an
%	    identifier separated from the remainder of  the URL using :.
%	    parse_url/2 assumes the =http= protocol   if  no protocol is
%	    specified and the URL can be parsed  as a valid HTTP url. In
%	    addition to the RFC-1738  specified   protocols,  the =file=
%	    protocol is supported as well.
%
%	    * host(Host)
%	    Host-name or IP-address on which   the  resource is located.
%	    Supported by all network-based protocols.
%
%	    * port(Port)
%	    Integer port-number to access on   the \arg{Host}. This only
%	    appears if the port is  explicitly   specified  in  the URL.
%	    Implicit default ports (e.g.  80   for  HTTP)  do \emph{not}
%	    appear in the part-list.
%
%	    * path(Path)
%	    (File-) path addressed by the URL. This is supported for the
%	    =ftp=, =http= and =file= protocols. If  no path appears, the
%	    library generates the path =|/|=.
%
%	    * search(ListOfNameValue)
%	    Search-specification of HTTP URL. This is the part after the
%	    =|?|=, normally used to transfer data   from HTML forms that
%	    use the =GET=  protocol.  In  the   URL  it  consists  of  a
%	    www-form-encoded list of Name=Value pairs. This is mapped to
%	    a list of Prolog Name=Value  terms   with  decoded names and
%	    values.
%
%	    * fragment(Fragment)
%	    Fragment specification of HTTP URL. This   is the part after
%	    the =|#|= character.
%
%	The example below illustrates the all this for an HTTP URL.
%
%	    ==
%	    ?- parse_url('http://swi.psy.uva.nl/message.cgi?msg=Hello+World%21#x', P).
%
%	    P = [ protocol(http),
%		  host('swi.psy.uva.nl'),
%		  fragment(x),
%		  search([ msg = 'Hello World!'
%			 ]),
%		  path('/message.cgi')
%	        ]
%	    ==
%
%	By instantiating the parts-list this predicate   can  be used to
%	create a URL.

parse_url(URL, Attributes) :-
	nonvar(URL), !,
	atom_codes(URL, Codes),
	phrase(url(Attributes), Codes).
parse_url(URL, Attributes) :-
	phrase(curl(Attributes), Codes), !,
	atom_codes(URL, Codes).

%%	parse_url(+URL, +BaseURL, -Attributes) is det.
%
%	Similar to parse_url/2 for relative URLs.  If URL is relative,
%	it is resolved using the absolute URL BaseURL.

parse_url(URL, BaseURL, Attributes) :-
	nonvar(URL), !,
	atom_codes(URL, Codes),
	(   phrase(absolute_url, Codes, _)
	->  phrase(url(Attributes), Codes)
	;   (   atomic(BaseURL)
	    ->  parse_url(BaseURL, BaseA0)
	    ;	BaseA0 = BaseURL
	    ),
	    select(path(BasePath), BaseA0, BaseA1),
	    delete(BaseA1, search(_), BaseA2),
	    delete(BaseA2, fragment(_), BaseA3),
	    phrase(relative_uri(URIA0), Codes),
	    select(path(LocalPath), URIA0, URIA1), !,
	    globalise_path(LocalPath, BasePath, Path),
	    append(BaseA3, [path(Path)|URIA1], Attributes)
	).
parse_url(URL, BaseURL, Attributes) :-
	parse_url(BaseURL, BaseAttributes),
	memberchk(path(BasePath), BaseAttributes),
	(   memberchk(path(LocalPath), Attributes)
	->  globalise_path(LocalPath, BasePath, Path)
	;   Path = BasePath
	),
	append([path(Path)|Attributes], BaseAttributes, GlobalAttributes),
	phrase(curl(GlobalAttributes), Chars),
	atom_codes(URL, Chars).


%%	globalise_path(+LocalPath, +RelativeTo, -FullPath) is det.
%
%	The first clause deals with the  standard URL /... global paths.
%	The second with file://drive:path on MS-Windows.   This is a bit
%	of a cludge, but unfortunately common practice is -especially on
%	Windows- not always following the standard

globalise_path(LocalPath, _, LocalPath) :-
	sub_atom(LocalPath, 0, _, _, /), !.
globalise_path(LocalPath, _, LocalPath) :-
	is_absolute_file_name(LocalPath), !.
globalise_path(Local, Base, Path) :-
	base_dir(Base, BaseDir),
	make_path(BaseDir, Local, Path).

base_dir(BasePath, BaseDir) :-
	(   atom_concat(BaseDir, /, BasePath)
	->  true
	;   file_directory_name(BasePath, BaseDir)
	).

make_path(Dir, Local, Path) :-
	atom_concat('../', L2, Local),
	file_directory_name(Dir, Parent),
	Parent \== Dir, !,
	make_path(Parent, L2, Path).
make_path(/, Local, Path) :- !,
	atom_concat(/, Local, Path).
make_path(Dir, Local, Path) :-
	atomic_list_concat([Dir, /, Local], Path).


%%	absolute_url//
%
%	True if the input  describes  an   absolute  URL.  This means it
%	starts with a URL schema. We demand a   schema  of length > 1 to
%	avoid confusion with Windows drive letters.

absolute_url -->
	lwalpha(_First),
	schema_chars(Rest),
	{ Rest \== [] },
	":", !.


		 /*******************************
		 *	     SEQUENCES		*
		 *******************************/

digits(L) -->
	digits(L, []).

digits([C|T0], T) -->
	digit(C), !,
	digits(T0, T).
digits(T, T) -->
	[].


digit(C, [C|T], T) :- code_type(C, digit).

		 /*******************************
		 *	      RFC-3986		*
		 *******************************/

%%	uri(-Parts)//

url([protocol(Schema)|Parts]) -->
	schema(Schema),
	":", !,
	hier_part(Schema, Parts, P2),
	query(P2, P3),
	fragment(P3, []).
url([protocol(http)|Parts]) -->		% implicit HTTP
	authority(Parts, [path(Path)]),
	path_abempty(Path).

relative_uri(Parts) -->
	relative_part(Parts, P2),
	query(P2, P3),
	fragment(P3, []).

relative_part(Parts, Tail) -->
	"//", !,
	authority(Parts, [path(Path)|Tail]),
	path_abempty(Path).
relative_part([path(Path)|T], T) -->
	(   path_absolute(Path)
	;   path_noschema(Path)
	;   path_empty(Path)
	), !.

http_location([path(Path)|P2]) -->
	path_abempty(Path),
	query(P2, P3),
	fragment(P3, []).

%%	schema(-Atom)//
%
%	Schema  is  case-insensitive  and  the    canonical  version  is
%	lowercase.
%
%	==
%	Schema ::= ALPHA *(ALPHA|DIGIT|"+"|"-"|".")
%	==

schema(Schema) -->
	lwalpha(C0),
	schema_chars(Codes),
	{ atom_codes(Schema, [C0|Codes]) }.

schema_chars([H|T]) -->
	schema_char(H), !,
	schema_chars(T).
schema_chars([]) -->
	[].

schema_char(H) -->
	[C],
	{ C < 128,
	  (   code_type(C, alpha)
	  ->  code_type(H, to_lower(C))
	  ;   code_type(C, digit)
	  ->  H = C
	  ;   schema_extra(C)
	  ->  H = C
	  )
	}.

schema_extra(0'+).
schema_extra(0'-).
schema_extra(0'.).	% 0'


%%	hier_part(+Schema, -Parts, ?Tail)//

hier_part(file, [path(Path)|Tail], Tail) --> !,
	"//",
	(   win_drive_path(Path)
	;   path_absolute(Path)
	;   path_rootless(Path)
	;   path_empty(Path)
	), !.
hier_part(_, Parts, Tail) -->
	"//", !,
	authority(Parts, [path(Path)|Tail]),
	path_abempty(Path).
hier_part(_, [path(Path)|T], T) -->
	(   path_absolute(Path)
	;   path_rootless(Path)
	;   path_empty(Path)
	), !.

authority(Parts, Tail) -->
	user_info_chars(UserChars),
	"@", !,
	{ atom_codes(User, UserChars),
	  Parts = [user(User),host(Host)|T0]
	},
	host(Host),
	port(T0,Tail).
authority([host(Host)|T0], Tail) -->
	host(Host),
	port(T0, Tail).

user_info_chars([H|T]) -->
	user_info_char(H), !,
	user_info_chars(T).
user_info_chars([]) -->
	[].

user_info_char(_) --> "@", !, {fail}.
user_info_char(C) --> pchar(C).

%host(Host) --> ip_literal(Host), !.		% TBD: IP6 addresses
host(Host) --> ip4_address(Host), !.
host(Host) --> reg_name(Host).

ip4_address(Atom) -->
	i256_chars(Chars, [0'.|T0]),
	i256_chars(T0, [0'.|T1]),
	i256_chars(T1, [0'.|T2]),
	i256_chars(T2, []),
	{ atom_codes(Atom, Chars) }.

i256_chars(Chars, T) -->
	digits(Chars, T),
	\+ \+ { T = [],
		Chars \== [],
		number_codes(I, Chars),
		I < 256
	      }.

reg_name(Host) -->
	reg_name_chars(Chars),
	{ atom_codes(Host, Chars) }.

reg_name_chars([H|T]) -->
	reg_name_char(H), !,
	reg_name_chars(T).
reg_name_chars([]) -->
	[].

reg_name_char(C) -->
	pchar(C),
	{ C \== 0':,
	  C \== 0'@
	}.

port([port(Port)|T], T) -->
	":", !,
	digit(D0),
	digits(Ds),
	{ number_codes(Port, [D0|Ds]) }.
port(T, T) -->
	[].

path_abempty(Path) -->
	segments_chars(Chars, []),
	{   Chars == []
	->  Path = '/'
	;   atom_codes(Path, Chars)
	}.


win_drive_path(Path) -->
	drive_letter(C0),
	":",
	(   "/"
	->  {Codes = [C0, 0':, 0'/|Chars]}
	;   {Codes = [C0, 0':|Chars]}
	),
	segment_nz_chars(Chars, T0),
	segments_chars(T0, []),
	{ atom_codes(Path, Codes) }.


path_absolute(Path) -->
	"/",
	segment_nz_chars(Chars, T0),
	segments_chars(T0, []),
	{ atom_codes(Path, [0'/| Chars]) }.

path_noschema(Path) -->
	segment_nz_nc_chars(Chars, T0),
	segments_chars(T0, []),
	{ atom_codes(Path, Chars) }.

path_rootless(Path) -->
	segment_nz_chars(Chars, T0),
	segments_chars(T0, []),
	{ atom_codes(Path, Chars) }.

path_empty('/') -->
	"".

segments_chars([0'/|Chars], T) -->	% 0'
	"/", !,
	segment_chars(Chars, T0),
	segments_chars(T0, T).
segments_chars(T, T) -->
	[].

segment_chars([H|T0], T) -->
	pchar(H), !,
	segment_chars(T0, T).
segment_chars(T, T) -->
	[].

segment_nz_chars([H|T0], T) -->
	pchar(H),
	segment_chars(T0, T).

segment_nz_nc_chars([H|T0], T) -->
	segment_nz_nc_char(H), !,
	segment_nz_nc_chars(T0, T).
segment_nz_nc_chars(T, T) -->
	[].

segment_nz_nc_char(_) --> ":", !, {fail}.
segment_nz_nc_char(C) --> pchar(C).


%%	query(-Parts, ?Tail)// is det.
%
%	Extract &Name=Value, ...

query([search(Params)|T], T) -->
	"?", !,
	search(Params).
query(T,T) -->
	[].

search([Parameter|Parameters])-->
	parameter(Parameter), !,
	(   search_sep
        ->  search(Parameters)
        ;   { Parameters = [] }
        ).
search([]) -->
	[].

parameter(Param)--> !,
	search_chars(NameS),
	{ atom_codes(Name, NameS)
	},
	(   "="
        ->  search_value_chars(ValueS),
	    { atom_codes(Value, ValueS),
	      Param = (Name = Value)
	    }
        ;   { Param = Name
	    }
        ).

search_chars([C|T]) -->
	search_char(C), !,
	search_chars(T).
search_chars([]) -->
	[].

search_char(_) --> search_sep, !, { fail }.
search_char(_) --> "=", !, { fail }.
search_char(C) --> fragment_char(C).

search_value_chars([C|T]) -->
	search_value_char(C), !,
	search_value_chars(T).
search_value_chars([]) -->
	[].

search_value_char(_) --> search_sep, !, { fail }.
search_value_char(C) --> fragment_char(C).

%%	search_sep// is semidet.
%
%	Matches a search-parameter separator.  Traditonally, this is the
%	&-char, but these days there are `newstyle' ;-char separators.
%
%	@see http://perldoc.perl.org/CGI.html
%	@tbd This should be configurable

search_sep --> "&", !.
search_sep --> ";".


%%	fragment(-Fragment, ?Tail)//
%
%	Extract the fragment (after the =#=)

fragment([fragment(Fragment)|T], T) -->
	"#", !,
	fragment_chars(Codes),
	{ atom_codes(Fragment, Codes) }.
fragment(T, T) -->
	[].

fragment_chars([H|T]) -->
	fragment_char(H), !,
	fragment_chars(T).
fragment_chars([]) -->
	[].


%%	fragment_char(-Char)
%
%	Find a fragment character.

fragment_char(C)   --> pchar(C), !.
fragment_char(0'/) --> "/", !.
fragment_char(0'?) --> "?", !.
fragment_char(0'[) --> "[", !.		% Not according RDF3986!
fragment_char(0']) --> "]", !.


		 /*******************************
		 *	CHARACTER CLASSES	*
		 *******************************/

%%	pchar(-Code)//
%
%	unreserved|pct_encoded|sub_delim|":"|"@"
%
%	Performs UTF-8 decoding of percent encoded strings.

pchar(0' ) --> "+", !.			%' ?
pchar(C) -->
	[C],
	{   unreserved(C)
	;   sub_delim(C)
	;   C == 0':
        ;   C == 0'@
	}, !.
pchar(C) -->
	percent_coded(C).

%%	lwalpha(-C)//
%
%	Demand alpha, return as lowercase

lwalpha(H) -->
	[C],
	{ C < 128,
	  code_type(C, alpha),
	  code_type(H, to_lower(C))
	}.

drive_letter(C) -->
	[C],
	{ C < 128,
	  code_type(C, alpha)
	}.


		 /*******************************
		 *	RESERVED CHARACTERS	*
		 *******************************/

%%	sub_delim(?Code)
%
%	Sub-delimiters

sub_delim(0'!).
sub_delim(0'$).
sub_delim(0'&).
sub_delim(0'').
sub_delim(0'().
sub_delim(0')).
sub_delim(0'*).
sub_delim(0'+).
sub_delim(0',).
sub_delim(0';).
sub_delim(0'=).


%%	unreserved(+C)
%
%	Characters that can be represented without procent escaping
%	RFC 3986, section 2.3

term_expansion(unreserved(map), Clauses) :-
	findall(unreserved(C), unreserved_(C), Clauses).

unreserved_(C) :-
	between(1, 128, C),
	code_type(C, alnum).
unreserved_(0'-).
unreserved_(0'.).
unreserved_(0'_).
unreserved_(0'~).			% 0'

unreserved(map).			% Expanded


		 /*******************************
		 *	        FORMS		*
		 *******************************/

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Encoding/decoding of form-fields  using   the  popular  www-form-encoded
encoding used with the HTTP GET.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

%%	www_form_encode(+Value, -XWWWFormEncoded) is det.
%%	www_form_encode(-Value, +XWWWFormEncoded) is det.
%
%	En/Decode        between        native          value        and
%	application/x-www-form-encoded. Maps space to   +,  keeps alnum,
%	maps anything else to =|%XX|= and   newlines to =|%OD%OA|=. When
%	decoding, newlines appear as a single newline (10) character.

www_form_encode(Value, Encoded) :-
	atomic(Value), !,
	atom_codes(Value, Codes),
	phrase(www_encode(Codes, ""), EncCodes),
	atom_codes(Encoded, EncCodes).
www_form_encode(Value, Encoded) :-
	atom_codes(Encoded, EncCodes),
	phrase(www_decode(Codes), EncCodes),
	atom_codes(Value, Codes).

%%	www_encode(+Codes, +ExtraUnescaped)//

www_encode([0'\r, 0'\n|T], Extra) --> !,
	"%0D%0A",
	www_encode(T, Extra).
www_encode([0'\n|T], Extra) --> !,
	"%0D%0A",
	www_encode(T, Extra).
www_encode([H|T], Extra) -->
	percent_encode(H, Extra),
	www_encode(T, Extra).
www_encode([], _) -->
	"".

percent_encode(C, _Extra) -->
	{ unreserved(C) }, !,
	[C].
percent_encode(C, Extra) -->
	{ memberchk(C, Extra) }, !,
	[C].
%percent_encode(0' , _) --> !, "+".	% Deprecated: use %20
percent_encode(C, _) -->
	{ C =< 127 }, !,
	percent_byte(C).
percent_encode(C, _) -->		% Unicode characters
	{ current_prolog_flag(url_encoding, utf8), !,
	  phrase(utf8_codes([C]), Bytes)
	},
	percent_bytes(Bytes).
percent_encode(C, _) -->
	{ C =< 255 }, !,
	percent_byte(C).
percent_encode(_C, _) -->
	{ representation_error(url_character)
	}.

percent_bytes([]) -->
	"".
percent_bytes([H|T]) -->
	percent_byte(H),
	percent_bytes(T).

percent_byte(C) -->
	[0'%, D1, D2],
	{   nonvar(C)
	->  Dv1 is (C>>4 /\ 0xf),
	    Dv2 is (C /\ 0xf),
	    code_type(D1, xdigit(Dv1)),
	    code_type(D2, xdigit(Dv2))
	;   code_type(D1, xdigit(Dv1)),
	    code_type(D2, xdigit(Dv2)),
	    C is ((Dv1)<<4) + Dv2
	}.

percent_coded(C) -->
	percent_byte(C0), !,
	(   { C0 == 13			% %0D%0A --> \n
	    },
	    "%0",
	    ( "A" ; "a" )
	->  { C = 10
	    }
	;   { C0 >= 0xc0 },		% UTF-8 lead-in
	    utf8_cont(Cs),
	    { phrase(utf8_codes([C]), [C0|Cs]) }
	->  []
	;   { C = C0
	    }
	).

%%	www_decode(-Codes)//

www_decode([0' |T]) -->
	"+", !,
        www_decode(T).
www_decode([C|T]) -->
	percent_coded(C), !,
	www_decode(T).
www_decode([C|T]) -->
	[C], !,
	www_decode(T).
www_decode([]) -->
	[].

utf8_cont([H|T]) -->
	percent_byte(H),
	{ between(0x80, 0xbf, H) }, !,
	utf8_cont(T).
utf8_cont([]) -->
	[].


%%	set_url_encoding(?Old, +New) is semidet.
%
%	Query and set the encoding for URLs.  The default is =utf8=.
%	The only other defined value is =iso_latin_1=.
%
%	@tbd	Having a global flag is highly inconvenient, but a
%		work-around for old sites using ISO Latin 1 encoding.

:- create_prolog_flag(url_encoding, utf8, [type(atom)]).

set_url_encoding(Old, New) :-
	current_prolog_flag(url_encoding, Old),
	(   Old == New
	->  true
	;   must_be(oneof([utf8, iso_latin_1]), New),
	    set_prolog_flag(url_encoding, New)
	).


		 /*******************************
		 *	 IRI PROCESSING		*
		 *******************************/

%%	url_iri(+Encoded, -Decoded) is det.
%%	url_iri(-Encoded, +Decoded) is det.
%
%	Convert between a URL, encoding in US-ASCII   and an IRI. An IRI
%	is a fully expanded Unicode string.   Unicode  strings are first
%	encoded into UTF-8, after which %-encoding takes place.

url_iri(Encoded, Decoded) :-
	nonvar(Encoded), !,
	(   sub_atom(Encoded, _, _, _, '%')
	->  atom_codes(Encoded, Codes),
	    unescape_precent(Codes, UTF8),
	    phrase(utf8_codes(Unicodes), UTF8),
	    atom_codes(Decoded, Unicodes)
	;   Decoded = Encoded
	).
url_iri(URL, IRI) :-
	atom_codes(IRI, IRICodes),
	phrase(percent_encode(IRICodes, "/:?#&="), UrlCodes),
	atom_codes(URL, UrlCodes).


unescape_precent([], []).
unescape_precent([0'%,C1,C2|T0], [H|T]) :- !,	%'
	code_type(C1, xdigit(D1)),
	code_type(C2, xdigit(D2)),
	H is D1*16 + D2,
	unescape_precent(T0, T).
unescape_precent([H|T0], [H|T]) :-
	unescape_precent(T0, T).


		 /*******************************
		 *	     FORM DATA		*
		 *******************************/

%%	parse_url_search(?Spec, ?Fields:list(Name=Value)) is det.
%
%	Construct or analyze an HTTP   search  specification. This deals
%	with       form       data       using       the       MIME-type
%	=application/x-www-form-urlencoded=  as  used   in    HTTP   GET
%	requests.

parse_url_search(Spec, Fields) :-
	atomic(Spec), !,
	atom_codes(Spec, Codes),
	phrase(search(Fields), Codes).
parse_url_search(Codes, Fields) :-
	is_list(Codes), !,
	phrase(search(Fields), Codes).
parse_url_search(Codes, Fields) :-
	must_be(list, Fields),
	phrase(csearch(Fields, ""), Codes).


		 /*******************************
		 *	    FILE URLs		*
		 *******************************/

%%	file_name_to_url(+File, -URL) is det.
%%	file_name_to_url(-File, +URL) is semidet.
%
%	Translate between a filename and a file:// URL.
%
%	@tbd	Current implementation does not deal with paths that
%		need special encoding.

file_name_to_url(File, FileURL) :-
	nonvar(File), !,
	absolute_file_name(File, Path),
	atom_concat('file://', Path, FileURL), !.
file_name_to_url(File, FileURL) :-
	atom_concat('file://', File, FileURL), !.