% Natural Language Processing in Prolog using Definite Clause Grammar rules
% 
% This example is a straightforward adaptation of the original plain Prolog 
% code described in the paper:
%
%	Tokenization using DCG Rules 
%	Michael A. Covington 
%	Artificial Intelligence Center 
%	The University of Georgia 
%	Athens, Georgia 30602-7415 U.S.A. 
%	2000 April 21
%
% A copy of the paper is available at:
%
%	http://www.ai.uga.edu/~mc/projpaper.ps
%
% Usage example:
%
%	| ?- tokenizer::tokens(" We owe $1,048,576.24 to Agent 007 for Version 3.14159! ", Tokens).

:- object(tokenizer).

	:- info([
		version is 1.0,
		date is 2006/2/11,
		author is 'Michael A. Covington',
		comment is 'Natural language tokenizer example using DCG rules.']).

	:- public(tokens/2).
	:- mode(tokens(+string, -list), zero_or_more).
	:- info(tokens/2, [
		comment is 'Parses a string into a list of tokens.',
		argnames is ['String', 'Tokens']]).

	% tokens(+String, -Tokens)
	% String must be a list of character codes: every classification below
	% compares the elements arithmetically. Tokens is the resulting list of
	% atoms (words, lowercased, and special characters) and numbers.

	tokens(String, Tokens) :-
		phrase(token_list(Tokens), String).

	% A token list is a series of zero or more tokens, each optionally
	% preceded by blanks; trailing blanks are consumed by the second rule.
	% Its argument consists of the list of tokens, as atoms and numbers.
	% The token rules try their longest alternative first, so the cut commits
	% to that first (longest) parse instead of backtracking into shorter tokens.

	token_list([T| Rest]) --> blank0, token(T), !, token_list(Rest).
	token_list([]) --> blank0.

	% blank0 is a series of zero or more blanks.
	% The cut makes it greedy: once a blank is consumed it is never given back.

	blank0 --> [C], {char_type(C, blank)}, !, blank0.
	blank0 --> [].

	% Several kinds of tokens.
	% This is where lists of character codes get converted into atoms or numbers.
	% special//1 is tried first and token_list//1 commits to the first token
	% found, so a comma or a point that starts a token is returned as a special
	% token; the comma and decimal point rules of numeral//1 only take effect
	% after at least one digit has been read.

	token(T) --> special(L), {atom_codes(T, L)}.
	token(T) --> word(W), {atom_codes(T, W)}.
	token(T) --> numeral(N), {number_codes(T, N)}.

	% A word is a series of one or more letters.
	% The rules are ordered so that we first try to gather as many
	% characters into one word as possible.

	word([L| Rest]) --> letter(L), word(Rest).
	word([L]) --> letter(L).

	% A numeral is a list of characters that constitute a number.
	% The argument of numeral(...) is the list of character codes.
	% Thousands separators are recognized as a comma followed by exactly
	% three digits; the comma itself is dropped from the argument so that
	% number_codes/2 can convert the result (e.g. 1,048,576.24 gives the
	% codes of 1048576.24).
	% Terminals are written as explicit character codes (44 is the comma,
	% 46 is the point) so that the grammar does not depend on how the
	% double_quotes flag makes the compiler read "," and ".".

	numeral([C1, C2, C3| N]) --> [44], digit(C1), digit(C2), digit(C3), numeral(N).
	numeral([C1, C2, C3]) --> [44], digit(C1), digit(C2), digit(C3).
	numeral([C| N]) --> digit(C), numeral(N).	% multiple digits
	numeral([C]) --> digit(C).					% single digit
	numeral(N) --> decimal_part(N).				% decimal point and more digits

	% A decimal part is a point followed by one or more digits.
	% The point is kept in the argument.

	decimal_part([46| Rest]) --> [46], digit_string(Rest).

	% A digit string is a series of one or more digits, longest match first.

	digit_string([D| N]) --> digit(D), digit_string(N).
	digit_string([D]) --> digit(D).

	% Various kinds of characters...

	digit(C) --> [C], {char_type(C, numeric)}.

	special([C]) --> [C], {char_type(C, special)}.

	letter(C) --> [C], {char_type(C, lowercase)}.
	letter(C) --> [U], {char_type(U, uppercase), C is U + 32}.	% Conversion to lowercase

	% char_type(+Code, ?Type)
	% Classifies a character (ASCII code) as blank, numeric, uppercase, lowercase, or special.
	% Adapted from Covington 1994.
	% In each clause the cut comes before the unification of Type: once the
	% range test succeeds the clause is committed, so a call with Type already
	% bound to another class fails instead of falling through to the final
	% catch-all clause.

	char_type(Code, Type) :-	% blanks, other ctrl codes 
		Code =< 32,
		!,
		Type = blank.
	
	char_type(Code, Type) :-	% digits
		48 =< Code, Code =< 57,
		!,
		Type = numeric.

	char_type(Code, Type) :-	% lowercase letters
		97 =< Code, Code =< 122,
		!,
		Type = lowercase.

	char_type(Code, Type) :-	% uppercase letters
		65 =< Code, Code =< 90,
		!,
		Type = uppercase.

	char_type(_, special).	% all others

:- end_object.