yap-6.3/Logtalk/examples/dcgs/tokenizer.lgt


% Natural Language Processing in Prolog using Definite Clause Grammar rules
% 
% This example is a straightforward adaptation of the original plain Prolog 
% code described in the paper:
%
%	Tokenization using DCG Rules 
%	Michael A. Covington 
%	Artificial Intelligence Center 
%	The University of Georgia 
%	Athens, Georgia 30602-7415 U.S.A. 
%	2000 April 21
%
% A copy of the paper is available at:
%
%	http://www.ai.uga.edu/~mc/projpaper.ps
%
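% Usage example (the double-quoted text must be read as a list of character
% codes, because the tokenizer classifies characters by comparing integer codes):
%
%	| ?- tokenizer::tokens(" We owe $1,048,576.24 to Agent 007 for Version 3.14159! ", Tokens).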

:- object(tokenizer).

	:- info([
		version is 1.0,
		date is 2006/2/11,
		author is 'Michael A. Covington',
		comment is 'Natural language tokenizer example using DCG rules.']).

	:- public(tokens/2).
	:- mode(tokens(+string, -list), zero_or_more).
	:- info(tokens/2, [
		comment is 'Parses a string into a list of tokens.',
		argnames is ['String', 'Tokens']]).

	% tokens(+String, -Tokens)
	% String is a list of character codes; Tokens is the resulting list of
	% atoms (words and special characters) and numbers (numerals).
	% The whole input must be consumed by token_list//1 for the call to succeed.

	tokens(String, Tokens) :-
		phrase(token_list(Tokens), String).

	% A token list is a series of zero or more tokens.
	% Its argument consists of the list of tokens, as atoms and numbers.
	% The cut ensures that the maximum number of characters is gathered into each token:
	% it commits to the first (longest) solution found for token//1, discarding the
	% choice points that would otherwise yield shorter tokens on backtracking.

	token_list([T| Rest]) --> blank0, token(T), !, token_list(Rest).
	token_list([]) --> blank0.

	% blank0 is a series of zero or more blanks.
	% The cut makes blank skipping greedy and deterministic.

	blank0 --> [C], {char_type(C, blank)}, !, blank0.
	blank0 --> [].

	% Several kinds of tokens.
	% This is where lists of characters get converted into atoms or numbers.
	% Special characters are tried first. As char_type/2 classifies both the
	% comma and the period as special, a token never starts with either of them;
	% the numeral//1 clauses that begin with a comma or a period are therefore
	% only reached through recursion, after at least one digit was consumed.

	token(T) --> special(L), {atom_codes(T, L)}.
	token(T) --> word(W), {atom_codes(T, W)}.
	token(T) --> numeral(N), {number_codes(T, N)}.

	% A word is a series of one or more letters.
	% The rules are ordered so that we first try to gather as many
	% characters into one word as possible.
	% The argument is the list of (lowercase) character codes.

	word([L| Rest]) --> letter(L), word(Rest).
	word([L]) --> letter(L).

	% A numeral is a list of characters that constitute a number.
	% The argument of numeral(...) is the list of character codes.
	% Terminals are written as explicit character codes (44 is the comma,
	% 46 is the period) so that matching does not depend on how the backend
	% Prolog compiler reads double-quoted text; the rest of this object
	% already works on integer codes.
	% A comma followed by three digits is consumed but not copied to the
	% argument, so that "1,048" yields the codes of "1048".

	numeral([C1, C2, C3| N]) --> [44], digit(C1), digit(C2), digit(C3), numeral(N).
	numeral([C1, C2, C3]) --> [44], digit(C1), digit(C2), digit(C3).
	numeral([C| N]) --> digit(C), numeral(N).	% multiple digits
	numeral([C]) --> digit(C).					% single digit
	numeral(N) --> decimal_part(N).				% decimal point and more digits

	% The decimal part is a period followed by one or more digits.
	% The period is kept in the argument so that number_codes/2 builds a float.

	decimal_part([46| Rest]) --> [46], digit_string(Rest).

	% A digit string is a series of one or more digits.
	% The rules are ordered so that we first try to gather as many
	% characters into one digit_string as possible.

	digit_string([D| N]) --> digit(D), digit_string(N).
	digit_string([D]) --> digit(D).

	% Various kinds of characters...

	digit(C) --> [C], {char_type(C, numeric)}.

	special([C]) --> [C], {char_type(C, special)}.

	letter(C) --> [C], {char_type(C, lowercase)}.
	letter(C) --> [U], {char_type(U, uppercase), C is U + 32}.	% Conversion to lowercase

	% char_type(+Code, ?Type)
	% Classifies a character (ASCII code) as blank, numeric, uppercase, lowercase, or special.
	% Adapted from Covington 1994.
	% In each clause the range test is followed by a cut and only then is Type
	% unified: once the code falls in a range, the classification is committed,
	% so a call with Type already bound (e.g. to special) fails instead of
	% falling through to the catch-all last clause.

	char_type(Code, Type) :-	% blanks, other ctrl codes 
		Code =< 32,
		!,
		Type = blank.
	
	char_type(Code, Type) :-	% digits
		48 =< Code, Code =< 57,
		!,
		Type = numeric.

	char_type(Code, Type) :-	% lowercase letters
		97 =< Code, Code =< 122,
		!,
		Type = lowercase.

	char_type(Code, Type) :-	% uppercase letters
		65 =< Code, Code =< 90,
		!,
		Type = uppercase.

	char_type(_, special).	% all others

:- end_object.
Logtalk 2.27.1 files. git-svn-id: https://yap.svn.sf.net/svnroot/yap/trunk@1580 b08c6af1-5177-4d33-ba66-4b1c6b8b522a 2006-03-26 17:31:34 +00:00
			`% Natural Language Processing in Prolog using Definite Clause Grammar rules`
			`%`
			`% This example is a straightforward adaptation of the original plain Prolog`
			`% code described in the paper:`
			`%`
			`% Tokenization using DCG Rules`
			`% Michael A. Covington`
			`% Artificial Intelligence Center`
			`% The University of Georgia`
			`% Athens, Georgia 30602-7415 U.S.A.`
			`% 2000 April 21`
			`%`
			`% A copy of the paper is available at:`
			`%`
			`% http://www.ai.uga.edu/~mc/projpaper.ps`
			`%`
			`% Usage example:`
			`%`
			`% \| ?- tokenizer::tokens(" We owe $1,048,576.24 to Agent 007 for Version 3.14159! ", Tokens).`

			`:- object(tokenizer).`

			`:- info([`
			`version is 1.0,`
			`date is 2006/2/11,`
			`author is 'Michael A. Covington',`
			`comment is 'Natural language tokenizer example using DCG rules.']).`

			`:- public(tokens/2).`
			`:- mode(tokens(+string, -list), zero_or_more).`
			`:- info(tokens/2, [`
			`comment is 'Parses a string into a list of tokens.',`
			`argnames is ['String', 'Tokens']]).`

			`tokens(String, Tokens) :-`
			`phrase(token_list(Tokens), String).`

			`% A token list is a series of zero or more tokens.`
			`% Its argument consists of the list of tokens, as atoms and numbers.`
			`% The cut ensures that the maximum number of characters is gathered into each token.`

			`token_list([T\| Rest]) --> blank0, token(T), !, token_list(Rest).`
			`token_list([]) --> blank0.`

			`% blank0 is a series of zero or more blanks.`

			`blank0 --> [C], {char_type(C, blank)}, !, blank0.`
			`blank0 --> [].`

			`% Several kinds of tokens.`
			`% This is where lists of characters get converted into atoms or numbers.`

			`token(T) --> special(L), {atom_codes(T, L)}.`
			`token(T) --> word(W), {atom_codes(T, W)}.`
			`token(T) --> numeral(N), {number_codes(T, N)}.`

			`% A word is a series of one or more letters.`
			`% The rules are ordered so that we first try to gather as many`
			`% characters into one digit_string as possible.`

			`word([L\| Rest]) --> letter(L), word(Rest).`
			`word([L]) --> letter(L).`

			`% A numeral is a list of characters that constitute a number.`
			`% The argument of numeral(...) is the list of character codes.`

			`numeral([C1, C2, C3\| N]) --> ",", digit(C1), digit(C2), digit(C3), numeral(N).`
			`numeral([C1, C2, C3]) --> ",", digit(C1), digit(C2), digit(C3).`
			`numeral([C\| N]) --> digit(C), numeral(N). % multiple digits`
			`numeral([C]) --> digit(C). % single digit`
			`numeral(N) --> decimal_part(N). % decimal point and more digits`

			`decimal_part([46\| Rest]) --> ".", digit_string(Rest).`

			`digit_string([D\| N]) --> digit(D), digit_string(N).`
			`digit_string([D]) --> digit(D).`

			`% Various kinds of characters...`

			`digit(C) --> [C], {char_type(C, numeric)}.`

			`special([C]) --> [C], {char_type(C, special)}.`

			`letter(C) --> [C], {char_type(C, lowercase)}.`
			`letter(C) --> [U], {char_type(U, uppercase), C is U + 32}. % Conversion to lowercase`

			`% char_type(+Code, ?Type)`
			`% Classifies a character (ASCII code) as blank, numeric, uppercase, lowercase, or special.`
			`% Adapted from Covington 1994.`

			`char_type(Code, Type) :- % blanks, other ctrl codes`
			`Code =< 32,`
			`!,`
			`Type = blank.`

			`char_type(Code, Type) :- % digits`
			`48 =< Code, Code =< 57,`
			`!,`
			`Type = numeric.`

			`char_type(Code, Type) :- % lowercase letters`
			`97 =< Code, Code =< 122,`
			`!,`
			`Type = lowercase.`

			`char_type(Code, Type) :- % uppercase letters`
			`65 =< Code, Code =< 90,`
			`!,`
			`Type = uppercase.`

			`char_type(_, special). % all others`

			`:- end_object.`