git-svn-id: https://yap.svn.sf.net/svnroot/yap/trunk@1580 b08c6af1-5177-4d33-ba66-4b1c6b8b522a
		
			
				
	
	
		
			115 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			115 lines
		
	
	
		
			3.2 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
 | 
						|
% Natural Language Processing in Prolog using Definite Clause Grammar rules
% 
% This example is a straightforward adaptation of the original plain Prolog 
% code described in the paper:
%
%	Tokenization using DCG Rules 
%	Michael A. Covington 
%	Artificial Intelligence Center 
%	The University of Georgia 
%	Athens, Georgia 30602-7415 U.S.A. 
%	2000 April 21
%
% A copy of the paper is available at:
%
%	http://www.ai.uga.edu/~mc/projpaper.ps
%
% Usage example:
%
%	| ?- tokenizer::tokens(" We owe $1,048,576.24 to Agent 007 for Version 3.14159! ", Tokens).
 | 
						|
 | 
						|
:- object(tokenizer).

	:- info([
		version is 1.0,
		date is 2006/2/11,
		author is 'Michael A. Covington',
		comment is 'Natural language tokenizer example using DCG rules.']).

	:- public(tokens/2).
	% The cuts in token_list//1 and blank0//0 prune every alternative parse,
	% so a call yields at most one solution.
	:- mode(tokens(+string, -list), zero_or_one).
	:- info(tokens/2, [
		comment is 'Parses a string into a list of tokens.',
		argnames is ['String', 'Tokens']]).

	% tokens(+String, -Tokens)
	% String must be a list of character codes: char_type/2 below compares
	% its elements arithmetically. The whole list must be consumed by
	% token_list//1 for the call to succeed.

	tokens(String, Tokens) :-
		phrase(token_list(Tokens), String).

	% A token list is a series of zero or more tokens.
	% Its argument consists of the list of tokens, as atoms and numbers.
	% The cut ensures that the maximum number of characters is gathered into each token.

	token_list([T| Rest]) --> blank0, token(T), !, token_list(Rest).
	token_list([]) --> blank0.

	% blank0 is a series of zero or more blanks.
	% The cut commits to skipping each blank, so trailing blanks are never
	% handed back on backtracking.

	blank0 --> [C], {char_type(C, blank)}, !, blank0.
	blank0 --> [].

	% Several kinds of tokens.
	% This is where lists of characters get converted into atoms or numbers.
	% special//1 is tried first; as "," and "." are classified as special by
	% char_type/2, a numeral reached through token_list//1 always starts with
	% a digit.

	token(T) --> special(L), {atom_codes(T, L)}.
	token(T) --> word(W), {atom_codes(T, W)}.
	token(T) --> numeral(N), {number_codes(T, N)}.

	% A word is a series of one or more letters.
	% The rules are ordered so that we first try to gather as many
	% characters into one word as possible.

	word([L| Rest]) --> letter(L), word(Rest).
	word([L]) --> letter(L).

	% A numeral is a list of characters that constitute a number.
	% The argument of numeral(...) is the list of character codes.
	% A comma followed by exactly three digits is accepted as a thousands
	% separator: the comma is consumed but left out of the resulting codes.
	% Terminals are written as explicit character codes (44 is ",") so that
	% the grammar does not depend on the double_quotes flag of the backend
	% Prolog compiler.

	numeral([C1, C2, C3| N]) --> [44], digit(C1), digit(C2), digit(C3), numeral(N).
	numeral([C1, C2, C3]) --> [44], digit(C1), digit(C2), digit(C3).
	numeral([C| N]) --> digit(C), numeral(N).	% multiple digits
	numeral([C]) --> digit(C).					% single digit
	numeral(N) --> decimal_part(N).				% decimal point and more digits

	% A decimal part is a period (character code 46) followed by one or
	% more digits; the period is kept in the resulting codes.

	decimal_part([46| Rest]) --> [46], digit_string(Rest).

	% A digit string is a series of one or more digits, longest match first.

	digit_string([D| N]) --> digit(D), digit_string(N).
	digit_string([D]) --> digit(D).

	% Various kinds of characters...

	digit(C) --> [C], {char_type(C, numeric)}.

	special([C]) --> [C], {char_type(C, special)}.

	letter(C) --> [C], {char_type(C, lowercase)}.
	letter(C) --> [U], {char_type(U, uppercase), C is U + 32}.	% Conversion to lowercase

	% char_type(+Code, ?Type)
	% Classifies a character (ASCII code) as blank, numeric, uppercase, lowercase, or special.
	% Adapted from Covington 1994.
	% Each clause tests the code range first, commits with a cut, and only
	% then unifies Type, so a call with Type already bound fails cleanly
	% instead of falling through to the catch-all "special" clause.

	char_type(Code, Type) :-	% blanks, other ctrl codes
		Code =< 32,
		!,
		Type = blank.

	char_type(Code, Type) :-	% digits
		48 =< Code, Code =< 57,
		!,
		Type = numeric.

	char_type(Code, Type) :-	% lowercase letters
		97 =< Code, Code =< 122,
		!,
		Type = lowercase.

	char_type(Code, Type) :-	% uppercase letters
		65 =< Code, Code =< 90,
		!,
		Type = uppercase.

	char_type(_, special).	% all others

:- end_object.
 |