This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/Logtalk/examples/dcgs/tokenizer.lgt

115 lines
3.2 KiB
Plaintext
Raw Normal View History

% Natural Language Processing in Prolog using Definite Clause Grammar rules
%
% This example is a straightforward adaptation of the original plain Prolog
% code described in the paper:
%
% Tokenization using DCG Rules
% Michael A. Covington
% Artificial Intelligence Center
% The University of Georgia
% Athens, Georgia 30602-7415 U.S.A.
% 2000 April 21
%
% A copy of the paper is available at:
%
% http://www.ai.uga.edu/~mc/projpaper.ps
%
% Usage example:
%
% | ?- tokenizer::tokens(" We owe $1,048,576.24 to Agent 007 for Version 3.14159! ", Tokens).
:- object(tokenizer).

	:- info([
		version is 1.0,
		date is 2006/2/11,
		author is 'Michael A. Covington',
		comment is 'Natural language tokenizer example using DCG rules.'
	]).

	:- public(tokens/2).
	:- mode(tokens(+string, -list), zero_or_more).
	:- info(tokens/2, [
		comment is 'Parses a string into a list of tokens.',
		argnames is ['String', 'Tokens']
	]).

	% tokens(+String, -Tokens)
	% Entry point. String is a list of character codes (its elements are
	% compared numerically by char_type/2). Succeeds when the whole list
	% can be parsed by token_list//1, binding Tokens to the result.
	tokens(String, Tokens) :-
		phrase(token_list(Tokens), String, []).

	% token_list(-Tokens)//
	% Zero or more tokens, each optionally preceded by blanks, followed by
	% optional trailing blanks. Tokens are returned as atoms and numbers.
	% The cut commits to the first parse found for each token; since word//1
	% and numeral//1 try their longer alternatives first, that first parse
	% is the one that consumes the most characters.
	token_list([Token| Tokens]) -->
		blank0, token(Token), !, token_list(Tokens).
	token_list([]) -->
		blank0.

	% blank0//
	% Skips a (possibly empty) run of blank characters. The cut makes the
	% skipping greedy: a blank is never left behind for a later rule.
	blank0 -->
		[Code], {char_type(Code, blank)}, !, blank0.
	blank0 -->
		[].

	% token(-Token)//
	% One token, tried in the order: special character, word, numeral.
	% Here the collected character codes are turned into an atom (special
	% characters and words) or into a number (numerals).
	token(Token) -->
		special(Codes), {atom_codes(Token, Codes)}.
	token(Token) -->
		word(Codes), {atom_codes(Token, Codes)}.
	token(Token) -->
		numeral(Codes), {number_codes(Token, Codes)}.

	% word(-Codes)//
	% One or more letters. The recursive rule comes first so that as many
	% letters as possible are gathered into a single word.
	word([Letter| Letters]) -->
		letter(Letter), word(Letters).
	word([Letter]) -->
		letter(Letter).

	% numeral(-Codes)//
	% The character codes that make up a number. A comma is accepted only
	% when it is immediately followed by three digits, and the comma itself
	% is not copied into Codes. A numeral may end with a decimal part.
	numeral([D1, D2, D3| Codes]) -->		% comma group, more to follow
		",", digit(D1), digit(D2), digit(D3), numeral(Codes).
	numeral([D1, D2, D3]) -->				% final comma group
		",", digit(D1), digit(D2), digit(D3).
	numeral([Digit| Codes]) -->				% a digit, more to follow
		digit(Digit), numeral(Codes).
	numeral([Digit]) -->					% final digit
		digit(Digit).
	numeral(Codes) -->						% decimal point and digits
		decimal_part(Codes).

	% decimal_part(-Codes)//
	% A decimal point followed by at least one digit; the point is kept
	% in Codes.
	decimal_part([0'.| Digits]) -->
		".", digit_string(Digits).

	% digit_string(-Digits)//
	% One or more digits, longest run first.
	digit_string([Digit| Digits]) -->
		digit(Digit), digit_string(Digits).
	digit_string([Digit]) -->
		digit(Digit).

	% Single-character non-terminals, classified through char_type/2.

	% digit(-Code)//
	digit(Code) -->
		[Code], {char_type(Code, numeric)}.

	% special(-Codes)//
	% Exactly one special character, returned as a one-element list.
	special([Code]) -->
		[Code], {char_type(Code, special)}.

	% letter(-Code)//
	% A letter, always returned in lowercase: uppercase input is shifted
	% by the distance between 'A' and 'a'.
	letter(Code) -->
		[Code], {char_type(Code, lowercase)}.
	letter(Lower) -->
		[Upper], {char_type(Upper, uppercase), Lower is Upper + (0'a - 0'A)}.

	% char_type(+Code, ?Type)
	% Classifies a character code as blank, numeric, lowercase, uppercase,
	% or special. The tests are tried in that order and the first one that
	% holds decides the class, so the call fails (rather than falling back
	% to special) when Type is bound to a different class.
	% Adapted from Covington 1994.
	char_type(Code, Type) :-
		(	Code =< 32 ->						% blanks, other ctrl codes
			Type = blank
		;	0'0 =< Code, Code =< 0'9 ->			% digits
			Type = numeric
		;	0'a =< Code, Code =< 0'z ->			% lowercase letters
			Type = lowercase
		;	0'A =< Code, Code =< 0'Z ->			% uppercase letters
			Type = uppercase
		;	Type = special						% all others
		).

:- end_object.