115 lines
3.2 KiB
Plaintext
115 lines
3.2 KiB
Plaintext
|
|
||
|
% Natural Language Processing in Prolog using Definite Clause Grammar rules
|
||
|
%
|
||
|
% This example is a straightforward adaptation of the original plain Prolog
|
||
|
% code described in the paper:
|
||
|
%
|
||
|
% Tokenization using DCG Rules
|
||
|
% Michael A. Covington
|
||
|
% Artificial Intelligence Center
|
||
|
% The University of Georgia
|
||
|
% Athens, Georgia 30602-7415 U.S.A.
|
||
|
% 2000 April 21
|
||
|
%
|
||
|
% A copy of the paper is available at:
|
||
|
%
|
||
|
% http://www.ai.uga.edu/~mc/projpaper.ps
|
||
|
%
|
||
|
% Usage example:
|
||
|
%
|
||
|
% | ?- tokenizer::tokens(" We owe $1,048,576.24 to Agent 007 for Version 3.14159! ", Tokens).
|
||
|
|
||
|
:- object(tokenizer).

	:- info([
		version is 1.0,
		date is 2006/2/11,
		author is 'Michael A. Covington',
		comment is 'Natural language tokenizer example using DCG rules.'
	]).

	:- public(tokens/2).
	:- mode(tokens(+string, -list), zero_or_more).
	:- info(tokens/2, [
		comment is 'Parses a string into a list of tokens.',
		argnames is ['String', 'Tokens']
	]).

	% Entry point: the whole of String (a list of character codes) must be
	% consumed by the token_list//1 grammar; nothing may be left over.
	tokens(String, Tokens) :-
		phrase(token_list(Tokens), String, []).

	% token_list(-Tokens)//
	% Zero or more tokens (atoms and numbers), each optionally preceded by
	% blanks; trailing blanks are skipped as well.
	% Committing (cut) to the first parse of each token keeps the longest
	% match, because the token rules enumerate longer matches first.
	token_list([Token| Tokens]) -->
		blank0,
		token(Token),
		!,
		token_list(Tokens).
	token_list([]) -->
		blank0.

	% blank0//
	% Skips zero or more blank characters, committing to every blank found.
	blank0 -->
		(	[Code], {char_type(Code, blank)} ->
			blank0
		;	[]
		).

	% token(-Token)//
	% The kinds of tokens, tried in this order: a single special character,
	% a word, a numeral. This is where the collected character codes are
	% converted into an atom or a number.
	token(Atom) -->
		special(Codes),
		{atom_codes(Atom, Codes)}.
	token(Atom) -->
		word(Codes),
		{atom_codes(Atom, Codes)}.
	token(Number) -->
		numeral(Codes),
		{number_codes(Number, Codes)}.

	% word(-Codes)//
	% One or more letters, returned as lowercase character codes.
	% The recursive alternative comes first so that as many letters as
	% possible are gathered into one word before shorter parses are tried.
	word([Letter| Letters]) -->
		letter(Letter),
		(	word(Letters)
		;	{Letters = []}
		).

	% numeral(-Codes)//
	% The character codes that constitute a number. Thousands separators
	% (a comma followed by exactly three digits) are consumed but left out
	% of Codes; a decimal part keeps its decimal point.
	% Alternatives are ordered so that longer parses are found first.
	numeral([Digit1, Digit2, Digit3| More]) -->		% ",ddd" group
		",", digit(Digit1), digit(Digit2), digit(Digit3),
		(	numeral(More)
		;	{More = []}
		).
	numeral([Digit| More]) -->						% one or more digits
		digit(Digit),
		(	numeral(More)
		;	{More = []}
		).
	numeral(Codes) -->								% decimal point and more digits
		decimal_part(Codes).

	% decimal_part(-Codes)//
	% A decimal point followed by at least one digit.
	decimal_part([0'.| Digits]) -->
		".", digit_string(Digits).

	% digit_string(-Codes)//
	% One or more digits, longest parse first.
	digit_string([Digit| Digits]) -->
		digit(Digit),
		(	digit_string(Digits)
		;	{Digits = []}
		).

	% Single-character recognizers.

	digit(Code) -->
		[Code],
		{char_type(Code, numeric)}.

	special([Code]) -->
		[Code],
		{char_type(Code, special)}.

	letter(Code) -->
		[Code],
		{char_type(Code, lowercase)}.
	letter(Lower) -->								% uppercase is folded to lowercase
		[Upper],
		{char_type(Upper, uppercase), Lower is Upper + 0'a - 0'A}.

	% char_type(+Code, ?Type)
	% Classifies a character (ASCII code) as blank, numeric, lowercase,
	% uppercase, or special. The first matching range decides the class;
	% Type is unified only after that decision, so calling with Type bound
	% is a plain membership test.
	% Adapted from Covington 1994.
	char_type(Code, Type) :-
		(	Code =< 32 ->						% space and control codes
			Type = blank
		;	0'0 =< Code, Code =< 0'9 ->
			Type = numeric
		;	0'a =< Code, Code =< 0'z ->
			Type = lowercase
		;	0'A =< Code, Code =< 0'Z ->
			Type = uppercase
		;	Type = special						% all others
		).

:- end_object.
|