%%%%
%%%% Evaluation of a naive Bayes classifier for `votes' dataset
%%%% --- votes.psm
%%%%
%%%% Copyright (C) 2009
%%%% Sato Laboratory, Dept. of Computer Science,
%%%% Tokyo Institute of Technology

%% In this program, we conduct n-fold cross validation of a naive Bayes
%% classifier. The program was created to demonstrate the usefulness of
%% the built-in predicates introduced since version 1.12. The target
%% dataset is the congressional voting records (`votes') dataset, which
%% is available from the UCI machine learning repository
%% (http://archive.ics.uci.edu/ml/).
%%
%% This program shows that, by using new built-in predicates such as
%% maplist/5, avglist/2, random_shuffle/2 and so on, we can make the
%% utility part as compact as the modeling part. One may also find
%% that n-fold cross validation is implemented here just by combining
%% general-purpose built-ins.
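%%
%% To give a feel for these built-ins, the queries below sketch their
%% behavior on small inputs (illustrative only; maplist/5 copies its
%% goal and applies it to corresponding elements of the two lists):
%%
%% ?- maplist(X,Y,Y is X*X,[1,2,3],Sqs).  % Sqs = [1,4,9]
%% ?- avglist([1,4,9],Avg).               % Avg = 4.666...
%% ?- random_shuffle([a,b,c,d],L).        % L: a random permutation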
%%-------------------------------------
%% Quick start : sample session
%%
%% (Preparation: Download the data file `house-votes-84.data' from the
%% UCI ML repository, and put it `as-is' in the current directory)
%%
%% ?- prism(votes),votes_learn. % Learn parameters from the whole dataset
%%
%% ?- prism(votes),votes_cv(10). % Conduct 10-fold cross validation
%%
%%-------------------------------------
%% Declarations
values(class,[democrat,republican]). % class labels
values(attr(_,_),[y,n]). % all attributes have two values: y or n
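%%
%% For instance, a single switch can be drawn in sampling mode with
%% PRISM's sample/1 (a sketch; parameters are uniform until learned):
%%
%% ?- sample(msw(class,C)).             % C is democrat or republican
%% ?- sample(msw(attr(3,democrat),V)).  % V is y or n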
%%-------------------------------------
%% Modeling part (a naive Bayes model)
%%
%% [Note]
%% According to `house-votes-84.names', the data description file for
%% the `votes' dataset, '?' simply denotes that the value is neither
%% "yea" nor "nay". In this program, on the other hand, we treat '?'
%% as a missing value just for demonstration purposes.
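%% For example, for an observed goal nbayes(democrat,[y,'?',n,...]),
%% the second attribute contributes only msw(attr(2,democrat),_): the
%% switch is drawn, but its outcome is left unobserved (see choose/3
%% below).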
nbayes(C,Vals):- msw(class,C),nbayes(1,C,Vals).

nbayes(_,_,[]).
nbayes(J,C,[V|Vals]):-
    choose(J,C,V),
    J1 is J+1,
    nbayes(J1,C,Vals).

choose(J,C,V):-
    ( V == '?' -> msw(attr(J,C),_)   % handle '?' as a missing value
    ; msw(attr(J,C),V0),
      V = V0
    ).
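
%% A quick sanity check of the model in sampling mode (a sketch; the
%% `votes' dataset has 16 attributes, hence the 16-element list):
%%
%% ?- length(Vs,16),sample(nbayes(C,Vs)).  % draw one synthetic record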
%%-------------------------------------
%% Utility part:

%% Batch routine for simple parameter learning
votes_learn:-
    load_data_file(Gs),
    learn(Gs).
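
%% After learning, the fitted switch distributions can be inspected
%% with PRISM's show_sw/0, e.g.:
%%
%% ?- votes_learn,show_sw.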

%% Batch routine for N-fold cross validation
votes_cv(N):-
    random_set_seed(81729),  % Fix the random seed to keep the same splitting
    load_data_file(Gs0),     % Load the entire data
    random_shuffle(Gs0,Gs),  % Randomly reorder the data
    numlist(1,N,Ks),         % Get Ks = [1,...,N] (B-Prolog built-in)
    maplist(K,Rate,votes_cv(Gs,K,N,Rate),Ks,Rates),
                             % Call votes_cv/4 for K=1...N
    avglist(Rates,AvgRate),  % Get the average of the precisions
    maplist(K,Rate,format("Test #~d: ~2f%~n",[K,Rate*100]),Ks,Rates),
    format("Average: ~2f%~n",[AvgRate*100]).

%% Subroutine for learning and testing on the K-th split (K = 1...N)
votes_cv(Gs,K,N,Rate):-
    format("<<<< Test #~d >>>>~n",[K]),
    separate_data(Gs,K,N,Gs0,Gs1),  % Gs0: training data, Gs1: test data
    learn(Gs0),                     % Learn by PRISM's built-in learn/1
    maplist(nbayes(C,Vs),R,
            (viterbig(nbayes(C0,Vs)),(C0==C->R=1;R=0)),Gs1,Rs),
                          % Predict the class by viterbig/1 for each test
                          % example and compare it with the answer label
    avglist(Rs,Rate),     % Get the accuracy for the K-th split
    format("Done (~2f%).~n~n",[Rate*100]).

%% Split the entire data (Data) into training data (Train) and test
%% data (Test) for the K-th evaluation (K = 1...N)
separate_data(Data,K,N,Train,Test):-
    length(Data,L),
    L0 is L*(K-1)//N,  % L0: offset of the test data (//: integer division)
    L1 is L*K//N-L0,   % L1: size of the test data
    splitlist(Train0,Rest,Data,L0),  % Length of Train0 = L0
    splitlist(Test,Train1,Rest,L1),  % Length of Test = L1
    append(Train0,Train1,Train).
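
%% Worked example: with the full `votes' dataset (435 records) and
%% N=10, K=1 gives L0=0 and L1=43, while K=10 gives L0=391 and L1=44;
%% every record thus appears in exactly one test split.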

%% Load the `votes' data in CSV form and convert it into suitable
%% Prolog terms
load_data_file(Gs):-
    load_csv('house-votes-84.data',Gs0),
    maplist(csvrow([C|Vs]),nbayes(C,Vs),true,Gs0,Gs).
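
%% For example, a line "republican,n,y,..." in `house-votes-84.data' is
%% loaded by load_csv/2 as csvrow([republican,n,y,...]) and converted by
%% the maplist/5 call above into the goal nbayes(republican,[n,y,...]).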