This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/prism/exs/phmm.psm

264 lines
7.3 KiB
Plaintext

%%%%
%%%% Profile HMM --- phmm.psm
%%%%
%%%% Copyright (C) 2004,2006,2007,2008
%%%% Sato Laboratory, Dept. of Computer Science,
%%%% Tokyo Institute of Technology
%% Profile HMMs are a variant of HMMs that have three types of states,
%% i.e. `match state',`insert state' and `delete state.' Match states
%% constitute an HMM that outputs a `true' string. Insertion states
%% emit a symbol additionally to the `true' string whereas delete (skip)
%% states emit no symbol.
%%
%% Profile HMMs are used to align amino-acid sequences by inserting
%% and skipping symbols as well as matching symbols. For example
%% amino-acid sequences below
%%
%% HLKIANRKDKHHNKEFGGHHLA
%% HLKATHRKDQHHNREFGGHHLA
%% VLKFANRKSKHHNKEMGAHHLA
%% ...
%%
%% are aligned by the profile HMM program in this file as follows.
%%
%% -HLKIA-NRKDK-H-H----NKEFGGHH-LA
%% -HLK-A-T-HRK-DQHHN--R-EFGGHH-LA
%% -VLKFA-NRKSK-H-H----NKEMGAHH-LA
%% ...
%%-------------------------------------
%% Quick start : sample session, align the sample data in phmm.dat.
%%
%% To run on an interactive session:
%% ?- prism(phmm),go. (ML/MAP)
%% ?- prism(phmm),go_vb. (variational Bayes)
%%
%% To perform a batch execution:
%% > upprism phmm
go :-
read_goals(Gs,'phmm.dat'), % Read the sequence data from phmm.dat.
learn(Gs), % Learn parameters from the data.
wmag(Gs). % Compute viterbi paths using the learned
% parameters and aligns sequences in Gs.
% To enable variational Bayes, we need some additional flag settings:
go_vb :-
set_prism_flag(learn_mode,both),
set_prism_flag(viterbi_mode,hparams),
set_prism_flag(reset_hparams,on),
go.
prism_main :- go.
%prism_main :- go_vb.
%%%--------------------- model ---------------------
observe(Sequence) :- hmm(Sequence,start).
hmm([],end).
hmm(Sequence,State) :-
State \== end,
msw(move_from(State),NextState),
msw(emit_at(State), Symbol),
( Symbol = epsilon ->
hmm( Sequence, NextState )
; Sequence = [Symbol|TailSeq],
hmm( TailSeq , NextState )
).
amino_acids(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R',
'S','T','V','W','X','Y']).
hmm_len(17).
%%%--------------------- values ---------------------
values(move_from(State),Values) :-
hmm_len(Len),
get_index(State,X),
( 0 =< X, X < Len ->
Y is X + 1,
Values = [insert(X),match(Y),delete(Y)]
; Values = [insert(X),end] ).
values(emit_at(State),Vs) :-
((State = insert(_) ; State = match(_)) ->
amino_acids(Vs)
; Vs = [epsilon] ).
%%%--------------------- set_sw ---------------------
:- init_set_sw.
init_set_sw :-
% tell('/dev/null'), % Suppress output (on Linux only)
set_sw( move_from(start) ),
set_sw( move_from(insert(0)) ),
set_sw( emit_at(start) ),
set_sw( emit_at(insert(0)) ),
hmm_len(Len),
% told,
init_set_sw(Len).
init_set_sw(0).
init_set_sw(X) :-
X > 0,
set_sw( move_from(insert(X)) ),
set_sw( move_from(match(X)) ),
set_sw( move_from(delete(X)) ),
set_sw( emit_at(insert(X)) ),
set_sw( emit_at(match(X)) ),
set_sw( emit_at(delete(X)) ),
Y is X - 1,
init_set_sw(Y).
%%%--------------------- estimation ---------------------
%% most likely path
%% mlpath(['A','E'],Path) => Path = [start,match(1),end]
mlpath(Sequence,Path):-
mlpath(Sequence,Path,_).
mlpath(Sequence,Path,Prob):-
viterbif(hmm(Sequence,start),Prob,Nodes),
nodes2path(Nodes,Path).
nodes2path([Node|Nodes],[State|Path]):-
Node = node(hmm(_,State),_),
nodes2path(Nodes,Path).
nodes2path([],[]).
mlpaths([Seq|Seqs],[Path|Paths], X):-
mlpath(Seq,Path),
X= [P|_], writeln(P),
stop_low_level_trace,
mlpaths(Seqs,Paths, X).
mlpaths([],[],_).
%%%--------------------- alignment ---------------------
wmag(Gs):-
seqs2goals(S,Gs),wma(S).
wma(Seqs):-
write_multiple_alignments(Seqs).
write_multiple_alignments(Seqs):-
nl,
write('search Viterbi paths...'),nl,
mlpaths(Seqs,Paths,Paths),
write('done.'),
nl,
write('------------ALIGNMENTS------------'),
nl,
write_multiple_alignments( Seqs, Paths ),
write('----------------------------------'),
nl.
make_max_length_list([Path|Paths],MaxLenList) :-
make_max_length_list(Paths, TmpLenList),
make_length_list(Path,LenList),
marge_len_list(LenList,TmpLenList,MaxLenList).
make_max_length_list([Path],MaxLenList) :-
!,make_length_list(Path,MaxLenList).
marge_len_list([H1|T1],[H2|T2],[MargedH|MargedT]) :-
max(MargedH,[H1,H2]),
marge_len_list(T1,T2,MargedT).
marge_len_list([],[],[]).
%% make_length_list([start,insert(0),match(1),end],LenList)
%% -> LenList = [2,1]
%% make_length_list([start,delete(1),insert(1),insert(1),end],LenList)
%% -> LenList = [1,1]
make_length_list(Path,[Len|LenList]) :-
count_emission(Path,Len,NextIndexPath),
make_length_list(NextIndexPath,LenList).
make_length_list([end],[]).
count_emission(Path,Count,NextIndexPath) :-
Path = [State|_],
get_index(State,Index),
count_emission2(Path,Count,Index,NextIndexPath).
%% count_emission2([start,insert(0),match(1),end],Count,0,NextIndexPath)
%% -> Count = 2, NextIndexPath = [match(1),end]
%% count_emission2([delete(1),insert(1),insert(1),end],Count,1,NextIndexPath)
%% -> Count = 2, NextIndexPath = [end]
count_emission2([State|Path],Count,Index,NextIndexPath) :-
( get_index(State,Index) ->
count_emission2( Path, Count2, Index, NextIndexPath ),
( (State = delete(_); State==start) ->
Count = Count2
; Count is Count2 + 1 )
; Count = 0,
NextIndexPath = [State|Path]
).
write_multiple_alignments(Seqs,Paths) :-
make_max_length_list(Paths,LenList),
write_multiple_alignments(Seqs,Paths,LenList).
write_multiple_alignments([Seq|Seqs],[Path|Paths],LenList) :-
write_alignment(Seq,Path,LenList),
write_multiple_alignments(Seqs,Paths,LenList).
write_multiple_alignments([],[],_).
write_alignment(Seq,Path,LenList) :-
write_alignment(Seq,Path,LenList,0).
write_alignment([],[end],[],_):- !,nl.
write_alignment(Seq,[State|Path],LenList,Index) :-
get_index(State,Index),!,
( (State = delete(_) ; State == start) ->
write_alignment( Seq, Path, LenList, Index )
; Seq = [Symbol|Seq2],
LenList = [Len|LenList2],
write(Symbol),
Len2 is Len - 1,
write_alignment(Seq2,Path,[Len2|LenList2],Index)
).
write_alignment(Seq,[State|Path],LenList,Index) :-
LenList = [Len|LenList2],
Index2 is Index + 1,
pad(Len),
write_alignment(Seq,[State|Path],LenList2,Index2).
pad(Len) :-
Len > 0,
write('-'),
Len2 is Len - 1,!,
pad(Len2).
pad(0).
%%%--------------------- utility ---------------------
get_index(State,Index) :-
(State=match(_),!,State=match(Index));
(State=insert(_),!,State=insert(Index));
(State=delete(_),!,State=delete(Index));
(State=start,!,Index=0);
(State=end,!,hmm_len(X),Index is X+1).
seqs2goals([Seq|Seqs],[Goal|Goals]) :-
Goal = observe(Seq),
seqs2goals(Seqs,Goals).
seqs2goals([],[]).
max(Max,[Head|Tail]) :-
max(Tmp,Tail),!,
( Tmp > Head -> Max = Tmp ; Max = Head ).
max(Max,[Max]).
read_goals(Goals,FileName) :-
see(FileName),
read_goals(Goals),
seen.
read_goals(Goals) :-
read(Term),
( Term = end_of_file ->
Goals = []
; Goals = [Term|Goals1],
read_goals(Goals1)
).