264 lines
7.3 KiB
Plaintext
264 lines
7.3 KiB
Plaintext
|
%%%%
|
||
|
%%%% Profile HMM --- phmm.psm
|
||
|
%%%%
|
||
|
%%%% Copyright (C) 2004,2006,2007,2008
|
||
|
%%%% Sato Laboratory, Dept. of Computer Science,
|
||
|
%%%% Tokyo Institute of Technology
|
||
|
|
||
|
%% Profile HMMs are a variant of HMMs that have three types of states,
|
||
|
%% i.e. `match state',`insert state' and `delete state.' Match states
|
||
|
%% constitute an HMM that outputs a `true' string. Insertion states
|
||
|
%% emit a symbol additionally to the `true' string whereas delete (skip)
|
||
|
%% states emit no symbol.
|
||
|
%%
|
||
|
%% Profile HMMs are used to align amino-acid sequences by inserting
|
||
|
%% and skipping symbols as well as matching symbols. For example
|
||
|
%% amino-acid sequences below
|
||
|
%%
|
||
|
%% HLKIANRKDKHHNKEFGGHHLA
|
||
|
%% HLKATHRKDQHHNREFGGHHLA
|
||
|
%% VLKFANRKSKHHNKEMGAHHLA
|
||
|
%% ...
|
||
|
%%
|
||
|
%% are aligned by the profile HMM program in this file as follows.
|
||
|
%%
|
||
|
%% -HLKIA-NRKDK-H-H----NKEFGGHH-LA
|
||
|
%% -HLK-A-T-HRK-DQHHN--R-EFGGHH-LA
|
||
|
%% -VLKFA-NRKSK-H-H----NKEMGAHH-LA
|
||
|
%% ...
|
||
|
|
||
|
%%-------------------------------------
|
||
|
%% Quick start : sample session, align the sample data in phmm.dat.
|
||
|
%%
|
||
|
%% To run on an interactive session:
|
||
|
%% ?- prism(phmm),go. (ML/MAP)
|
||
|
%% ?- prism(phmm),go_vb. (variational Bayes)
|
||
|
%%
|
||
|
%% To perform a batch execution:
|
||
|
%% > upprism phmm
|
||
|
|
||
|
go :-
|
||
|
read_goals(Gs,'phmm.dat'), % Read the sequence data from phmm.dat.
|
||
|
learn(Gs), % Learn parameters from the data.
|
||
|
wmag(Gs). % Compute viterbi paths using the learned
|
||
|
% parameters and aligns sequences in Gs.
|
||
|
|
||
|
% To enable variational Bayes, we need some additional flag settings:
|
||
|
go_vb :-
|
||
|
set_prism_flag(learn_mode,both),
|
||
|
set_prism_flag(viterbi_mode,hparams),
|
||
|
set_prism_flag(reset_hparams,on),
|
||
|
go.
|
||
|
|
||
|
prism_main :- go.
|
||
|
%prism_main :- go_vb.
|
||
|
|
||
|
|
||
|
%%%--------------------- model ---------------------
|
||
|
|
||
|
observe(Sequence) :- hmm(Sequence,start).
|
||
|
|
||
|
hmm([],end).
|
||
|
hmm(Sequence,State) :-
|
||
|
State \== end,
|
||
|
msw(move_from(State),NextState),
|
||
|
msw(emit_at(State), Symbol),
|
||
|
( Symbol = epsilon ->
|
||
|
hmm( Sequence, NextState )
|
||
|
; Sequence = [Symbol|TailSeq],
|
||
|
hmm( TailSeq , NextState )
|
||
|
).
|
||
|
|
||
|
amino_acids(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R',
|
||
|
'S','T','V','W','X','Y']).
|
||
|
hmm_len(17).
|
||
|
|
||
|
%%%--------------------- values ---------------------
|
||
|
|
||
|
values(move_from(State),Values) :-
|
||
|
hmm_len(Len),
|
||
|
get_index(State,X),
|
||
|
( 0 =< X, X < Len ->
|
||
|
Y is X + 1,
|
||
|
Values = [insert(X),match(Y),delete(Y)]
|
||
|
; Values = [insert(X),end] ).
|
||
|
|
||
|
values(emit_at(State),Vs) :-
|
||
|
((State = insert(_) ; State = match(_)) ->
|
||
|
amino_acids(Vs)
|
||
|
; Vs = [epsilon] ).
|
||
|
|
||
|
%%%--------------------- set_sw ---------------------
|
||
|
|
||
|
:- init_set_sw.
|
||
|
|
||
|
init_set_sw :-
|
||
|
% tell('/dev/null'), % Suppress output (on Linux only)
|
||
|
set_sw( move_from(start) ),
|
||
|
set_sw( move_from(insert(0)) ),
|
||
|
set_sw( emit_at(start) ),
|
||
|
set_sw( emit_at(insert(0)) ),
|
||
|
hmm_len(Len),
|
||
|
% told,
|
||
|
init_set_sw(Len).
|
||
|
|
||
|
init_set_sw(0).
|
||
|
init_set_sw(X) :-
|
||
|
X > 0,
|
||
|
set_sw( move_from(insert(X)) ),
|
||
|
set_sw( move_from(match(X)) ),
|
||
|
set_sw( move_from(delete(X)) ),
|
||
|
set_sw( emit_at(insert(X)) ),
|
||
|
set_sw( emit_at(match(X)) ),
|
||
|
set_sw( emit_at(delete(X)) ),
|
||
|
Y is X - 1,
|
||
|
init_set_sw(Y).
|
||
|
|
||
|
%%%--------------------- estimation ---------------------
|
||
|
%% most likely path
|
||
|
%% mlpath(['A','E'],Path) => Path = [start,match(1),end]
|
||
|
|
||
|
mlpath(Sequence,Path):-
|
||
|
mlpath(Sequence,Path,_).
|
||
|
mlpath(Sequence,Path,Prob):-
|
||
|
viterbif(hmm(Sequence,start),Prob,Nodes),
|
||
|
nodes2path(Nodes,Path).
|
||
|
|
||
|
nodes2path([Node|Nodes],[State|Path]):-
|
||
|
Node = node(hmm(_,State),_),
|
||
|
nodes2path(Nodes,Path).
|
||
|
nodes2path([],[]).
|
||
|
|
||
|
mlpaths([Seq|Seqs],[Path|Paths], X):-
|
||
|
mlpath(Seq,Path),
|
||
|
X= [P|_], writeln(P),
|
||
|
stop_low_level_trace,
|
||
|
mlpaths(Seqs,Paths, X).
|
||
|
mlpaths([],[],_).
|
||
|
|
||
|
%%%--------------------- alignment ---------------------
|
||
|
|
||
|
wmag(Gs):-
|
||
|
seqs2goals(S,Gs),wma(S).
|
||
|
wma(Seqs):-
|
||
|
write_multiple_alignments(Seqs).
|
||
|
write_multiple_alignments(Seqs):-
|
||
|
nl,
|
||
|
write('search Viterbi paths...'),nl,
|
||
|
mlpaths(Seqs,Paths,Paths),
|
||
|
write('done.'),
|
||
|
nl,
|
||
|
write('------------ALIGNMENTS------------'),
|
||
|
nl,
|
||
|
write_multiple_alignments( Seqs, Paths ),
|
||
|
write('----------------------------------'),
|
||
|
nl.
|
||
|
|
||
|
make_max_length_list([Path|Paths],MaxLenList) :-
|
||
|
make_max_length_list(Paths, TmpLenList),
|
||
|
make_length_list(Path,LenList),
|
||
|
marge_len_list(LenList,TmpLenList,MaxLenList).
|
||
|
make_max_length_list([Path],MaxLenList) :-
|
||
|
!,make_length_list(Path,MaxLenList).
|
||
|
|
||
|
marge_len_list([H1|T1],[H2|T2],[MargedH|MargedT]) :-
|
||
|
max(MargedH,[H1,H2]),
|
||
|
marge_len_list(T1,T2,MargedT).
|
||
|
marge_len_list([],[],[]).
|
||
|
|
||
|
%% make_length_list([start,insert(0),match(1),end],LenList)
|
||
|
%% -> LenList = [2,1]
|
||
|
%% make_length_list([start,delete(1),insert(1),insert(1),end],LenList)
|
||
|
%% -> LenList = [1,1]
|
||
|
|
||
|
make_length_list(Path,[Len|LenList]) :-
|
||
|
count_emission(Path,Len,NextIndexPath),
|
||
|
make_length_list(NextIndexPath,LenList).
|
||
|
make_length_list([end],[]).
|
||
|
|
||
|
count_emission(Path,Count,NextIndexPath) :-
|
||
|
Path = [State|_],
|
||
|
get_index(State,Index),
|
||
|
count_emission2(Path,Count,Index,NextIndexPath).
|
||
|
|
||
|
%% count_emission2([start,insert(0),match(1),end],Count,0,NextIndexPath)
|
||
|
%% -> Count = 2, NextIndexPath = [match(1),end]
|
||
|
%% count_emission2([delete(1),insert(1),insert(1),end],Count,1,NextIndexPath)
|
||
|
%% -> Count = 2, NextIndexPath = [end]
|
||
|
|
||
|
count_emission2([State|Path],Count,Index,NextIndexPath) :-
|
||
|
( get_index(State,Index) ->
|
||
|
count_emission2( Path, Count2, Index, NextIndexPath ),
|
||
|
( (State = delete(_); State==start) ->
|
||
|
Count = Count2
|
||
|
; Count is Count2 + 1 )
|
||
|
; Count = 0,
|
||
|
NextIndexPath = [State|Path]
|
||
|
).
|
||
|
|
||
|
write_multiple_alignments(Seqs,Paths) :-
|
||
|
make_max_length_list(Paths,LenList),
|
||
|
write_multiple_alignments(Seqs,Paths,LenList).
|
||
|
write_multiple_alignments([Seq|Seqs],[Path|Paths],LenList) :-
|
||
|
write_alignment(Seq,Path,LenList),
|
||
|
write_multiple_alignments(Seqs,Paths,LenList).
|
||
|
write_multiple_alignments([],[],_).
|
||
|
|
||
|
write_alignment(Seq,Path,LenList) :-
|
||
|
write_alignment(Seq,Path,LenList,0).
|
||
|
|
||
|
write_alignment([],[end],[],_):- !,nl.
|
||
|
write_alignment(Seq,[State|Path],LenList,Index) :-
|
||
|
get_index(State,Index),!,
|
||
|
( (State = delete(_) ; State == start) ->
|
||
|
write_alignment( Seq, Path, LenList, Index )
|
||
|
; Seq = [Symbol|Seq2],
|
||
|
LenList = [Len|LenList2],
|
||
|
write(Symbol),
|
||
|
Len2 is Len - 1,
|
||
|
write_alignment(Seq2,Path,[Len2|LenList2],Index)
|
||
|
).
|
||
|
write_alignment(Seq,[State|Path],LenList,Index) :-
|
||
|
LenList = [Len|LenList2],
|
||
|
Index2 is Index + 1,
|
||
|
pad(Len),
|
||
|
write_alignment(Seq,[State|Path],LenList2,Index2).
|
||
|
|
||
|
pad(Len) :-
|
||
|
Len > 0,
|
||
|
write('-'),
|
||
|
Len2 is Len - 1,!,
|
||
|
pad(Len2).
|
||
|
pad(0).
|
||
|
|
||
|
%%%--------------------- utility ---------------------
|
||
|
|
||
|
get_index(State,Index) :-
|
||
|
(State=match(_),!,State=match(Index));
|
||
|
(State=insert(_),!,State=insert(Index));
|
||
|
(State=delete(_),!,State=delete(Index));
|
||
|
(State=start,!,Index=0);
|
||
|
(State=end,!,hmm_len(X),Index is X+1).
|
||
|
|
||
|
seqs2goals([Seq|Seqs],[Goal|Goals]) :-
|
||
|
Goal = observe(Seq),
|
||
|
seqs2goals(Seqs,Goals).
|
||
|
seqs2goals([],[]).
|
||
|
|
||
|
max(Max,[Head|Tail]) :-
|
||
|
max(Tmp,Tail),!,
|
||
|
( Tmp > Head -> Max = Tmp ; Max = Head ).
|
||
|
max(Max,[Max]).
|
||
|
|
||
|
read_goals(Goals,FileName) :-
|
||
|
see(FileName),
|
||
|
read_goals(Goals),
|
||
|
seen.
|
||
|
read_goals(Goals) :-
|
||
|
read(Term),
|
||
|
( Term = end_of_file ->
|
||
|
Goals = []
|
||
|
; Goals = [Term|Goals1],
|
||
|
read_goals(Goals1)
|
||
|
).
|