yap-6.3/packages/prism/exs/phmm.psm

%%%%
%%%%  Profile HMM --- phmm.psm
%%%%      
%%%%  Copyright (C) 2004,2006,2007,2008
%%%%    Sato Laboratory, Dept. of Computer Science,
%%%%    Tokyo Institute of Technology

%%  Profile HMMs are a variant of HMMs that have three types of states,
%%  i.e. `match state',`insert state' and `delete state.' Match states
%%  constitute an HMM that outputs a `true' string. Insertion states
%%  emit a symbol additionally to the `true' string whereas delete (skip)
%%  states emit no symbol.
%%
%%  Profile HMMs are used to align amino-acid sequences by inserting
%%  and skipping symbols as well as matching symbols.  For example
%%  amino-acid sequences below
%%
%%  HLKIANRKDKHHNKEFGGHHLA
%%  HLKATHRKDQHHNREFGGHHLA
%%  VLKFANRKSKHHNKEMGAHHLA
%%  ...
%% 
%%  are aligned by the profile HMM program in this file as follows.
%%
%%  -HLKIA-NRKDK-H-H----NKEFGGHH-LA
%%  -HLK-A-T-HRK-DQHHN--R-EFGGHH-LA
%%  -VLKFA-NRKSK-H-H----NKEMGAHH-LA
%%  ...

%%-------------------------------------
%%  Quick start : sample session, align the sample data in phmm.dat.
%%
%%  To run on an interactive session:
%%  ?- prism(phmm),go.         (ML/MAP)
%%  ?- prism(phmm),go_vb.      (variational Bayes)
%%
%%  To perform a batch execution:
%%  > upprism phmm

go :-
   read_goals(Gs,'phmm.dat'),   % Read the sequence data from phmm.dat.
   learn(Gs),                   % Learn parameters from the data.
   wmag(Gs).                    % Compute viterbi paths using the learned
                                % parameters and aligns sequences in Gs.

% To enable variational Bayes, we need some additional flag settings:
go_vb :-
   set_prism_flag(learn_mode,both),
   set_prism_flag(viterbi_mode,hparams),
   set_prism_flag(reset_hparams,on),
   go.

prism_main :- go.
%prism_main :- go_vb.


%%%--------------------- model ---------------------

observe(Sequence) :- hmm(Sequence,start).

hmm([],end).
hmm(Sequence,State) :-
   State \== end,
   msw(move_from(State),NextState),
   msw(emit_at(State), Symbol),
   ( Symbol = epsilon ->
       hmm( Sequence, NextState )
   ; Sequence = [Symbol|TailSeq],
       hmm( TailSeq , NextState )
   ).

amino_acids(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R',
             'S','T','V','W','X','Y']).
hmm_len(17).

%%%--------------------- values ---------------------

values(move_from(State),Values) :-
   hmm_len(Len),
   get_index(State,X),
   ( 0 =< X, X < Len ->
       Y is X + 1,
       Values = [insert(X),match(Y),delete(Y)]
   ; Values = [insert(X),end] ).

values(emit_at(State),Vs) :-
   ((State = insert(_) ; State = match(_)) ->
       amino_acids(Vs)
   ; Vs = [epsilon] ).

%%%--------------------- set_sw ---------------------

:- init_set_sw.

init_set_sw :-
%  tell('/dev/null'),           % Suppress output (on Linux only)
   set_sw( move_from(start) ),
   set_sw( move_from(insert(0)) ),
   set_sw( emit_at(start) ),
   set_sw( emit_at(insert(0)) ),
   hmm_len(Len),
%  told,
   init_set_sw(Len).

init_set_sw(0).
init_set_sw(X) :-
   X > 0,
   set_sw( move_from(insert(X)) ),
   set_sw( move_from(match(X)) ),
   set_sw( move_from(delete(X)) ),
   set_sw( emit_at(insert(X)) ),
   set_sw( emit_at(match(X)) ),
   set_sw( emit_at(delete(X)) ),
   Y is X - 1,
   init_set_sw(Y).

%%%--------------------- estimation ---------------------
%% most likely path
%% mlpath(['A','E'],Path) => Path = [start,match(1),end]

mlpath(Sequence,Path):-
   mlpath(Sequence,Path,_).
mlpath(Sequence,Path,Prob):-
   viterbif(hmm(Sequence,start),Prob,Nodes),
   nodes2path(Nodes,Path).

nodes2path([Node|Nodes],[State|Path]):-
   Node = node(hmm(_,State),_),
   nodes2path(Nodes,Path).
nodes2path([],[]).

mlpaths([Seq|Seqs],[Path|Paths], X):- 
   mlpath(Seq,Path),
X= [P|_], writeln(P),
stop_low_level_trace,
   mlpaths(Seqs,Paths, X).
mlpaths([],[],_).

%%%--------------------- alignment ---------------------

wmag(Gs):-
   seqs2goals(S,Gs),wma(S).
wma(Seqs):-
   write_multiple_alignments(Seqs).
write_multiple_alignments(Seqs):-
   nl,
   write('search Viterbi paths...'),nl,
   mlpaths(Seqs,Paths,Paths),
   write('done.'),
   nl,
   write('------------ALIGNMENTS------------'),
   nl,
   write_multiple_alignments( Seqs, Paths ),
   write('----------------------------------'),
   nl.

make_max_length_list([Path|Paths],MaxLenList) :-
   make_max_length_list(Paths, TmpLenList),
   make_length_list(Path,LenList),
   marge_len_list(LenList,TmpLenList,MaxLenList).
make_max_length_list([Path],MaxLenList) :- 
   !,make_length_list(Path,MaxLenList).

marge_len_list([H1|T1],[H2|T2],[MargedH|MargedT]) :-
   max(MargedH,[H1,H2]),
   marge_len_list(T1,T2,MargedT).
marge_len_list([],[],[]).

%% make_length_list([start,insert(0),match(1),end],LenList)
%% -> LenList = [2,1]
%% make_length_list([start,delete(1),insert(1),insert(1),end],LenList)
%% -> LenList = [1,1]

make_length_list(Path,[Len|LenList]) :-
   count_emission(Path,Len,NextIndexPath),
   make_length_list(NextIndexPath,LenList).
make_length_list([end],[]).

count_emission(Path,Count,NextIndexPath) :-
   Path = [State|_],
   get_index(State,Index),
   count_emission2(Path,Count,Index,NextIndexPath).

%% count_emission2([start,insert(0),match(1),end],Count,0,NextIndexPath)
%% -> Count = 2, NextIndexPath = [match(1),end]
%% count_emission2([delete(1),insert(1),insert(1),end],Count,1,NextIndexPath)
%% -> Count = 2, NextIndexPath = [end]

count_emission2([State|Path],Count,Index,NextIndexPath) :-
   ( get_index(State,Index) ->
       count_emission2( Path, Count2, Index, NextIndexPath ),
       ( (State = delete(_); State==start) ->
           Count = Count2
       ; Count is Count2 + 1 )
   ; Count = 0,
       NextIndexPath = [State|Path]
   ).

write_multiple_alignments(Seqs,Paths) :-
   make_max_length_list(Paths,LenList),
   write_multiple_alignments(Seqs,Paths,LenList).
write_multiple_alignments([Seq|Seqs],[Path|Paths],LenList) :-
   write_alignment(Seq,Path,LenList),
   write_multiple_alignments(Seqs,Paths,LenList).
write_multiple_alignments([],[],_).

write_alignment(Seq,Path,LenList) :- 
   write_alignment(Seq,Path,LenList,0).

write_alignment([],[end],[],_):- !,nl.
write_alignment(Seq,[State|Path],LenList,Index) :-
   get_index(State,Index),!,
   ( (State = delete(_) ; State == start) ->
       write_alignment( Seq, Path, LenList, Index )
   ; Seq = [Symbol|Seq2],
       LenList = [Len|LenList2],
       write(Symbol),
       Len2 is Len - 1,
       write_alignment(Seq2,Path,[Len2|LenList2],Index)
   ).
write_alignment(Seq,[State|Path],LenList,Index) :-
   LenList = [Len|LenList2],
   Index2 is Index + 1,
   pad(Len),
   write_alignment(Seq,[State|Path],LenList2,Index2).

pad(Len) :-
   Len > 0,
   write('-'),
   Len2 is Len - 1,!,
   pad(Len2).
pad(0).

%%%--------------------- utility ---------------------

get_index(State,Index) :-
   (State=match(_),!,State=match(Index));
   (State=insert(_),!,State=insert(Index));
   (State=delete(_),!,State=delete(Index));
   (State=start,!,Index=0);
   (State=end,!,hmm_len(X),Index is X+1).

seqs2goals([Seq|Seqs],[Goal|Goals]) :-
   Goal = observe(Seq),
   seqs2goals(Seqs,Goals).
seqs2goals([],[]).

max(Max,[Head|Tail]) :-
   max(Tmp,Tail),!,
   ( Tmp > Head -> Max = Tmp ; Max = Head ).
max(Max,[Max]).

read_goals(Goals,FileName) :-
   see(FileName),
   read_goals(Goals),
   seen.
read_goals(Goals) :-
   read(Term),
   ( Term = end_of_file ->
       Goals = []
   ; Goals = [Term|Goals1],
       read_goals(Goals1)
   ).
prism logical probabilistic system. 2011-11-10 12:24:47 +00:00			`%%%%`
			`%%%% Profile HMM --- phmm.psm`
			`%%%%`
			`%%%% Copyright (C) 2004,2006,2007,2008`
			`%%%% Sato Laboratory, Dept. of Computer Science,`
			`%%%% Tokyo Institute of Technology`

			`%% Profile HMMs are a variant of HMMs that have three types of states,`
			%% i.e. `match state',`insert state' and `delete state.' Match states
			%% constitute an HMM that outputs a `true' string. Insertion states
			%% emit a symbol additionally to the `true' string whereas delete (skip)
			`%% states emit no symbol.`
			`%%`
			`%% Profile HMMs are used to align amino-acid sequences by inserting`
			`%% and skipping symbols as well as matching symbols. For example`
			`%% amino-acid sequences below`
			`%%`
			`%% HLKIANRKDKHHNKEFGGHHLA`
			`%% HLKATHRKDQHHNREFGGHHLA`
			`%% VLKFANRKSKHHNKEMGAHHLA`
			`%% ...`
			`%%`
			`%% are aligned by the profile HMM program in this file as follows.`
			`%%`
			`%% -HLKIA-NRKDK-H-H----NKEFGGHH-LA`
			`%% -HLK-A-T-HRK-DQHHN--R-EFGGHH-LA`
			`%% -VLKFA-NRKSK-H-H----NKEMGAHH-LA`
			`%% ...`

			`%%-------------------------------------`
			`%% Quick start : sample session, align the sample data in phmm.dat.`
			`%%`
			`%% To run on an interactive session:`
			`%% ?- prism(phmm),go. (ML/MAP)`
			`%% ?- prism(phmm),go_vb. (variational Bayes)`
			`%%`
			`%% To perform a batch execution:`
			`%% > upprism phmm`

			`go :-`
			`read_goals(Gs,'phmm.dat'), % Read the sequence data from phmm.dat.`
			`learn(Gs), % Learn parameters from the data.`
			`wmag(Gs). % Compute viterbi paths using the learned`
			`% parameters and aligns sequences in Gs.`

			`% To enable variational Bayes, we need some additional flag settings:`
			`go_vb :-`
			`set_prism_flag(learn_mode,both),`
			`set_prism_flag(viterbi_mode,hparams),`
			`set_prism_flag(reset_hparams,on),`
			`go.`

			`prism_main :- go.`
			`%prism_main :- go_vb.`


			`%%%--------------------- model ---------------------`

			`observe(Sequence) :- hmm(Sequence,start).`

			`hmm([],end).`
			`hmm(Sequence,State) :-`
			`State \== end,`
			`msw(move_from(State),NextState),`
			`msw(emit_at(State), Symbol),`
			`( Symbol = epsilon ->`
			`hmm( Sequence, NextState )`
			`; Sequence = [Symbol\|TailSeq],`
			`hmm( TailSeq , NextState )`
			`).`

			`amino_acids(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R',`
			`'S','T','V','W','X','Y']).`
			`hmm_len(17).`

			`%%%--------------------- values ---------------------`

			`values(move_from(State),Values) :-`
			`hmm_len(Len),`
			`get_index(State,X),`
			`( 0 =< X, X < Len ->`
			`Y is X + 1,`
			`Values = [insert(X),match(Y),delete(Y)]`
			`; Values = [insert(X),end] ).`

			`values(emit_at(State),Vs) :-`
			`((State = insert(_) ; State = match(_)) ->`
			`amino_acids(Vs)`
			`; Vs = [epsilon] ).`

			`%%%--------------------- set_sw ---------------------`

			`:- init_set_sw.`

			`init_set_sw :-`
			`% tell('/dev/null'), % Suppress output (on Linux only)`
			`set_sw( move_from(start) ),`
			`set_sw( move_from(insert(0)) ),`
			`set_sw( emit_at(start) ),`
			`set_sw( emit_at(insert(0)) ),`
			`hmm_len(Len),`
			`% told,`
			`init_set_sw(Len).`

			`init_set_sw(0).`
			`init_set_sw(X) :-`
			`X > 0,`
			`set_sw( move_from(insert(X)) ),`
			`set_sw( move_from(match(X)) ),`
			`set_sw( move_from(delete(X)) ),`
			`set_sw( emit_at(insert(X)) ),`
			`set_sw( emit_at(match(X)) ),`
			`set_sw( emit_at(delete(X)) ),`
			`Y is X - 1,`
			`init_set_sw(Y).`

			`%%%--------------------- estimation ---------------------`
			`%% most likely path`
			`%% mlpath(['A','E'],Path) => Path = [start,match(1),end]`

			`mlpath(Sequence,Path):-`
			`mlpath(Sequence,Path,_).`
			`mlpath(Sequence,Path,Prob):-`
			`viterbif(hmm(Sequence,start),Prob,Nodes),`
			`nodes2path(Nodes,Path).`

			`nodes2path([Node\|Nodes],[State\|Path]):-`
			`Node = node(hmm(_,State),_),`
			`nodes2path(Nodes,Path).`
			`nodes2path([],[]).`

			`mlpaths([Seq\|Seqs],[Path\|Paths], X):-`
			`mlpath(Seq,Path),`
			`X= [P\|_], writeln(P),`
			`stop_low_level_trace,`
			`mlpaths(Seqs,Paths, X).`
			`mlpaths([],[],_).`

			`%%%--------------------- alignment ---------------------`

			`wmag(Gs):-`
			`seqs2goals(S,Gs),wma(S).`
			`wma(Seqs):-`
			`write_multiple_alignments(Seqs).`
			`write_multiple_alignments(Seqs):-`
			`nl,`
			`write('search Viterbi paths...'),nl,`
			`mlpaths(Seqs,Paths,Paths),`
			`write('done.'),`
			`nl,`
			`write('------------ALIGNMENTS------------'),`
			`nl,`
			`write_multiple_alignments( Seqs, Paths ),`
			`write('----------------------------------'),`
			`nl.`

			`make_max_length_list([Path\|Paths],MaxLenList) :-`
			`make_max_length_list(Paths, TmpLenList),`
			`make_length_list(Path,LenList),`
			`marge_len_list(LenList,TmpLenList,MaxLenList).`
			`make_max_length_list([Path],MaxLenList) :-`
			`!,make_length_list(Path,MaxLenList).`

			`marge_len_list([H1\|T1],[H2\|T2],[MargedH\|MargedT]) :-`
			`max(MargedH,[H1,H2]),`
			`marge_len_list(T1,T2,MargedT).`
			`marge_len_list([],[],[]).`

			`%% make_length_list([start,insert(0),match(1),end],LenList)`
			`%% -> LenList = [2,1]`
			`%% make_length_list([start,delete(1),insert(1),insert(1),end],LenList)`
			`%% -> LenList = [1,1]`

			`make_length_list(Path,[Len\|LenList]) :-`
			`count_emission(Path,Len,NextIndexPath),`
			`make_length_list(NextIndexPath,LenList).`
			`make_length_list([end],[]).`

			`count_emission(Path,Count,NextIndexPath) :-`
			`Path = [State\|_],`
			`get_index(State,Index),`
			`count_emission2(Path,Count,Index,NextIndexPath).`

			`%% count_emission2([start,insert(0),match(1),end],Count,0,NextIndexPath)`
			`%% -> Count = 2, NextIndexPath = [match(1),end]`
			`%% count_emission2([delete(1),insert(1),insert(1),end],Count,1,NextIndexPath)`
			`%% -> Count = 2, NextIndexPath = [end]`

			`count_emission2([State\|Path],Count,Index,NextIndexPath) :-`
			`( get_index(State,Index) ->`
			`count_emission2( Path, Count2, Index, NextIndexPath ),`
			`( (State = delete(_); State==start) ->`
			`Count = Count2`
			`; Count is Count2 + 1 )`
			`; Count = 0,`
			`NextIndexPath = [State\|Path]`
			`).`

			`write_multiple_alignments(Seqs,Paths) :-`
			`make_max_length_list(Paths,LenList),`
			`write_multiple_alignments(Seqs,Paths,LenList).`
			`write_multiple_alignments([Seq\|Seqs],[Path\|Paths],LenList) :-`
			`write_alignment(Seq,Path,LenList),`
			`write_multiple_alignments(Seqs,Paths,LenList).`
			`write_multiple_alignments([],[],_).`

			`write_alignment(Seq,Path,LenList) :-`
			`write_alignment(Seq,Path,LenList,0).`

			`write_alignment([],[end],[],_):- !,nl.`
			`write_alignment(Seq,[State\|Path],LenList,Index) :-`
			`get_index(State,Index),!,`
			`( (State = delete(_) ; State == start) ->`
			`write_alignment( Seq, Path, LenList, Index )`
			`; Seq = [Symbol\|Seq2],`
			`LenList = [Len\|LenList2],`
			`write(Symbol),`
			`Len2 is Len - 1,`
			`write_alignment(Seq2,Path,[Len2\|LenList2],Index)`
			`).`
			`write_alignment(Seq,[State\|Path],LenList,Index) :-`
			`LenList = [Len\|LenList2],`
			`Index2 is Index + 1,`
			`pad(Len),`
			`write_alignment(Seq,[State\|Path],LenList2,Index2).`

			`pad(Len) :-`
			`Len > 0,`
			`write('-'),`
			`Len2 is Len - 1,!,`
			`pad(Len2).`
			`pad(0).`

			`%%%--------------------- utility ---------------------`

			`get_index(State,Index) :-`
			`(State=match(_),!,State=match(Index));`
			`(State=insert(_),!,State=insert(Index));`
			`(State=delete(_),!,State=delete(Index));`
			`(State=start,!,Index=0);`
			`(State=end,!,hmm_len(X),Index is X+1).`

			`seqs2goals([Seq\|Seqs],[Goal\|Goals]) :-`
			`Goal = observe(Seq),`
			`seqs2goals(Seqs,Goals).`
			`seqs2goals([],[]).`

			`max(Max,[Head\|Tail]) :-`
			`max(Tmp,Tail),!,`
			`( Tmp > Head -> Max = Tmp ; Max = Head ).`
			`max(Max,[Max]).`

			`read_goals(Goals,FileName) :-`
			`see(FileName),`
			`read_goals(Goals),`
			`seen.`
			`read_goals(Goals) :-`
			`read(Term),`
			`( Term = end_of_file ->`
			`Goals = []`
			`; Goals = [Term\|Goals1],`
			`read_goals(Goals1)`
			`).`