%%%% %%%% Profile HMM --- phmm.psm %%%% %%%% Copyright (C) 2004,2006,2007,2008 %%%% Sato Laboratory, Dept. of Computer Science, %%%% Tokyo Institute of Technology %% Profile HMMs are a variant of HMMs that have three types of states, %% i.e. `match state',`insert state' and `delete state.' Match states %% constitute an HMM that outputs a `true' string. Insertion states %% emit a symbol additionally to the `true' string whereas delete (skip) %% states emit no symbol. %% %% Profile HMMs are used to align amino-acid sequences by inserting %% and skipping symbols as well as matching symbols. For example %% amino-acid sequences below %% %% HLKIANRKDKHHNKEFGGHHLA %% HLKATHRKDQHHNREFGGHHLA %% VLKFANRKSKHHNKEMGAHHLA %% ... %% %% are aligned by the profile HMM program in this file as follows. %% %% -HLKIA-NRKDK-H-H----NKEFGGHH-LA %% -HLK-A-T-HRK-DQHHN--R-EFGGHH-LA %% -VLKFA-NRKSK-H-H----NKEMGAHH-LA %% ... %%------------------------------------- %% Quick start : sample session, align the sample data in phmm.dat. %% %% To run on an interactive session: %% ?- prism(phmm),go. (ML/MAP) %% ?- prism(phmm),go_vb. (variational Bayes) %% %% To perform a batch execution: %% > upprism phmm go :- read_goals(Gs,'phmm.dat'), % Read the sequence data from phmm.dat. learn(Gs), % Learn parameters from the data. wmag(Gs). % Compute viterbi paths using the learned % parameters and aligns sequences in Gs. % To enable variational Bayes, we need some additional flag settings: go_vb :- set_prism_flag(learn_mode,both), set_prism_flag(viterbi_mode,hparams), set_prism_flag(reset_hparams,on), go. prism_main :- go. %prism_main :- go_vb. %%%--------------------- model --------------------- observe(Sequence) :- hmm(Sequence,start). hmm([],end). hmm(Sequence,State) :- State \== end, msw(move_from(State),NextState), msw(emit_at(State), Symbol), ( Symbol = epsilon -> hmm( Sequence, NextState ) ; Sequence = [Symbol|TailSeq], hmm( TailSeq , NextState ) ). amino_acids(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R', 'S','T','V','W','X','Y']). hmm_len(17). %%%--------------------- values --------------------- values(move_from(State),Values) :- hmm_len(Len), get_index(State,X), ( 0 =< X, X < Len -> Y is X + 1, Values = [insert(X),match(Y),delete(Y)] ; Values = [insert(X),end] ). values(emit_at(State),Vs) :- ((State = insert(_) ; State = match(_)) -> amino_acids(Vs) ; Vs = [epsilon] ). %%%--------------------- set_sw --------------------- :- init_set_sw. init_set_sw :- % tell('/dev/null'), % Suppress output (on Linux only) set_sw( move_from(start) ), set_sw( move_from(insert(0)) ), set_sw( emit_at(start) ), set_sw( emit_at(insert(0)) ), hmm_len(Len), % told, init_set_sw(Len). init_set_sw(0). init_set_sw(X) :- X > 0, set_sw( move_from(insert(X)) ), set_sw( move_from(match(X)) ), set_sw( move_from(delete(X)) ), set_sw( emit_at(insert(X)) ), set_sw( emit_at(match(X)) ), set_sw( emit_at(delete(X)) ), Y is X - 1, init_set_sw(Y). %%%--------------------- estimation --------------------- %% most likely path %% mlpath(['A','E'],Path) => Path = [start,match(1),end] mlpath(Sequence,Path):- mlpath(Sequence,Path,_). mlpath(Sequence,Path,Prob):- viterbif(hmm(Sequence,start),Prob,Nodes), nodes2path(Nodes,Path). nodes2path([Node|Nodes],[State|Path]):- Node = node(hmm(_,State),_), nodes2path(Nodes,Path). nodes2path([],[]). mlpaths([Seq|Seqs],[Path|Paths], X):- mlpath(Seq,Path), X= [P|_], writeln(P), stop_low_level_trace, mlpaths(Seqs,Paths, X). mlpaths([],[],_). %%%--------------------- alignment --------------------- wmag(Gs):- seqs2goals(S,Gs),wma(S). wma(Seqs):- write_multiple_alignments(Seqs). write_multiple_alignments(Seqs):- nl, write('search Viterbi paths...'),nl, mlpaths(Seqs,Paths,Paths), write('done.'), nl, write('------------ALIGNMENTS------------'), nl, write_multiple_alignments( Seqs, Paths ), write('----------------------------------'), nl. make_max_length_list([Path|Paths],MaxLenList) :- make_max_length_list(Paths, TmpLenList), make_length_list(Path,LenList), marge_len_list(LenList,TmpLenList,MaxLenList). make_max_length_list([Path],MaxLenList) :- !,make_length_list(Path,MaxLenList). marge_len_list([H1|T1],[H2|T2],[MargedH|MargedT]) :- max(MargedH,[H1,H2]), marge_len_list(T1,T2,MargedT). marge_len_list([],[],[]). %% make_length_list([start,insert(0),match(1),end],LenList) %% -> LenList = [2,1] %% make_length_list([start,delete(1),insert(1),insert(1),end],LenList) %% -> LenList = [1,1] make_length_list(Path,[Len|LenList]) :- count_emission(Path,Len,NextIndexPath), make_length_list(NextIndexPath,LenList). make_length_list([end],[]). count_emission(Path,Count,NextIndexPath) :- Path = [State|_], get_index(State,Index), count_emission2(Path,Count,Index,NextIndexPath). %% count_emission2([start,insert(0),match(1),end],Count,0,NextIndexPath) %% -> Count = 2, NextIndexPath = [match(1),end] %% count_emission2([delete(1),insert(1),insert(1),end],Count,1,NextIndexPath) %% -> Count = 2, NextIndexPath = [end] count_emission2([State|Path],Count,Index,NextIndexPath) :- ( get_index(State,Index) -> count_emission2( Path, Count2, Index, NextIndexPath ), ( (State = delete(_); State==start) -> Count = Count2 ; Count is Count2 + 1 ) ; Count = 0, NextIndexPath = [State|Path] ). write_multiple_alignments(Seqs,Paths) :- make_max_length_list(Paths,LenList), write_multiple_alignments(Seqs,Paths,LenList). write_multiple_alignments([Seq|Seqs],[Path|Paths],LenList) :- write_alignment(Seq,Path,LenList), write_multiple_alignments(Seqs,Paths,LenList). write_multiple_alignments([],[],_). write_alignment(Seq,Path,LenList) :- write_alignment(Seq,Path,LenList,0). write_alignment([],[end],[],_):- !,nl. write_alignment(Seq,[State|Path],LenList,Index) :- get_index(State,Index),!, ( (State = delete(_) ; State == start) -> write_alignment( Seq, Path, LenList, Index ) ; Seq = [Symbol|Seq2], LenList = [Len|LenList2], write(Symbol), Len2 is Len - 1, write_alignment(Seq2,Path,[Len2|LenList2],Index) ). write_alignment(Seq,[State|Path],LenList,Index) :- LenList = [Len|LenList2], Index2 is Index + 1, pad(Len), write_alignment(Seq,[State|Path],LenList2,Index2). pad(Len) :- Len > 0, write('-'), Len2 is Len - 1,!, pad(Len2). pad(0). %%%--------------------- utility --------------------- get_index(State,Index) :- (State=match(_),!,State=match(Index)); (State=insert(_),!,State=insert(Index)); (State=delete(_),!,State=delete(Index)); (State=start,!,Index=0); (State=end,!,hmm_len(X),Index is X+1). seqs2goals([Seq|Seqs],[Goal|Goals]) :- Goal = observe(Seq), seqs2goals(Seqs,Goals). seqs2goals([],[]). max(Max,[Head|Tail]) :- max(Tmp,Tail),!, ( Tmp > Head -> Max = Tmp ; Max = Head ). max(Max,[Max]). read_goals(Goals,FileName) :- see(FileName), read_goals(Goals), seen. read_goals(Goals) :- read(Term), ( Term = end_of_file -> Goals = [] ; Goals = [Term|Goals1], read_goals(Goals1) ).