264 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			264 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | %%%% | ||
|  | %%%%  Profile HMM --- phmm.psm | ||
|  | %%%%       | ||
|  | %%%%  Copyright (C) 2004,2006,2007,2008 | ||
|  | %%%%    Sato Laboratory, Dept. of Computer Science, | ||
|  | %%%%    Tokyo Institute of Technology | ||
|  | 
 | ||
|  | %%  Profile HMMs are a variant of HMMs that have three types of states, | ||
|  | %%  i.e. `match state',`insert state' and `delete state.' Match states | ||
|  | %%  constitute an HMM that outputs a `true' string. Insertion states | ||
|  | %%  emit a symbol additionally to the `true' string whereas delete (skip) | ||
|  | %%  states emit no symbol. | ||
|  | %% | ||
|  | %%  Profile HMMs are used to align amino-acid sequences by inserting | ||
|  | %%  and skipping symbols as well as matching symbols.  For example | ||
|  | %%  amino-acid sequences below | ||
|  | %% | ||
|  | %%  HLKIANRKDKHHNKEFGGHHLA | ||
|  | %%  HLKATHRKDQHHNREFGGHHLA | ||
|  | %%  VLKFANRKSKHHNKEMGAHHLA | ||
|  | %%  ... | ||
|  | %%  | ||
|  | %%  are aligned by the profile HMM program in this file as follows. | ||
|  | %% | ||
|  | %%  -HLKIA-NRKDK-H-H----NKEFGGHH-LA | ||
|  | %%  -HLK-A-T-HRK-DQHHN--R-EFGGHH-LA | ||
|  | %%  -VLKFA-NRKSK-H-H----NKEMGAHH-LA | ||
|  | %%  ... | ||
|  | 
 | ||
|  | %%------------------------------------- | ||
|  | %%  Quick start : sample session, align the sample data in phmm.dat. | ||
|  | %% | ||
|  | %%  To run on an interactive session: | ||
|  | %%  ?- prism(phmm),go.         (ML/MAP) | ||
|  | %%  ?- prism(phmm),go_vb.      (variational Bayes) | ||
|  | %% | ||
|  | %%  To perform a batch execution: | ||
|  | %%  > upprism phmm | ||
|  | 
 | ||
|  | go :- | ||
|  |    read_goals(Gs,'phmm.dat'),   % Read the sequence data from phmm.dat. | ||
|  |    learn(Gs),                   % Learn parameters from the data. | ||
|  |    wmag(Gs).                    % Compute viterbi paths using the learned | ||
|  |                                 % parameters and aligns sequences in Gs. | ||
|  | 
 | ||
|  | % To enable variational Bayes, we need some additional flag settings: | ||
|  | go_vb :- | ||
|  |    set_prism_flag(learn_mode,both), | ||
|  |    set_prism_flag(viterbi_mode,hparams), | ||
|  |    set_prism_flag(reset_hparams,on), | ||
|  |    go. | ||
|  | 
 | ||
|  | prism_main :- go. | ||
|  | %prism_main :- go_vb. | ||
|  | 
 | ||
|  | 
 | ||
|  | %%%--------------------- model --------------------- | ||
|  | 
 | ||
|  | observe(Sequence) :- hmm(Sequence,start). | ||
|  | 
 | ||
|  | hmm([],end). | ||
|  | hmm(Sequence,State) :- | ||
|  |    State \== end, | ||
|  |    msw(move_from(State),NextState), | ||
|  |    msw(emit_at(State), Symbol), | ||
|  |    ( Symbol = epsilon -> | ||
|  |        hmm( Sequence, NextState ) | ||
|  |    ; Sequence = [Symbol|TailSeq], | ||
|  |        hmm( TailSeq , NextState ) | ||
|  |    ). | ||
|  | 
 | ||
|  | amino_acids(['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R', | ||
|  |              'S','T','V','W','X','Y']). | ||
|  | hmm_len(17). | ||
|  | 
 | ||
|  | %%%--------------------- values --------------------- | ||
|  | 
 | ||
|  | values(move_from(State),Values) :- | ||
|  |    hmm_len(Len), | ||
|  |    get_index(State,X), | ||
|  |    ( 0 =< X, X < Len -> | ||
|  |        Y is X + 1, | ||
|  |        Values = [insert(X),match(Y),delete(Y)] | ||
|  |    ; Values = [insert(X),end] ). | ||
|  | 
 | ||
|  | values(emit_at(State),Vs) :- | ||
|  |    ((State = insert(_) ; State = match(_)) -> | ||
|  |        amino_acids(Vs) | ||
|  |    ; Vs = [epsilon] ). | ||
|  | 
 | ||
|  | %%%--------------------- set_sw --------------------- | ||
|  | 
 | ||
|  | :- init_set_sw. | ||
|  | 
 | ||
|  | init_set_sw :- | ||
|  | %  tell('/dev/null'),           % Suppress output (on Linux only) | ||
|  |    set_sw( move_from(start) ), | ||
|  |    set_sw( move_from(insert(0)) ), | ||
|  |    set_sw( emit_at(start) ), | ||
|  |    set_sw( emit_at(insert(0)) ), | ||
|  |    hmm_len(Len), | ||
|  | %  told, | ||
|  |    init_set_sw(Len). | ||
|  | 
 | ||
|  | init_set_sw(0). | ||
|  | init_set_sw(X) :- | ||
|  |    X > 0, | ||
|  |    set_sw( move_from(insert(X)) ), | ||
|  |    set_sw( move_from(match(X)) ), | ||
|  |    set_sw( move_from(delete(X)) ), | ||
|  |    set_sw( emit_at(insert(X)) ), | ||
|  |    set_sw( emit_at(match(X)) ), | ||
|  |    set_sw( emit_at(delete(X)) ), | ||
|  |    Y is X - 1, | ||
|  |    init_set_sw(Y). | ||
|  | 
 | ||
|  | %%%--------------------- estimation --------------------- | ||
|  | %% most likely path | ||
|  | %% mlpath(['A','E'],Path) => Path = [start,match(1),end] | ||
|  | 
 | ||
|  | mlpath(Sequence,Path):- | ||
|  |    mlpath(Sequence,Path,_). | ||
|  | mlpath(Sequence,Path,Prob):- | ||
|  |    viterbif(hmm(Sequence,start),Prob,Nodes), | ||
|  |    nodes2path(Nodes,Path). | ||
|  | 
 | ||
|  | nodes2path([Node|Nodes],[State|Path]):- | ||
|  |    Node = node(hmm(_,State),_), | ||
|  |    nodes2path(Nodes,Path). | ||
|  | nodes2path([],[]). | ||
|  | 
 | ||
|  | mlpaths([Seq|Seqs],[Path|Paths], X):-  | ||
|  |    mlpath(Seq,Path), | ||
|  | X= [P|_], writeln(P), | ||
|  | stop_low_level_trace, | ||
|  |    mlpaths(Seqs,Paths, X). | ||
|  | mlpaths([],[],_). | ||
|  | 
 | ||
|  | %%%--------------------- alignment --------------------- | ||
|  | 
 | ||
|  | wmag(Gs):- | ||
|  |    seqs2goals(S,Gs),wma(S). | ||
|  | wma(Seqs):- | ||
|  |    write_multiple_alignments(Seqs). | ||
|  | write_multiple_alignments(Seqs):- | ||
|  |    nl, | ||
|  |    write('search Viterbi paths...'),nl, | ||
|  |    mlpaths(Seqs,Paths,Paths), | ||
|  |    write('done.'), | ||
|  |    nl, | ||
|  |    write('------------ALIGNMENTS------------'), | ||
|  |    nl, | ||
|  |    write_multiple_alignments( Seqs, Paths ), | ||
|  |    write('----------------------------------'), | ||
|  |    nl. | ||
|  | 
 | ||
|  | make_max_length_list([Path|Paths],MaxLenList) :- | ||
|  |    make_max_length_list(Paths, TmpLenList), | ||
|  |    make_length_list(Path,LenList), | ||
|  |    marge_len_list(LenList,TmpLenList,MaxLenList). | ||
|  | make_max_length_list([Path],MaxLenList) :-  | ||
|  |    !,make_length_list(Path,MaxLenList). | ||
|  | 
 | ||
|  | marge_len_list([H1|T1],[H2|T2],[MargedH|MargedT]) :- | ||
|  |    max(MargedH,[H1,H2]), | ||
|  |    marge_len_list(T1,T2,MargedT). | ||
|  | marge_len_list([],[],[]). | ||
|  | 
 | ||
|  | %% make_length_list([start,insert(0),match(1),end],LenList) | ||
|  | %% -> LenList = [2,1] | ||
|  | %% make_length_list([start,delete(1),insert(1),insert(1),end],LenList) | ||
|  | %% -> LenList = [1,1] | ||
|  | 
 | ||
|  | make_length_list(Path,[Len|LenList]) :- | ||
|  |    count_emission(Path,Len,NextIndexPath), | ||
|  |    make_length_list(NextIndexPath,LenList). | ||
|  | make_length_list([end],[]). | ||
|  | 
 | ||
|  | count_emission(Path,Count,NextIndexPath) :- | ||
|  |    Path = [State|_], | ||
|  |    get_index(State,Index), | ||
|  |    count_emission2(Path,Count,Index,NextIndexPath). | ||
|  | 
 | ||
|  | %% count_emission2([start,insert(0),match(1),end],Count,0,NextIndexPath) | ||
|  | %% -> Count = 2, NextIndexPath = [match(1),end] | ||
|  | %% count_emission2([delete(1),insert(1),insert(1),end],Count,1,NextIndexPath) | ||
|  | %% -> Count = 2, NextIndexPath = [end] | ||
|  | 
 | ||
|  | count_emission2([State|Path],Count,Index,NextIndexPath) :- | ||
|  |    ( get_index(State,Index) -> | ||
|  |        count_emission2( Path, Count2, Index, NextIndexPath ), | ||
|  |        ( (State = delete(_); State==start) -> | ||
|  |            Count = Count2 | ||
|  |        ; Count is Count2 + 1 ) | ||
|  |    ; Count = 0, | ||
|  |        NextIndexPath = [State|Path] | ||
|  |    ). | ||
|  | 
 | ||
|  | write_multiple_alignments(Seqs,Paths) :- | ||
|  |    make_max_length_list(Paths,LenList), | ||
|  |    write_multiple_alignments(Seqs,Paths,LenList). | ||
|  | write_multiple_alignments([Seq|Seqs],[Path|Paths],LenList) :- | ||
|  |    write_alignment(Seq,Path,LenList), | ||
|  |    write_multiple_alignments(Seqs,Paths,LenList). | ||
|  | write_multiple_alignments([],[],_). | ||
|  | 
 | ||
|  | write_alignment(Seq,Path,LenList) :-  | ||
|  |    write_alignment(Seq,Path,LenList,0). | ||
|  | 
 | ||
|  | write_alignment([],[end],[],_):- !,nl. | ||
|  | write_alignment(Seq,[State|Path],LenList,Index) :- | ||
|  |    get_index(State,Index),!, | ||
|  |    ( (State = delete(_) ; State == start) -> | ||
|  |        write_alignment( Seq, Path, LenList, Index ) | ||
|  |    ; Seq = [Symbol|Seq2], | ||
|  |        LenList = [Len|LenList2], | ||
|  |        write(Symbol), | ||
|  |        Len2 is Len - 1, | ||
|  |        write_alignment(Seq2,Path,[Len2|LenList2],Index) | ||
|  |    ). | ||
|  | write_alignment(Seq,[State|Path],LenList,Index) :- | ||
|  |    LenList = [Len|LenList2], | ||
|  |    Index2 is Index + 1, | ||
|  |    pad(Len), | ||
|  |    write_alignment(Seq,[State|Path],LenList2,Index2). | ||
|  | 
 | ||
|  | pad(Len) :- | ||
|  |    Len > 0, | ||
|  |    write('-'), | ||
|  |    Len2 is Len - 1,!, | ||
|  |    pad(Len2). | ||
|  | pad(0). | ||
|  | 
 | ||
|  | %%%--------------------- utility --------------------- | ||
|  | 
 | ||
|  | get_index(State,Index) :- | ||
|  |    (State=match(_),!,State=match(Index)); | ||
|  |    (State=insert(_),!,State=insert(Index)); | ||
|  |    (State=delete(_),!,State=delete(Index)); | ||
|  |    (State=start,!,Index=0); | ||
|  |    (State=end,!,hmm_len(X),Index is X+1). | ||
|  | 
 | ||
|  | seqs2goals([Seq|Seqs],[Goal|Goals]) :- | ||
|  |    Goal = observe(Seq), | ||
|  |    seqs2goals(Seqs,Goals). | ||
|  | seqs2goals([],[]). | ||
|  | 
 | ||
|  | max(Max,[Head|Tail]) :- | ||
|  |    max(Tmp,Tail),!, | ||
|  |    ( Tmp > Head -> Max = Tmp ; Max = Head ). | ||
|  | max(Max,[Max]). | ||
|  | 
 | ||
|  | read_goals(Goals,FileName) :- | ||
|  |    see(FileName), | ||
|  |    read_goals(Goals), | ||
|  |    seen. | ||
|  | read_goals(Goals) :- | ||
|  |    read(Term), | ||
|  |    ( Term = end_of_file -> | ||
|  |        Goals = [] | ||
|  |    ; Goals = [Term|Goals1], | ||
|  |        read_goals(Goals1) | ||
|  |    ). |