%
% Convert HMMer model to CLP(BN) program
%
|
|
% main
%
% Entry point. Opens the HMMER model 'hmmer_b.1.18.1.hmm' for reading and
% 'hmmer_b.1.18.1.yap' for writing, then runs the parser.
%
% parse_model/2 reports the end of parsing by throwing done(Info) (at end
% of file, or once the "HMM" section has been read); the ball is caught
% here and passed to stop/3, which generates the program or reports a bad
% model.
%
% If parsing simply fails (e.g. an unrecognised header line), the streams
% are still closed before main fails, so no stream handles are leaked.
main :-
    open('hmmer_b.1.18.1.hmm', read, S),
    open('hmmer_b.1.18.1.yap', write, W),
    % open('globin.hmm', read, S),
    (   catch(parse_model(S, _), done(Info), stop(S, W, Info))
    ->  Outcome = ok
    ;   Outcome = failed
    ),
    % Always release both streams, then propagate success or failure.
    close(S),
    close(W),
    Outcome == ok.
|
|
|
|
% stop(+In, +Out, +Model)
%
% Recovery goal run when parsing finishes with done(Model).
% If Model is fully instantiated, the CLP(BN) program is generated on Out;
% otherwise (or if generation fails) "Bad HMM" is reported on user_error.
% The input stream argument is not used by either clause.
stop(_In, Out, Model) :-
    ground(Model),
    gen_program(Out, Model).
stop(_In, _Out, _Model) :-
    format(user_error,"Bad HMM~n", []).
|
|
|
|
% parse_model(+Stream, ?Info)
%
% Header loop: reads one line from Stream, lets match_field//2 interpret
% it (filling in the matching part of Info), and recurses on the next line.
% The loop has no base case; it ends only when get_line/3 or match_field//2
% throws done(Info), or fails when a line cannot be matched.
parse_model(Stream, Info) :-
    get_line(Stream, Codes, Info),
    % format('~s~n',[Codes]),
    match_field(Info, Stream, Codes, []),
    parse_model(Stream, Info).
|
|
|
|
% get_line(+Stream, -Codes, +Info)
%
% Reads character codes from Stream up to (and not including) the next
% newline (code 10) and returns them as the list Codes.
% On end of file (code -1) it throws done(Info) instead of returning, so
% any partially read line is discarded.
get_line(S, Out, Info) :-
    get0(S, Code),
    (   Code == -1 -> throw(done(Info))
    ;   Code == 10 -> Out = []
    ;   Out = [Code|Rest],
        get_line(S, Rest, Info)
    ).
|
|
|
|
|
|
% match_field(?Info, +Stream)//
%
% Interprets one header line of a HMMER save file. The line is recognised
% by its leading tag; the corresponding component of
%   Info = hmmer(Version,Name,NOfStates,Alph,Special,NULE,Model,MAP)
% is filled in, and fields that carry no information used here are skipped
% with scanner_skip//0.
%
% parse_model/2 calls this with an empty remainder, so every clause must
% consume the whole line. The "ACC" clause therefore skips the accession
% text explicitly, and the optional "DESC" description line is accepted
% and ignored; otherwise such lines would make the whole parse fail.
%
% The "HMM" clause reads the main model directly from Stream and then
% throws done(Info) to terminate parsing.
match_field(hmmer(2,_,_,_,_,_,_,_), _) --> "HMMER", !,
    scanner_skip.       % mandatory field, should be ground
match_field(hmmer(_,Name,_,_,_,_,_,_), _) --> "NAME", !,   % mandatory field
    scanner_skip_blanks,
    get_name(String), { atom_codes(Name, String) }.
match_field(_, _) --> "ACC", !,     % accession id, used to track DB accesses
    scanner_skip.
match_field(_, _) --> "DESC", !,    % free-text description, not used
    scanner_skip.
match_field(hmmer(_,_,NOfStates,_,_,_,_,_), _) --> "LENG", !,   % number of states
    scanner_skip_blanks,
    get_number(NOfStates, 0).
match_field(hmmer(_,_,_,Alph,_,_,_,_), _) --> "ALPH", !,   % aminos or bases
    scanner_skip_blanks,
    check_alphabet(Alph).
match_field(_, _) --> "RF", !, scanner_skip.
match_field(_, _) --> "CS", !, scanner_skip.
match_field(hmmer(_,_,_,_,_,_,_,MAP), _) --> "MAP", !,
    scanner_skip_blanks,
    to_lower(Codes),
    { map_code(Codes, MAP) }.
match_field(_, _) --> "COM", !, scanner_skip.
match_field(_, _) --> "CKSUM", !, scanner_skip.
match_field(_, _) --> "GA", !, scanner_skip.
match_field(_, _) --> "TC", !, scanner_skip.
match_field(_, _) --> "NC", !, scanner_skip.
match_field(_, _) --> "NSEQ", !, scanner_skip.
match_field(_, _) --> "DATE", !, scanner_skip.
% XT: the eight special-state transition scores, in file order.
match_field(hmmer(_,_,_,_,special(NB,NN,EC,EJ,CT,CC,JB,JJ,_,_),_,_,_), _) --> "XT", !,
    scan_transition(NB),
    scan_transition(NN),
    scan_transition(EC),
    scan_transition(EJ),
    scan_transition(CT),
    scan_transition(CC),
    scan_transition(JB),
    scan_transition(JJ).
% NULT: the two null-model transition scores.
match_field(hmmer(_,_,_,_,special(_,_,_,_,_,_,_,_,GG,GF),_,_,_), _) --> "NULT", !,
    scan_transition(GG),
    scan_transition(GF).
% NULE: one null-model emission score per alphabet symbol; Alph must
% already be known (nof_symbols/2 fails otherwise).
match_field(hmmer(_,_,_,Alph,_,NULE,_,_), _) --> "NULE", !,
    { nof_symbols(Alph, N) },
    scan_transitions(N, Transitions),
    { NULE =.. [null|Transitions] }.
match_field(_, _) --> "EVD", !,
    scanner_skip.       % optional, but should do later.
% HMM: start of the main model. The rest of this line is ignored, the next
% line is read and discarded, and then the B->D line and all states are
% read from Stream.
match_field(Info, S) --> "HMM", !,
    scanner_skip,
    {
        get_line(S, _, Info),
        Info = hmmer(_,_,NOfStates,Alph,_,_,model(BD,NBD,Transitions),MAP),
        nof_symbols(Alph, N),
        scan_model(S, NOfStates, N, BD, NBD, Transitions, MAP, Info),
        throw(done(Info))
    }.
|
|
|
|
% scan_model(+Stream, +NOfStates, +NSymbols, -BD, -NBD, -States, ?MAP, +Info)
%
% Reads the body of the model: first the line parsed by scan_bd//2 (giving
% BD and NBD), then NOfStates state records via scan_states/6.
% Info is only threaded through so get_line/3 can throw done(Info) on EOF.
scan_model(Stream, NOfStates, NSymbols, BD, NBD, States, MAP, Info) :-
    get_line(Stream, BDLine, Info),
    scan_bd(BD, NBD, BDLine, []),
    scan_states(NOfStates, NSymbols, Stream, MAP, States, Info).
|
|
|
|
% scan_states(+Remaining, +NSymbols, +Stream, ?MAP, -States, +Info)
%
% Reads Remaining state records from Stream, producing a list of
% t(MatchEmissions, InsertEmissions, Transitions) terms. The MAP
% accumulator is threaded through every state and closed with close_map/1
% once the counter reaches 0.
scan_states(0, _, _, MAP, [], _) :- !,
    close_map(MAP).
scan_states(Remaining, NSymbols, Stream, MAP0, [t(Match,Insert,Trans)|States], Info) :-
    Left is Remaining-1,
    scan_state(Stream, Match, Insert, MAP0, Trans, NSymbols, MAP1, Info),
    scan_states(Left, NSymbols, Stream, MAP1, States, Info).
|
|
|
|
% scan_state(+Stream, -Match, -Insert, ?MAP0, -Trans, +NSymbols, ?MAP, +Info)
%
% Reads the three lines that describe one state and parses each of them
% after dropping its leading field:
%   line 1: NSymbols match emission scores, followed by whatever
%           scan_map//2 expects (a map index when MAP0 is yes(_), nothing
%           when it is no);
%   line 2: NSymbols insert emission scores;
%   line 3: the nine state transition scores, returned as s(...).
% All three lines are read before any of them is parsed.
scan_state(Stream, Match, Insert, MAP0, s(MM,MI,MD,IM,II,DM,DD,BM,ME), NSymbols, MAP, Info) :-
    get_line(Stream, MatchLine0, Info),
    get_line(Stream, InsertLine0, Info),
    get_line(Stream, TransLine0, Info),
    % format('~s~n~s~n~s~n',[MatchLine0,InsertLine0,TransLine0]),
    scanner_skip_field(MatchLine0, MatchLine),
    scan_transitions(NSymbols, Match, MatchLine, MapCodes),
    scan_map(MAP0, MAP, MapCodes, []),
    scanner_skip_field(InsertLine0, InsertLine),
    scan_transitions(NSymbols, Insert, InsertLine, []),
    scanner_skip_field(TransLine0, TransLine),
    scan_transitions(MM, MI, MD, IM, II, DM, DD, BM, ME, TransLine, []).
|
|
|
|
% scan_transitions(-MM,-MI,-MD,-IM,-II,-DM,-DD,-BM,-ME)//
%
% Parses the nine transition scores of a state line, in that order.
% Implemented on top of the counted list version scan_transitions//2,
% which applies scan_transition//1 once per element, left to right.
scan_transitions(MM, MI, MD, IM, II, DM, DD, BM, ME) -->
    scan_transitions(9, [MM,MI,MD,IM,II,DM,DD,BM,ME]).
|
|
|
|
|
|
% scan_transitions(+Count, -Scores)//
%
% Parses exactly Count scores with scan_transition//1 and returns them as
% a list, in the order they appear in the input.
scan_transitions(0, []) --> !.
scan_transitions(Count, [Score|Scores]) -->
    { Left is Count-1 },
    scan_transition(Score),
    scan_transitions(Left, Scores).
|
|
|
|
% scan_transition(-Score)//
%
% Skips leading blanks and then parses a single score token
% (see get_transition//1 for the accepted forms).
scan_transition(Score) -->
    scanner_skip_blanks,
    get_transition(Score).
|
|
|
|
% get_transition(-Score)//
%
% Parses one score token:
%   "*"       -> the atom '*' (trailing blanks are skipped as well);
%   digits    -> a non-negative integer;
%   "-"digits -> the negated integer.
% Numbers are accumulated by get_number//2, which also consumes the single
% character that terminates the digit run, if there is one.
get_transition('*') --> "*", !,
    scanner_skip_blanks.
get_transition(Score) --> [Digit],
    { Digit >= 0'0, Digit =< 0'9 }, !,
    { Acc is Digit-0'0 },
    get_number(Score, Acc).
get_transition(Score) --> "-",
    get_number(Magnitude, 0),
    { Score is -Magnitude }.
|
|
|
|
% get_number(-Value, +Acc)//
%
% Accumulates decimal digits onto Acc (Acc*10 + digit for each one) and
% returns the result in Value. Parsing stops at the first non-digit
% character, which is consumed, or at the end of the input.
get_number(Value, Acc) --> [Code], !,
    (   { Code < 0'0 ; Code > 0'9 }
    ->  { Value = Acc }
    ;   { Acc1 is Acc*10+(Code-0'0) },
        get_number(Value, Acc1)
    ).
get_number(Value, Value) --> [].
|
|
|
|
% scanner_skip_blanks//
%
% Consumes any run of leading blanks (spaces and horizontal tabs) and
% leaves the rest of the input untouched. Succeeds without consuming
% anything when the input does not start with a blank.
%
% The tab is written as its character code so that the clause cannot be
% silently turned into a duplicate of the space clause by whitespace
% conversion.
scanner_skip_blanks --> " ", !, scanner_skip_blanks.
scanner_skip_blanks --> [9], !, scanner_skip_blanks.    % horizontal tab
scanner_skip_blanks --> [].
|
|
|
|
|
|
% scanner_skip_field//
%
% Skips one blank-delimited score token ('*' or an optionally negative
% integer) and discards its value.
scanner_skip_field -->
    scanner_skip_blanks,
    get_transition(_).
|
|
|
|
% scan_bd(-BD, -NBD)//
%
% Parses a line of three score tokens: the first is returned as BD, the
% second is skipped, and the third is returned as NBD.
scan_bd(First, Third) -->
    scan_transition(First),
    scan_transition(_),
    scan_transition(Third).
|
|
|
|
% check_alphabet(-Alph)//
%
% Consumes the rest of the line, lower-cases it, and maps it to an
% alphabet atom through get_alph/2. Fails if the text is not one of the
% names get_alph/2 knows.
check_alphabet(Alph) -->
    to_lower(Name),
    { get_alph(Name, Alph) }.
|
|
|
|
% to_lower(-Lower)//
%
% Consumes all remaining input and returns it with the ASCII letters A-Z
% replaced by a-z; every other code is copied unchanged.
to_lower([Out|Rest]) --> [Code], !,
    {   ( Code < 0'A ; Code > 0'Z )
    ->  Out = Code
    ;   Out is Code+(0'a-0'A)
    },
    to_lower(Rest).
to_lower([]) --> [].
|
|
|
|
% get_alph(+LowerCaseName, -Alph)
%
% Table of recognised alphabet names (already lower-cased) and the atoms
% used for them in the hmmer/8 term.
get_alph("amino",   amino).
get_alph("nucleic", nucleic).
|
|
|
|
% map_code(+LowerCaseFlag, -MAP)
%
% Translates the text of the MAP header field: "yes" becomes yes(_), whose
% argument is filled in state by state by scan_map//2, and "no" becomes no.
map_code("yes", yes(_)).
map_code("no",  no).
|
|
|
|
% nof_symbols(?Alph, ?N)
%
% Number of symbols in each supported alphabet.
nof_symbols(amino,   20).
nof_symbols(nucleic, 4).
|
|
|
|
% scanner_skip//
%
% Written directly in expanded (two extra arguments) form: accepts any
% input and any remainder. When the caller fixes the remainder to [], this
% amounts to skipping the rest of the line.
scanner_skip(_Input, _Rest).
|
|
|
|
% get_name(-Name)//
%
% Written directly in expanded form: returns all remaining input as Name
% and leaves nothing unconsumed.
get_name(Name, Input, Rest) :-
    Name = Input,
    Rest = [].
|
|
|
|
% scan_map(?MAP0, ?MAP)//
%
% Handles the optional map entry on a match emission line.
% With MAP0 = yes(List), one more token is parsed and placed at the head of
% List; the still-open tail is handed on as yes(Tail).
% With MAP0 = no, nothing is consumed.
scan_map(yes([Index|Tail]), yes(Tail)) -->
    scan_transition(Index).
scan_map(no, no) --> [].
|
|
|
|
% close_map(?MAP)
%
% Terminates the map accumulator after the last state: the open tail held
% in yes(Tail) is closed with []; no is accepted unchanged.
close_map(yes([])) :- !.
close_map(no).
|
|
|
|
|
|
% gen_program(+Out, +Model)
%
% Writes the whole generated program for a parsed model to stream Out:
% a comment header (version, alphabet, name, size), a slices/1 fact with
% the number of states, then the special-state CPTs, the null emission
% CPT, the per-state CPTs and finally the map section.
gen_program(Out, hmmer(Version,Name,NStates,Alphabet,Specials,NULE,Model,MAP)) :-
    format(Out, '~n% HMMer Version ~d (Plan 7) using ~a~n',[Version,Alphabet]),
    format(Out, '~n% Name: ~a~n',[Name]),
    format(Out, '~n% Size: ~d~n',[NStates]),
    format(Out, 'slices(~d).~n',[NStates]),
    gen_specials(Out, Specials),
    gen_nule(Out, NULE, Alphabet, NullProbs),
    gen_model(Out, Model, NullProbs),
    gen_map(Out, MAP).
|
|
|
|
|
|
%
|
|
% special nodes in graph.
|
|
%
|
|
% gen_specials(+Out, +Special)
%
% Writes one fact per special transition held in
%   special(NB,NN,EC,EJ,CT,CC,JB,JJ,GG,GF).
% Each fact has the form name(Score, 1.0, Prob), where Prob is the score
% converted by normalize/3 with a null probability of 1.0. The facts are
% grouped under comment lines naming the state being reached.
gen_specials(Out, special(NB,NN,EC,EJ,CT,CC,JB,JJ,GG,GF)) :-
    % reaching state N
    normalize(NN, 1.0, PNN),
    format(Out, '~n%Reaching state N.~n',[]),
    format(Out, 'n_n_cpt(~w,1.0,~w).~n',[NN,PNN]),
    % reaching state B
    normalize(JB, 1.0, PJB),
    format(Out, '~n%Reaching state B.~n',[]),
    format(Out, 'j_b_cpt(~w,1.0,~w).~n',[JB,PJB]),
    normalize(NB, 1.0, PNB),
    format(Out, 'n_b_cpt(~w,1.0,~w).~n',[NB,PNB]),
    % reaching state J
    format(Out, '~n%Reaching state J.~n',[]),
    normalize(EJ, 1.0, PEJ),
    format(Out, 'e_j_cpt(~w,1.0,~w).~n',[EJ,PEJ]),
    normalize(JJ, 1.0, PJJ),
    format(Out, 'j_j_cpt(~w,1.0,~w).~n',[JJ,PJJ]),
    % reaching state C
    format(Out, '~n%Reaching state C.~n',[]),
    normalize(EC, 1.0, PEC),
    format(Out, 'e_c_cpt(~w,1.0,~w).~n',[EC,PEC]),
    normalize(CC, 1.0, PCC),
    format(Out, 'c_c_cpt(~w,1.0,~w).~n',[CC,PCC]),
    % reaching state T
    format(Out, '~n%Reaching state T.~n',[]),
    normalize(CT, 1.0, PCT),
    format(Out, 'c_t_cpt(~w,1.0,~w).~n',[CT,PCT]),
    % null model
    format(Out, '~n%Reaching state G (Null Model).~n',[]),
    normalize(GG, 1.0, PGG),
    format(Out, 'g_g_cpt(~w,1.0,~w).~n',[GG,PGG]),
    format(Out, '~n%Reaching state F (Null Model).~n',[]),
    normalize(GF, 1.0, PGF),
    format(Out, 'g_f_cpt(~w,1.0,~w).~n',[GF,PGF]).
|
|
|
|
|
|
% normalize(+Score, +Null, -Prob)
%
% Converts a model score into a probability.
% The score '*' always maps to 0.0. Any other score gives
%   Prob = Null * exp(2.0, Score/1000).
% NOTE(review): exp/2 is presumably YAP's two-argument power function,
% i.e. 2.0 raised to Score/1000 -- confirm against the YAP manual.
normalize('*', _, 0.0) :- !.
normalize(Score, Null, Prob) :-
    Prob is Null * exp(2.0, (Score/1000)).
|
|
|
|
% normalizel(+Scores, +Null, -Probs)
%
% Applies normalize/3 to every score in a list, using the same null
% probability Null for all of them.
normalizel([], _, []).
normalizel([S|Ss], Null, [P|Ps]) :-
    normalize(S, Null, P),
    normalizel(Ss, Null, Ps).
|
|
|
|
% normalizell(+Scores, +Nulls, -Probs)
%
% Pairwise version of normalizel/3: the i-th score is converted with
% normalize/3 using the i-th element of Nulls as its null probability.
% The three lists must have the same length.
normalizell([], [], []).
normalizell([S|Ss], [Null|Nulls], [P|Ps]) :-
    normalize(S, Null, P),
    normalizell(Ss, Nulls, Ps).
|
|
|
|
% gen_nule(+Out, +NULE, +Alph, -Probs)
%
% Null emission CPT.
% Takes the scores stored as arguments of the NULE term, converts each
% with normalize/3 against the uniform probability 1/Size (Size being the
% number of symbols of Alph), and writes a nule_cpt/3 fact holding the raw
% scores, the uniform probability and the resulting probabilities.
% Probs is returned for use as the per-symbol null probabilities of the
% state emissions.
gen_nule(Out, NULE, Alph, Probs) :-
    NULE =.. [_|Scores],
    nof_symbols(Alph, Size),
    Uniform is 1/Size,
    normalizel(Scores, Uniform, Probs),
    RawTerm =.. [e|Scores],
    ProbTerm =.. [e|Probs],
    format(Out, '~n%Null state emission CPT.~n',[]),
    format(Out, 'nule_cpt(~n ~w,~n ~w,~n ~w).~n',[RawTerm,Uniform,ProbTerm]).
|
|
|
|
% gen_model(+Out, +Model, +NullProbs)
%
% Writes the main model: a b_d_cpt/3 fact built from BD, NBD and the value
% produced by normalize_bd/3, followed by the CPTs of every state,
% numbered from 1.
gen_model(Out, model(BD,NBD,States), NullProbs) :-
    normalize_bd(BD, NBD, BDProb),
    format(Out, '~n%Reaching first D.~n',[]),
    format(Out, 'b_d_cpt(~w,~w,~w).~n',[BD,NBD,BDProb]),
    gen_states(Out, States, 1, NullProbs).
|
|
|
|
% gen_states(+Out, +States, +FirstNo, +NullProbs)
%
% Writes the CPTs of each state in the list with gen_state/4, numbering
% them consecutively starting at FirstNo.
gen_states(_, [], _, _).
gen_states(Out, [State|Rest], No, NullProbs) :-
    gen_state(Out, State, No, NullProbs),
    Next is No+1,
    gen_states(Out, Rest, Next, NullProbs).
|
|
|
|
% gen_state(+W, +State, +StateNo, +PsCPT)
%
% Writes every fact for one state
%   State = t(E, I, s(MM,MI,MD,IM,II,DM,DD,BM,ME))
% where E and I are the match and insert emission scores and the s/9 term
% holds the transition scores. PsCPT is the list of null emission
% probabilities, used as the per-symbol base for the emission scores.
%
% Output, in order:
%   consensus/2  index of the most probable match emission;
%   me_cpt/4     raw match scores, null probabilities, match probabilities;
%   ie_cpt/4     raw insert scores, null probabilities, insert probabilities;
%   x_y_cpt/4    one fact per transition: raw score, 1.0, probability.
% The emission facts are followed by a comment with the sum of their
% probabilities.
gen_state(W, t(E,I,s(MM,MI,MD,IM,II,DM,DD,BM,ME)), StateNo, PsCPT) :-
    format(W, '~n%State ~d.~n',[StateNo]),
    Norm =.. [e|PsCPT],
    % match emissions
    normalizell(E, PsCPT, ECPT),
    E0s =.. [e|E],
    Es =.. [e|ECPT],
    find_consensus(W, StateNo, ECPT),
    sum(ECPT, 0, TE),
    format(W, 'me_cpt(~d,~n ~w,~n ~w,~n ~w). % ~w~n',[StateNo,E0s,Norm,Es,TE]),
    % insert emissions
    normalizell(I, PsCPT, ICPT),
    % First argument carries the raw insert scores, mirroring E0s above
    % (previously the normalized list was written twice).
    I0s =.. [e|I],
    Is =.. [e|ICPT],
    sum(ICPT, 0, TI),
    format(W, 'ie_cpt(~d,~n ~w,~n ~w,~n ~w). %~w~n',[StateNo,I0s,Norm,Is,TI]),
    % state transitions
    normalize(MM, 1.0, MMCPT),
    format(W, 'm_m_cpt(~d,~w,~w,~w).~n',[StateNo,MM,1.0,MMCPT]),
    normalize(MI, 1.0, MICPT),
    format(W, 'm_i_cpt(~d,~w,~w,~w).~n',[StateNo,MI,1.0,MICPT]),
    normalize(MD, 1.0, MDCPT),
    format(W, 'm_d_cpt(~d,~w,~w,~w).~n',[StateNo,MD,1.0,MDCPT]),
    normalize(II, 1.0, IICPT),
    format(W, 'i_i_cpt(~d,~w,~w,~w).~n',[StateNo,II,1.0,IICPT]),
    normalize(IM, 1.0, IMCPT),
    format(W, 'i_m_cpt(~d,~w,~w,~w).~n',[StateNo,IM,1.0,IMCPT]),
    normalize(DM, 1.0, DMCPT),
    format(W, 'd_m_cpt(~d,~w,~w,~w).~n',[StateNo,DM,1.0,DMCPT]),
    normalize(DD, 1.0, DDCPT),
    format(W, 'd_d_cpt(~d,~w,~w,~w).~n',[StateNo,DD,1.0,DDCPT]),
    normalize(BM, 1.0, BMCPT),
    format(W, 'b_m_cpt(~d,~w,~w,~w).~n',[StateNo,BM,1.0,BMCPT]),
    normalize(ME, 1.0, MECPT),
    format(W, 'm_e_cpt(~d,~w,~w,~w).~n',[StateNo,ME,1.0,MECPT]).
|
|
|
|
% gen_map(+Out, +MAP)
%
% Placeholder: succeeds without writing anything for the map information.
gen_map(_Out, _MAP).
|
|
|
|
% normalize_bd(+BD, +NBD, -Out)
%
% Currently the identity on BD: the first argument is returned unchanged
% and NBD is ignored.
normalize_bd(BD, _NBD, BD).
|
|
|
|
% sum(+Numbers, +Acc, -Total)
%
% Adds every element of Numbers to the accumulator Acc, left to right,
% and returns the result in Total.
sum([], Total, Total).
sum([X|Xs], Acc, Total) :-
    Acc1 is Acc+X,
    sum(Xs, Acc1, Total).
|
|
|
|
% find_consensus(+Out, +StateNo, +Probs)
%
% Writes consensus(StateNo, Index), where Index is the 1-based position of
% the largest value in Probs. The search starts from a maximum of 0, so if
% no value exceeds 0 the index written is -1.
find_consensus(Out, StateNo, Probs) :-
    max_index(Probs, 1, 0, -1, _, Best),
    format(Out, 'consensus(~d,~d).~n',[StateNo,Best]).
|
|
|
|
% max_index(+List, +Pos, +Max0, +Index0, -Max, -Index)
%
% Scans List, whose first element is at position Pos, for values strictly
% greater than the running maximum. Max0/Index0 are the best value and
% position seen so far; Max/Index are the final ones. On ties the earliest
% position is kept.
max_index([], _, Max, Index, Max, Index).
max_index([X|Xs], Pos, Max0, Index0, Max, Index) :-
    (   X > Max0
    ->  Next is Pos+1,
        max_index(Xs, Next, X, Pos, Max, Index)
    ;   Next is Pos+1,
        max_index(Xs, Next, Max0, Index0, Max, Index)
    ).
|
|
|