This repository has been archived on 2023-08-20. You can view files and clone it, but cannot push or open issues or pull requests.
yap-6.3/packages/CLPBN/examples/HMMer/scan.yap
2012-12-19 18:22:47 +00:00

330 lines
9.4 KiB
Prolog

%
% Convert HMMer model to CLP(BN) program
%
%
% main: convert the HMMer model stored in 'hmmer_b.1.18.1.hmm' into the
% CLP(BN) program 'hmmer_b.1.18.1.yap'.
%
% parse_model/2 never returns normally: the end of parsing is signalled by
% throwing done(Info), which is caught here and handed to stop/3.
% If parsing merely fails (for instance on a header line that no
% match_field clause accepts) the problem is reported on user_error, both
% streams are still closed, and main fails exactly as it did before.
% Exceptions other than done/1 are not intercepted and propagate to the
% caller.
%
main :-
    open('hmmer_b.1.18.1.hmm', read, S),
    open('hmmer_b.1.18.1.yap', write, W),
%   open('globin.hmm', read, S),
    (   catch(parse_model(S,_), done(Info), stop(S,W,Info))
    ->  Parsed = true
    ;   format(user_error, "Bad HMM~n", []),
        Parsed = false
    ),
    % always release the streams, whatever the outcome of the parse
    close(S),
    close(W),
    Parsed == true.
%
% stop(+Source, +Out, +Model): invoked once parsing has thrown done(Model).
% A fully instantiated model is turned into a program on Out; otherwise
% (or if generation fails) an error is printed on user_error.
% The source stream is not used here.
%
stop(_Source, Out, Model) :-
    ground(Model),
    gen_program(Out, Model).
stop(_, _, _) :-
    format(user_error,"Bad HMM~n", []).
%
% parse_model(+Stream, ?Model): read the header one line at a time and let
% match_field fill in Model. The recursion has no base case: it ends when
% get_line/3 or the "HMM" field handler throws done(Model), or when a line
% cannot be matched (failure).
%
parse_model(Stream, Model) :-
    get_line(Stream, Codes, Model),
%   format('~s~n',[Codes]),
    match_field(Model, Stream, Codes, []),
    parse_model(Stream, Model).
%
% get_line(+Stream, -Codes, +Info): read character codes up to (and not
% including) the next newline (code 10). Reaching end of file (code -1)
% aborts the whole parse by throwing done(Info).
%
get_line(S, Out, Info) :-
    get0(S, Code),
    get_line_rest(Code, S, Out, Info).

% get_line_rest(+Code, +Stream, -Codes, +Info): decide what to do with the
% code that was just read.
get_line_rest(10, _, Out, _) :- !,          % end of line
    Out = [].
get_line_rest(-1, _, _, Info) :- !,         % end of file
    throw(done(Info)).
get_line_rest(Code, S, [Code|Codes], Info) :-
    get_line(S, Codes, Info).
%
% match_field(?Info, +Stream)//0
%
% Recognise one header line of the HMMer file by its leading tag and record
% the values of interest in
%     Info = hmmer(Version,Name,NOfStates,Alphabet,Special,NULE,Model,MAP).
% parse_model/2 calls this as match_field(Info, Stream, Line, []), so each
% clause must consume the complete line: fields whose contents are not used
% end in scanner_skip, which discards whatever is left.
% Stream is only used by the "HMM" clause, which reads the model body
% directly from the stream and then throws done(Info) to end the parse.
%
match_field(hmmer(2,_,_,_,_,_,_,_), _) --> "HMMER", !,
    scanner_skip.                  % mandatory field, should be ground
match_field(hmmer(_,Name,_,_,_,_,_,_),_) --> "NAME", !,   % mandatory field
    scanner_skip_blanks,
    get_name(String),
    { atom_codes(Name,String) }.
% accession id, used to track DB accesses; the id itself is not needed, but
% it has to be skipped or the line would not be consumed completely.
match_field(_,_) --> "ACC", !, scanner_skip.
% optional free-text description line; ignored.
match_field(_,_) --> "DESC", !, scanner_skip.
match_field(hmmer(_,_,NOfStates,_,_,_,_,_),_) --> "LENG", !,  % number of states
    scanner_skip_blanks,
    get_number(NOfStates, 0).
match_field(hmmer(_,_,_,Alph,_,_,_,_),_) --> "ALPH", !,   % aminos or bases
    scanner_skip_blanks,
    check_alphabet(Alph).
match_field(_,_) --> "RF", !, scanner_skip.
match_field(_,_) --> "CS", !, scanner_skip.
match_field(hmmer(_,_,_,_,_,_,_,MAP),_) --> "MAP", !,
    scanner_skip_blanks,
    to_lower(Codes),
    { map_code(Codes,MAP) }.
match_field(_,_) --> "COM", !, scanner_skip.
match_field(_,_) --> "CKSUM", !, scanner_skip.
match_field(_,_) --> "GA", !, scanner_skip.
match_field(_,_) --> "TC", !, scanner_skip.
match_field(_,_) --> "NC", !, scanner_skip.
match_field(_,_) --> "NSEQ", !, scanner_skip.
match_field(_,_) --> "DATE", !, scanner_skip.
% XT fills the first eight slots of special/10 ...
match_field(hmmer(_,_,_,_,special(NB,NN,EC,EJ,CT,CC,JB,JJ,_,_),_,_,_),_) --> "XT", !,
    scan_transition(NB),
    scan_transition(NN),
    scan_transition(EC),
    scan_transition(EJ),
    scan_transition(CT),
    scan_transition(CC),
    scan_transition(JB),
    scan_transition(JJ).
% ... and NULT the remaining two.
match_field(hmmer(_,_,_,_,special(_,_,_,_,_,_,_,_,GG,GF),_,_,_),_) --> "NULT", !,
    scan_transition(GG),
    scan_transition(GF).
% NULE holds one score per alphabet symbol, stored as null(S1,...,SN).
% NOTE(review): relies on ALPH having been read before NULE; with Alph
% still unbound nof_symbols/2 would simply pick its first fact.
match_field(hmmer(_,_,_,Alph,_,NULE,_,_),_) --> "NULE", !,
    { nof_symbols(Alph,N) },
    scan_transitions(N,Transitions),
    { NULE =.. [null|Transitions] }.
match_field(_,_) --> "EVD", !,
    scanner_skip.                  % optional, but should do later.
% "HMM" starts the model body: drop the rest of this line and the line
% that follows it, read the model from the stream, then finish the parse.
match_field(Info,S) --> "HMM", !,
    scanner_skip,
    {
      get_line(S,_,Info),
      Info = hmmer(_,_,NOfStates,Alph,_,_,model(BD,NBD,Transitions),MAP),
      nof_symbols(Alph,N),
      scan_model(S,NOfStates,N,BD,NBD,Transitions,MAP,Info),
      throw(done(Info))
    }.
%
% scan_model(+Stream,+NOfStates,+NSymbols,-BD,-NBD,-States,?MAP,+Info):
% read the body of the model. The first line yields BD and NBD (see
% scan_bd); it is followed by NOfStates state descriptions.
%
scan_model(Stream, NOfStates, NSymbols, BD, NBD, States, MAP, Info) :-
    get_line(Stream, FirstLine, Info),
    scan_bd(BD, NBD, FirstLine, []),
    scan_states(NOfStates, NSymbols, Stream, MAP, States, Info).
%
% scan_states(+Left, +NSymbols, +Stream, ?MAP, -States, +Info): read Left
% states from Stream into a list of t(E,I,S) terms, threading the MAP
% accumulator through; the map is closed after the last state.
%
scan_states(0, _, _, MAP, [], _) :- !,
    close_map(MAP).
scan_states(Left, NSymbols, Stream, MAP, [t(E,I,S)|States], Info) :-
    scan_state(Stream, E, I, MAP, S, NSymbols, MAP1, Info),
    Left1 is Left-1,
    scan_states(Left1, NSymbols, Stream, MAP1, States, Info).
%
% scan_state(+Stream,-E,-I,?MAP,-S,+NSymbols,-NMAP,+Info): read the three
% lines describing one state. On each line the first field is skipped.
%   line 1: NSymbols scores (E), then the optional map entry;
%   line 2: NSymbols scores (I);
%   line 3: the nine transition scores, returned as s(MM,...,ME).
%
scan_state(Stream, E, I, MAP, s(MM,MI,MD,IM,II,DM,DD,BM,ME), NSymbols, NMAP, Info) :-
    get_line(Stream, ECodes, Info),
    get_line(Stream, ICodes, Info),
    get_line(Stream, SCodes, Info),
%   format('~s~n~s~n~s~n',[ECodes,ICodes,SCodes]),
    scanner_skip_field(ECodes, ERest),
    scan_transitions(NSymbols, E, ERest, MapCodes),
    scan_map(MAP, NMAP, MapCodes, []),
    scanner_skip_field(ICodes, IRest),
    scan_transitions(NSymbols, I, IRest, []),
    scanner_skip_field(SCodes, SRest),
    scan_transitions(MM, MI, MD, IM, II, DM, DD, BM, ME, SRest, []).
%
% scan_transitions(-MM,-MI,-MD,-IM,-II,-DM,-DD,-BM,-ME)//0: read nine
% consecutive scores, in this order, by delegating to the counted
% version of scan_transitions.
%
scan_transitions(MM, MI, MD, IM, II, DM, DD, BM, ME) -->
    scan_transitions(9, [MM, MI, MD, IM, II, DM, DD, BM, ME]).
%
% scan_transitions(+Count, -Scores)//0: read exactly Count scores into the
% list Scores.
%
scan_transitions(0, []) --> !.
scan_transitions(Count, [Score|Scores]) -->
    scan_transition(Score),
    { Left is Count-1 },
    scan_transitions(Left, Scores).
%
% scan_transition(-Score)//0: skip leading blanks, then read one score
% (an integer or the atom '*').
%
scan_transition(Score) -->
    scanner_skip_blanks,
    get_transition(Score).
%
% get_transition(-Score)//0: read a single score.
%   "*"      -> the atom '*' (trailing blanks are skipped);
%   digits   -> a non-negative integer;
%   "-"...   -> the negated integer that follows the sign (a lone "-"
%               therefore yields 0).
%
get_transition('*') --> "*", !,
    scanner_skip_blanks.
get_transition(Value) --> [Digit],
    { Digit >= 0'0, Digit =< 0'9 }, !,
    { First is Digit-0'0 },
    get_number(Value, First).
get_transition(Value) --> "-",
    get_number(Magnitude, 0),
    { Value is -Magnitude }.
%
% get_number(-Number, +Acc)//0: accumulate decimal digits onto Acc.
% Scanning stops at the first non-digit, which is consumed as well, or at
% the end of the input.
%
get_number(Nf, N0) --> [C], !,
    get_number_after(C, N0, Nf).
get_number(N, N) --> [].

% get_number_after(+Code, +Acc, -Number)//0: continue with the code just
% read: a digit extends the number, anything else terminates it.
get_number_after(C, N0, Nf) -->
    { C >= 0'0, C =< 0'9 }, !,
    { Ni is N0*10+(C-0'0) },
    get_number(Nf, Ni).
get_number_after(_, N0, Nf) -->
    { Nf = N0 }.
%
% scanner_skip_blanks//0: skip any run of blanks (spaces and tabs) at the
% front of the input; succeeds without consuming anything if there are
% none.
%
scanner_skip_blanks --> " ", !, scanner_skip_blanks.
scanner_skip_blanks --> [9], !, scanner_skip_blanks.   % 9 is the code of TAB
scanner_skip_blanks --> [].
% scanner_skip_field//0: read one field with scan_transition and discard
% its value.
scanner_skip_field --> scan_transition(_).
%
% scan_bd(-BD, -NBD)//0: parse the first line of the model body, which
% carries three fields; the first becomes BD, the second is ignored and
% the third becomes NBD.
%
scan_bd(BD, NBD) -->
    scan_transition(BD),
    scanner_skip_field,          % middle field is not used
    scan_transition(NBD).
%
% check_alphabet(-Alph)//0: lower-case the rest of the line and map it to
% an alphabet name through get_alph/2; fails for unknown alphabets.
%
check_alphabet(Alph) -->
    to_lower(Codes),
    { get_alph(Codes, Alph) }.
%
% to_lower(-Lower)//0: consume the whole remaining input and return it
% with the letters A-Z replaced by a-z; other codes are copied unchanged.
%
to_lower([Low|Lower]) --> [Code], !,
    { to_lower_code(Code, Low) },
    to_lower(Lower).
to_lower([]) --> [].

% to_lower_code(+Code, -Low): lower-case a single character code.
to_lower_code(Code, Low) :-
    (   Code >= 0'A, Code =< 0'Z
    ->  Low is Code+(0'a-0'A)
    ;   Low = Code
    ).
% get_alph(+LowerCaseCodes, -Alphabet): the alphabet names accepted on the
% ALPH line (already lower-cased by check_alphabet).
get_alph("amino", amino).
get_alph("nucleic", nucleic).
% map_code(+LowerCaseCodes, -MAP): value of the MAP line. "yes" starts an
% open list, yes(_), that scan_map extends with one entry per state and
% close_map terminates; "no" means no map entries are read.
map_code("yes", yes(_)).
map_code("no", no).
% nof_symbols(?Alphabet, ?Size): number of symbols, and hence of scores
% per emission line, for each supported alphabet.
nof_symbols(amino,20).
nof_symbols(nucleic,4).
% scanner_skip//0, written directly as a two-argument predicate: it relates
% any input to any remainder, so when the caller fixes the remainder to []
% it acts as "ignore the rest of the line".
scanner_skip(_,_).
% get_name(-Codes)//0, written directly as a three-argument predicate: the
% name is everything that is left of the line, and nothing remains.
get_name(L,L,[]).
% scan_map(?MAP, -NMAP)//0: with a map (yes/1), read one more field and
% add it to the open list, returning the new open tail in NMAP; without a
% map (no) nothing is consumed.
scan_map(yes([Id|Next]),yes(Next)) -->
scan_transition(Id).
scan_map(no,no) --> [].
% close_map(?MAP): terminate the open list built by scan_map by binding
% its tail to []; nothing to do when there is no map.
close_map(yes([])) :- !.
close_map(no).
%
% gen_program(+Out, +Model): write the CLP(BN) program for the parsed
% model on stream Out: a comment header, the slices/1 fact, the special
% transitions, the null emission table, the per-state tables and finally
% the map.
%
gen_program(Out, hmmer(Version,Name,Size,Alphabet,Specials,NULE,Model,MAP)) :-
    format(Out, '~n% HMMer Version ~d (Plan 7) using ~a~n',[Version,Alphabet]),
    format(Out, '~n% Name: ~a~n',[Name]),
    format(Out, '~n% Size: ~d~n',[Size]),
    format(Out, 'slices(~d).~n',[Size]),
    gen_specials(Out, Specials),
    gen_nule(Out, NULE, Alphabet, NullProbs),
    gen_model(Out, Model, NullProbs),
    gen_map(Out, MAP).
%
% special nodes in graph.
%
% gen_specials(+Out, +Special): for each of the ten special transition
% scores write a fact <from>_<to>_cpt(Score, 1.0, Probability), grouped
% under a comment naming the state that is reached.
%
gen_specials(Out, special(NB,NN,EC,EJ,CT,CC,JB,JJ,GG,GF)) :-
    % reaching state N
    normalize(NN,1.0,PNN),
    format(Out, '~n%Reaching state N.~n',[]),
    format(Out, 'n_n_cpt(~w,1.0,~w).~n',[NN,PNN]),
    % reaching state B
    normalize(JB,1.0,PJB),
    format(Out, '~n%Reaching state B.~n',[]),
    format(Out, 'j_b_cpt(~w,1.0,~w).~n',[JB,PJB]),
    normalize(NB,1.0,PNB),
    format(Out, 'n_b_cpt(~w,1.0,~w).~n',[NB,PNB]),
    % reaching state J
    format(Out, '~n%Reaching state J.~n',[]),
    normalize(EJ,1.0,PEJ),
    format(Out, 'e_j_cpt(~w,1.0,~w).~n',[EJ,PEJ]),
    normalize(JJ,1.0,PJJ),
    format(Out, 'j_j_cpt(~w,1.0,~w).~n',[JJ,PJJ]),
    % reaching state C
    format(Out, '~n%Reaching state C.~n',[]),
    normalize(EC,1.0,PEC),
    format(Out, 'e_c_cpt(~w,1.0,~w).~n',[EC,PEC]),
    normalize(CC,1.0,PCC),
    format(Out, 'c_c_cpt(~w,1.0,~w).~n',[CC,PCC]),
    % reaching state T
    format(Out, '~n%Reaching state T.~n',[]),
    normalize(CT,1.0,PCT),
    format(Out, 'c_t_cpt(~w,1.0,~w).~n',[CT,PCT]),
    % null model
    format(Out, '~n%Reaching state G (Null Model).~n',[]),
    normalize(GG,1.0,PGG),
    format(Out, 'g_g_cpt(~w,1.0,~w).~n',[GG,PGG]),
    format(Out, '~n%Reaching state F (Null Model).~n',[]),
    normalize(GF,1.0,PGF),
    format(Out, 'g_f_cpt(~w,1.0,~w).~n',[GF,PGF]).
%
% normalize(+Score, +Null, -Prob): turn a score read from the file into a
% probability. The marker '*' maps to 0.0; any other score S maps to
% Null * exp(2.0, S/1000).
% NOTE(review): exp/2 is presumably the two-argument power function
% (2.0 raised to S/1000) of the host Prolog - confirm.
%
normalize(*, _, 0.0) :- !.
normalize(Score, Null, Prob) :-
    Prob is Null * exp(2.0, (Score/1000)).
%
% normalizel(+Scores, +Null, -Probs): apply normalize/3 to every score of
% a list, using the same Null factor for all of them.
%
normalizel([], _, []).
normalizel([S|Ss], Null, [P|Ps]) :-
    normalize(S, Null, P),
    normalizel(Ss, Null, Ps).
%
% normalizell(+Scores, +Nulls, -Probs): apply normalize/3 pairwise, each
% score with the factor at the same position of Nulls. The two input lists
% must have the same length.
%
normalizell([], [], []).
normalizell([S|Ss], [Null|Nulls], [P|Ps]) :-
    normalize(S, Null, P),
    normalizell(Ss, Nulls, Ps).
% null emission CPT
%
% gen_nule(+Out, +NULE, +Alph, -Probs): NULE holds one score per symbol of
% the alphabet. Each score is normalised against the uniform value
% 1/Size and the fact nule_cpt(Scores, Uniform, Probabilities) is written;
% the probabilities are also returned for use by the per-state tables.
%
gen_nule(Out, NULE, Alph, Probs) :-
    NULE =.. [_|Scores],
    nof_symbols(Alph, Size),
    Uniform is 1/Size,
    normalizel(Scores, Uniform, Probs),
    ScoreTerm =.. [e|Scores],
    ProbTerm =.. [e|Probs],
    format(Out, '~n%Null state emission CPT.~n',[]),
    format(Out, 'nule_cpt(~n ~w,~n ~w,~n ~w).~n',[ScoreTerm,Uniform,ProbTerm]).
%
% gen_model(+Out, +Model, +NullProbs): write the b_d_cpt/3 fact built from
% the two values of the first model line, then the tables of every state,
% numbering the states from 1.
%
gen_model(Out, model(BD,NBD,States), NullProbs) :-
    normalize_bd(BD, NBD, BDProb),
    format(Out, '~n%Reaching first D.~n',[]),
    format(Out, 'b_d_cpt(~w,~w,~w).~n',[BD,NBD,BDProb]),
    gen_states(Out, States, 1, NullProbs).
%
% gen_states(+Out, +States, +StateNo, +NullProbs): write the tables of all
% states in the list, the first one being numbered StateNo.
%
gen_states(_, [], _, _).
gen_states(Out, [State|Rest], StateNo, NullProbs) :-
    gen_state(Out, State, StateNo, NullProbs),
    Next is StateNo+1,
    gen_states(Out, Rest, Next, NullProbs).
%
% gen_state(+W, +State, +StateNo, +PsCPT): write all the tables of state
% number StateNo. State is t(E,I,s(MM,MI,MD,IM,II,DM,DD,BM,ME)), where E
% and I are lists of scores with one entry per alphabet symbol and the
% remaining values are the nine transition scores. PsCPT is the list of
% null-model probabilities used to normalise E and I.
%
% Facts written:
%   consensus(StateNo, Index)                      (see find_consensus/3)
%   me_cpt(StateNo, Scores, Null, Probabilities)   from E
%   ie_cpt(StateNo, Scores, Null, Probabilities)   from I
%   <x>_<y>_cpt(StateNo, Score, 1.0, Probability)  for every transition
% The sum of the probabilities is appended to the me_cpt and ie_cpt facts
% as a trailing comment.
%
gen_state(W, t(E,I,s(MM,MI,MD,IM,II,DM,DD,BM,ME)), StateNo, PsCPT) :-
    format(W, '~n%State ~d.~n',[StateNo]),
    Norm =.. [e|PsCPT],
    % first emission table
    normalizell(E,PsCPT,ECPT),
    E0s =.. [e|E],
    Es =.. [e|ECPT],
    find_consensus(W,StateNo,ECPT),
    sum(ECPT,0,TE),
    format(W, 'me_cpt(~d,~n ~w,~n ~w,~n ~w). % ~w~n',[StateNo,E0s,Norm,Es,TE]),
    % second emission table; I0s carries the raw scores, as E0s does above
    normalizell(I,PsCPT,ICPT),
    I0s =.. [e|I],
    Is =.. [e|ICPT],
    sum(ICPT,0,TI),
    format(W, 'ie_cpt(~d,~n ~w,~n ~w,~n ~w). %~w~n',[StateNo,I0s,Norm,Is,TI]),
    % transitions
    normalize(MM,1.0,MMCPT),
    format(W, 'm_m_cpt(~d,~w,~w,~w).~n',[StateNo,MM,1.0,MMCPT]),
    normalize(MI,1.0,MICPT),
    format(W, 'm_i_cpt(~d,~w,~w,~w).~n',[StateNo,MI,1.0,MICPT]),
    normalize(MD,1.0,MDCPT),
    format(W, 'm_d_cpt(~d,~w,~w,~w).~n',[StateNo,MD,1.0,MDCPT]),
    normalize(II,1.0,IICPT),
    format(W, 'i_i_cpt(~d,~w,~w,~w).~n',[StateNo,II,1.0,IICPT]),
    normalize(IM,1.0,IMCPT),
    format(W, 'i_m_cpt(~d,~w,~w,~w).~n',[StateNo,IM,1.0,IMCPT]),
    normalize(DM,1.0,DMCPT),
    format(W, 'd_m_cpt(~d,~w,~w,~w).~n',[StateNo,DM,1.0,DMCPT]),
    normalize(DD,1.0,DDCPT),
    format(W, 'd_d_cpt(~d,~w,~w,~w).~n',[StateNo,DD,1.0,DDCPT]),
    normalize(BM,1.0,BMCPT),
    format(W, 'b_m_cpt(~d,~w,~w,~w).~n',[StateNo,BM,1.0,BMCPT]),
    normalize(ME,1.0,MECPT),
    format(W, 'm_e_cpt(~d,~w,~w,~w).~n',[StateNo,ME,1.0,MECPT]).
% gen_map(+W, +MAP): placeholder; nothing is written for the map.
gen_map(_,_).
% normalize_bd(+A, +B, -Result): placeholder; the result is the first
% value unchanged and the second value is ignored.
normalize_bd(A,B,A).
%
% sum(+Numbers, +Acc, -Total): Total is Acc plus all the elements of
% Numbers.
%
sum([], Total, Total).
sum([X|Xs], Acc, Total) :-
    Acc1 is Acc+X,
    sum(Xs, Acc1, Total).
%
% find_consensus(+Out, +StateNo, +Probs): write consensus(StateNo, Index),
% where Index is the 1-based position of the largest probability in Probs
% (-1 if no entry is greater than 0).
%
find_consensus(Out, StateNo, Probs) :-
    max_index(Probs, 1, 0, -1, _Best, BestIndex),
    format(Out, 'consensus(~d,~d).~n',[StateNo,BestIndex]).
%
% max_index(+List, +I0, +Max0, +MaxIndex0, -Max, -MaxIndex): scan List,
% whose first element has index I0, for its largest element. Max0 and
% MaxIndex0 are the best value and index so far; an element only replaces
% them when it is strictly greater, so the first of several equal maxima
% is the one reported.
%
max_index([], _, Max, MaxIndex, Max, MaxIndex).
max_index([X|Xs], I0, Max0, MaxIndex0, Max, MaxIndex) :-
    I is I0+1,
    (   X > Max0
    ->  max_index(Xs, I, X, I0, Max, MaxIndex)
    ;   max_index(Xs, I, Max0, MaxIndex0, Max, MaxIndex)
    ).