yap-6.3/packages/CLPBN/examples/HMMer/scan.yap

%
% Convert HMMer model to CLP(BN) program
%
% main
%
% Reads the HMMer model in 'hmmer_b.1.18.1.hmm' and writes the generated
% CLP(BN) program to 'hmmer_b.1.18.1.yap'.
%
% parse_model/2 reports the end of parsing by throwing done(Info); the
% inner catch/3 hands that term to stop/3, which emits the program.
%
% Both streams are closed on every path: normal completion, failure of
% the parser (e.g. an unrecognised header line) and unexpected exceptions.
% After closing, the original outcome is reproduced: main succeeds, fails,
% or re-throws the exception exactly as the parse/generate step did.
main :-
	open('hmmer_b.1.18.1.hmm', read, S),
%	open('globin.hmm', read, S),
	% do not leak the input stream if the output file cannot be opened
	catch(open('hmmer_b.1.18.1.yap', write, W),
	      OpenError,
	      ( close(S), throw(OpenError) )),
	catch(( catch(parse_model(S,_), done(Info), stop(S,W,Info)) ->
	          Outcome = ok
	      ;   Outcome = failed
	      ),
	      Error,
	      Outcome = raised(Error)),
	close(S),
	close(W),
	(   Outcome = raised(Raised) ->
	    throw(Raised)
	;   Outcome == ok
	).

% stop(+S, +W, +Info)
%
% Recovery goal run when parsing throws done(Info).
% If Info is fully instantiated the program is generated on stream W;
% otherwise (or on backtracking) "Bad HMM" is reported on user_error.
% The input stream S is not used here.
stop(_S, W, Info) :-
	(   ground(Info),
	    gen_program(W, Info)
	;   format(user_error,"Bad HMM~n", [])
	).

% parse_model(+S, ?Info)
%
% Header loop: reads one line of character codes from stream S, lets
% match_field//2 interpret it (the whole line must be consumed), and
% recurses. The loop has no base case; it ends when get_line/3 or the
% "HMM" clause of match_field//2 throws done(Info), or when a line
% cannot be parsed (failure).
parse_model(S, Info) :-
	get_line(S, Codes, Info),
%	format('~s~n',[Codes]),
	match_field(Info, S, Codes, []),
	parse_model(S, Info).

% get_line(+S, -Out, +Info)
%
% Reads character codes from stream S up to (and excluding) the next
% newline (code 10) and returns them in Out.
% On end of file (code -1) it throws done(Info) instead of returning.
get_line(S, Out, Info) :-
	get0(S, Code),
	(   Code == 10
	->  Out = []
	;   Code == -1
	->  throw(done(Info))
	;   Out = [Code|Rest],
	    get_line(S, Rest, Info)
	).


% match_field(?Info, +S)//
%
% Parses one header line of the model file. The line is recognised by
% its leading tag; the matching clause commits with a cut and either
% stores the field in
%   Info = hmmer(Version,Name,NOfStates,Alph,Special,NULE,Model,MAP)
% or discards the rest of the line with scanner_skip.
% S is the input stream; it is only needed by the "HMM" clause, which
% reads the model body directly from the stream and then throws
% done(Info) to end parsing.
match_field(hmmer(2,_,_,_,_,_,_,_), _) --> "HMMER", !,
	scanner_skip.			% mandatory field, should be ground
match_field(hmmer(_,Name,_,_,_,_,_,_),_) --> "NAME", !, 	% mandatory field
	scanner_skip_blanks,
	get_name(String), {atom_codes(Name,String) }.
% accession id, used to track DB accesses; the id itself is ignored, but
% the rest of the line has to be skipped or the line would not parse.
match_field(_,_) --> "ACC", !, scanner_skip.
% optional free-text description line, ignored.
match_field(_,_) --> "DESC", !, scanner_skip.
match_field(hmmer(_,_,NOfStates,_,_,_,_,_),_) --> "LENG", !, % number of states
	scanner_skip_blanks,
	get_number(NOfStates, 0).
match_field(hmmer(_,_,_,Alph,_,_,_,_),_) --> "ALPH", !, % aminos or bases
	scanner_skip_blanks,
	check_alphabet(Alph).
match_field(_,_) --> "RF", !, scanner_skip.
match_field(_,_) --> "CS", !, scanner_skip.
match_field(hmmer(_,_,_,_,_,_,_,MAP),_) --> "MAP", !,
	scanner_skip_blanks,
	to_lower(Codes),
	{ map_code(Codes,MAP) }.
match_field(_,_) --> "COM", !, scanner_skip.
match_field(_,_) --> "CKSUM", !, scanner_skip.
match_field(_,_) --> "GA", !, scanner_skip.
match_field(_,_) --> "TC", !, scanner_skip.
match_field(_,_) --> "NC", !, scanner_skip.
match_field(_,_) --> "NSEQ", !, scanner_skip.
match_field(_,_) --> "DATE", !, scanner_skip.
% eight special transitions, stored in the first eight slots of special/10
match_field(hmmer(_,_,_,_,special(NB,NN,EC,EJ,CT,CC,JB,JJ,_,_),_,_,_),_) --> "XT", !,
	scan_transition(NB),
	scan_transition(NN),
	scan_transition(EC),
	scan_transition(EJ),
	scan_transition(CT),
	scan_transition(CC),
	scan_transition(JB),
	scan_transition(JJ).
% null model transitions, stored in the last two slots of special/10
match_field(hmmer(_,_,_,_,special(_,_,_,_,_,_,_,_,GG,GF),_,_,_),_) --> "NULT", !,
	scan_transition(GG),
	scan_transition(GF).
% null model emissions: one value per alphabet symbol, kept as null(...)
match_field(hmmer(_,_,_,Alph,_,NULE,_,_),_) --> "NULE", !,
	{ nof_symbols(Alph,N) },
	scan_transitions(N,Transitions),
	{ NULE =.. [null|Transitions] }.
match_field(_,_) --> "EVD", !,
	scanner_skip.			% optional, but should do later.
% start of the model body: skip this line and the following one, read
% the body from the stream, then signal completion with done(Info).
match_field(Info,S) --> "HMM", !,
	scanner_skip,
	{
	  get_line(S,_,Info),
	  Info = hmmer(_,_,NOfStates,Alph,_,_,model(BD,NBD,Transitions),MAP),
	  nof_symbols(Alph,N),
	  scan_model(S,NOfStates,N,BD,NBD,Transitions,MAP,Info),
	  throw(done(Info))
	}.

% scan_model(+S, +NOfStates, +N, -BD, -NBD, -Transitions, ?MAP, +Info)
%
% Reads the model body from stream S: first the line holding the BD and
% NBD values (parsed by scan_bd//2, which must consume the whole line),
% then NOfStates state records with N emission values each.
% Info is only forwarded so that get_line/3 can throw done(Info) at EOF.
scan_model(S, NOfStates, N, BD, NBD, Transitions, MAP, Info) :-
	get_line(S, BDLine, Info),
	scan_bd(BD, NBD, BDLine, []),
	scan_states(NOfStates, N, S, MAP, Transitions, Info).

% scan_states(+NOfStates, +N, +Stream, ?MAP, -Transitions, +Info)
%
% Reads NOfStates state records from Stream, producing one t(E,I,S)
% term per state. The MAP accumulator is threaded through scan_state/8
% and must be closed (see close_map/1) once no state is left.
scan_states(NOfStates, N, Stream, MAP, Transitions, Info) :-
	(   NOfStates = 0, Transitions = []
	->  close_map(MAP)
	;   Transitions = [t(E,I,S)|MoreTransitions],
	    Remaining is NOfStates-1,
	    scan_state(Stream, E, I, MAP, S, N, NMAP, Info),
	    scan_states(Remaining, N, Stream, NMAP, MoreTransitions, Info)
	).

% scan_state(+Stream, -E, -I, ?MAP, -S, +N, -NMAP, +Info)
%
% Reads the three lines that describe one state:
%   line 1: a leading field (skipped), N values E, then the optional map
%           entry handled by scan_map//2 (MAP before, NMAP after);
%   line 2: a leading field (skipped), then N values I;
%   line 3: a leading field (skipped), then the nine transitions that
%           fill S = s(MM,MI,MD,IM,II,DM,DD,BM,ME).
% Each line has to be consumed completely.
scan_state(Stream, E, I, MAP, s(MM,MI,MD,IM,II,DM,DD,BM,ME), N, NMAP, Info) :-
	% fetch the raw lines first, in file order
	get_line(Stream, Line1, Info),
	get_line(Stream, Line2, Info),
	get_line(Stream, Line3, Info),
%	format('~s~n~s~n~s~n',[Line1,Line2,Line3]),
	% line 1: E values plus map entry
	scanner_skip_field(Line1, Line1Body),
	scan_transitions(N, E, Line1Body, MapCodes),
	scan_map(MAP, NMAP, MapCodes, []),
	% line 2: I values
	scanner_skip_field(Line2, Line2Body),
	scan_transitions(N, I, Line2Body, []),
	% line 3: state transitions
	scanner_skip_field(Line3, Line3Body),
	scan_transitions(MM, MI, MD, IM, II, DM, DD, BM, ME, Line3Body, []).

% scan_transitions(-MM, -MI, -MD, -IM, -II, -DM, -DD, -BM, -ME)//
%
% Scans nine consecutive transition values, in the order of the
% arguments, by delegating to the counted list version
% scan_transitions//2.
scan_transitions(MM, MI, MD, IM, II, DM, DD, BM, ME) -->
	scan_transitions(9, [MM, MI, MD, IM, II, DM, DD, BM, ME]).


% scan_transitions(+N, -Ts)//
%
% Scans exactly N transition values into the list Ts.
% With N = 0 nothing is consumed and Ts is the empty list.
scan_transitions(N, Ts) -->
	(   { N = 0, Ts = [] }
	->  []
	;   { Ts = [T|Rest], Left is N-1 },
	    scan_transition(T),
	    scan_transitions(Left, Rest)
	).

% scan_transition(-Value)//
%
% Skips leading blanks and tabs, then reads one value with
% get_transition//1 (a number or the atom '*').
scan_transition(Value) -->
	scanner_skip_blanks,
	get_transition(Value).

% get_transition(-T)//
%
% Reads one value from the input:
%   "*"      -> the atom '*' (trailing blanks are skipped as well);
%   digits   -> a non-negative integer;
%   "-"...   -> the negated integer that follows the minus sign.
get_transition('*') -->
	"*", !,
	scanner_skip_blanks.
get_transition(Value) -->
	[Code],
	{ 0'0 =< Code, Code =< 0'9 }, !,
	{ FirstDigit is Code-0'0 },
	get_number(Value, FirstDigit).
get_transition(Value) -->
	"-",
	get_number(Magnitude, 0),
	{ Value is -Magnitude }.

% get_number(-Nf, +N0)//
%
% Accumulates decimal digits onto N0 (N0*10 + digit for each digit) and
% returns the result in Nf. Scanning stops at the end of the input or
% at the first non-digit code; note that this delimiter code is
% consumed too.
get_number(Result, Acc) -->
	[Code], !,
	(   { Code >= 0'0, Code =< 0'9 }
	->  { Acc1 is Acc*10+(Code-0'0) },
	    get_number(Result, Acc1)
	;   { Result = Acc }
	).
get_number(Acc, Acc) -->
	[].

% scanner_skip_blanks//
%
% Consumes any run of space and tab characters (possibly empty).
scanner_skip_blanks -->
	( " " ; [9] ), !,		% 9 is the character code of a tab
	scanner_skip_blanks.
scanner_skip_blanks -->
	[].


% scanner_skip_field//
%
% Skips one field: leading blanks followed by a single value in the
% syntax accepted by get_transition//1. The value is discarded.
scanner_skip_field -->
	scanner_skip_blanks,
	get_transition(_).

% scan_bd(-BD, -NBD)//
%
% Parses a line made of three values: the first is returned as BD, the
% second is ignored, the third is returned as NBD.
scan_bd(First, Third) -->
	scan_transition(First),
	scanner_skip_field,		% middle value is not used
	scan_transition(Third).

% check_alphabet(-Alph)//
%
% Lower-cases the rest of the line and maps it through get_alph/2 to an
% alphabet name; fails if the text is not a known alphabet.
check_alphabet(Alphabet) -->
	to_lower(LowerCodes),
	{ get_alph(LowerCodes, Alphabet) }.

% to_lower(-Lower)//
%
% Consumes all remaining input and returns it in Lower with the codes of
% 'A'..'Z' replaced by those of 'a'..'z'; other codes are copied as is.
to_lower([Out|Outs]) -->
	[In], !,
	{ (   In >= 0'A, In =< 0'Z
	  ->  Out is In+(0'a-0'A)
	  ;   Out = In
	  )
	},
	to_lower(Outs).
to_lower([]) -->
	[].

% get_alph(?Codes, ?Alph)
%
% Maps the lower-case alphabet name, given as the text of a
% double-quoted string, to the atom used for that alphabet.
get_alph(Codes, Alph) :-
	(   Codes = "amino",   Alph = amino
	;   Codes = "nucleic", Alph = nucleic
	).

% map_code(?Codes, ?MAP)
%
% Translates the lower-cased value of the MAP header field:
%   "yes" -> yes(_), whose argument is later filled with the map entries
%            (see scan_map//2);
%   "no"  -> no.
map_code(Codes, MAP) :-
	(   Codes = "yes", MAP = yes(_)
	;   Codes = "no",  MAP = no
	).

% nof_symbols(?Alph, ?N)
%
% Number of symbols N in alphabet Alph: 20 for amino, 4 for nucleic.
nof_symbols(Alph, N) :-
	(   Alph = amino,   N = 20
	;   Alph = nucleic, N = 4
	).

% scanner_skip//  (written directly as scanner_skip/2)
%
% Accepts any input and leaves the remainder unconstrained, so the
% caller may unify it with whatever it expects (e.g. [] for "end of
% line"). In effect it discards the rest of the line.
scanner_skip(_Input, _Remainder).

% get_name(-Name)//  (written directly as get_name/3)
%
% Returns all remaining input as Name and leaves nothing unconsumed.
get_name(Name, Input, Remainder) :-
	Name = Input,
	Remainder = [].

% scan_map(?MAP, -NMAP)//
%
% Handles the map entry of a state line.
% With MAP = yes(List), one value Id is scanned and becomes the next
% element of the open-ended List; NMAP = yes(Tail) carries the rest.
% With MAP = no, nothing is consumed and NMAP = no.
scan_map(yes([Id|Tail]), yes(Tail)) -->
	scan_transition(Id).
scan_map(no, no) -->
	[].

% close_map(?MAP)
%
% Terminates the map accumulator after the last state: the open tail in
% yes(Tail) is closed with [], while no is accepted unchanged.
close_map(MAP) :-
	(   MAP = yes([])
	->  true
	;   MAP = no
	).


% gen_program(+W, +Info)
%
% Writes the generated program to stream W from the parsed model
%   Info = hmmer(VersionId,Name,NOfStates,Alphabet,Specials,NULE,Model,MAP).
% Output order: header comments, slices/1 fact, special transitions,
% null emission CPT, per-state CPTs, map (currently a no-op).
gen_program(W, hmmer(Version,ModelName,Size,Alph,Specials,NullEmissions,Model,Map)) :-
	% header
	format(W, '~n% HMMer Version ~d (Plan 7) using ~a~n',[Version,Alph]),
	format(W, '~n% Name: ~a~n',[ModelName]),
	format(W, '~n% Size: ~d~n',[Size]),
	format(W, 'slices(~d).~n',[Size]),
	% body
	gen_specials(W, Specials),
	gen_nule(W, NullEmissions, Alph, NullProbs),
	gen_model(W, Model, NullProbs),
	gen_map(W, Map).


%
% special nodes in graph.
%
% gen_specials(+W, +Special)
%
% Writes one <from>_<to>_cpt(Score,1.0,Prob) fact to stream W for each of
% the ten entries of special(NB,NN,EC,EJ,CT,CC,JB,JJ,GG,GF). Prob is the
% score converted by normalize/3 with a null probability of 1.0.
% Facts are grouped by the state they lead to.
gen_specials(W, special(NB,NN,EC,EJ,CT,CC,JB,JJ,GG,GF)) :-
	% into N
	normalize(NN, 1.0, ProbNN),
	format(W, '~n%Reaching state N.~n',[]),
	format(W, 'n_n_cpt(~w,1.0,~w).~n',[NN,ProbNN]),
	% into B
	normalize(JB, 1.0, ProbJB),
	format(W, '~n%Reaching state B.~n',[]),
	format(W, 'j_b_cpt(~w,1.0,~w).~n',[JB,ProbJB]),
	normalize(NB, 1.0, ProbNB),
	format(W, 'n_b_cpt(~w,1.0,~w).~n',[NB,ProbNB]),
	% into J
	format(W, '~n%Reaching state J.~n',[]),
	normalize(EJ, 1.0, ProbEJ),
	format(W, 'e_j_cpt(~w,1.0,~w).~n',[EJ,ProbEJ]),
	normalize(JJ, 1.0, ProbJJ),
	format(W, 'j_j_cpt(~w,1.0,~w).~n',[JJ,ProbJJ]),
	% into C
	format(W, '~n%Reaching state C.~n',[]),
	normalize(EC, 1.0, ProbEC),
	format(W, 'e_c_cpt(~w,1.0,~w).~n',[EC,ProbEC]),
	normalize(CC, 1.0, ProbCC),
	format(W, 'c_c_cpt(~w,1.0,~w).~n',[CC,ProbCC]),
	% into T
	format(W, '~n%Reaching state T.~n',[]),
	normalize(CT, 1.0, ProbCT),
	format(W, 'c_t_cpt(~w,1.0,~w).~n',[CT,ProbCT]),
	% null model: G and F
	format(W, '~n%Reaching state G (Null Model).~n',[]),
	normalize(GG, 1.0, ProbGG),
	format(W, 'g_g_cpt(~w,1.0,~w).~n',[GG,ProbGG]),
	format(W, '~n%Reaching state F (Null Model).~n',[]),
	normalize(GF, 1.0, ProbGF),
	format(W, 'g_f_cpt(~w,1.0,~w).~n',[GF,ProbGF]).


% normalize(+Score, +NULL, -Prob)
%
% Converts a score into a probability:
%   '*'    -> 0.0;
%   number -> NULL * 2.0^(Score/1000), using YAP's exp/2 (power).
normalize('*', _, 0.0) :- !.
normalize(Score, NULL, Prob) :-
	Exponent is Score/1000,
	Prob is NULL * exp(2.0, Exponent).

% normalizel(+Scores, +NULL, -Probs)
%
% Applies normalize/3 to every score in Scores, using the same null
% probability NULL for all of them.
normalizel([], _, []).
normalizel([S|Ss], NULL, [P|Ps]) :-
	normalize(S, NULL, P),
	normalizel(Ss, NULL, Ps).

% normalizell(+Scores, +Norms, -Probs)
%
% Pairwise version of normalizel/3: the i-th score is converted with
% the i-th element of Norms as its null probability. The lists must
% have the same length.
normalizell([], [], []).
normalizell([S|Ss], [Null|Nulls], [P|Ps]) :-
	normalize(S, Null, P),
	normalizell(Ss, Nulls, Ps).

% gen_nule(+W, +NULE, +Alph, -PsCPT)
%
% Null emission CPT. Takes the scores stored as arguments of NULE,
% converts each with normalize/3 using 1/Size as the null probability
% (Size = number of symbols of Alph), and writes to stream W a
%   nule_cpt(e(Scores...), Norm, e(Probs...))
% fact. The probabilities are also returned as the list PsCPT.
gen_nule(W, NULE, Alph, PsCPT) :-
	NULE =.. [_|Scores],
	RawTerm =.. [e|Scores],
	nof_symbols(Alph, Size),
	Norm is 1/Size,
	normalizel(Scores, Norm, PsCPT),
	ProbTerm =.. [e|PsCPT],
	format(W, '~n%Null state emission CPT.~n',[]),
	format(W, 'nule_cpt(~n     ~w,~n     ~w,~n     ~w).~n',[RawTerm,Norm,ProbTerm]).

% gen_model(+W, +Model, +PsCPT)
%
% Writes the model body to stream W: first the b_d_cpt/3 fact built from
% BD, NBD and the result of normalize_bd/3, then the CPTs of every
% state, numbered from 1. PsCPT holds the null emission probabilities.
gen_model(W, model(BD,NBD,States), NullProbs) :-
	normalize_bd(BD, NBD, ProbBD),
	format(W, '~n%Reaching first D.~n',[]),
	format(W, 'b_d_cpt(~w,~w,~w).~n',[BD,NBD,ProbBD]),
	FirstState = 1,
	gen_states(W, States, FirstState, NullProbs).

% gen_states(+W, +States, +StateNo, +PsCPT)
%
% Emits the CPTs of each state in States on stream W, numbering the
% states consecutively starting at StateNo.
gen_states(_, [], _, _).
gen_states(W, [Current|Later], Index, NullProbs) :-
	gen_state(W, Current, Index, NullProbs),
	NextIndex is Index+1,
	gen_states(W, Later, NextIndex, NullProbs).

% gen_state(+W, +State, +StateNo, +PsCPT)
%
% Writes all facts for state number StateNo to stream W.
% State = t(E,I,s(MM,MI,MD,IM,II,DM,DD,BM,ME)), where E and I are the
% lists of raw scores read from the first and second line of the state
% record, and s/9 holds the raw transition scores.
% PsCPT is the list of null emission probabilities; it is the per-symbol
% null probability used when converting E and I with normalizell/3.
%
% Emitted, in order:
%   consensus(StateNo, Index)            index of the largest E probability
%   me_cpt(StateNo, RawE, Null, ProbE)   plus the sum of ProbE as a comment
%   ie_cpt(StateNo, RawI, Null, ProbI)   plus the sum of ProbI as a comment
%   <x>_<y>_cpt(StateNo, Score, 1.0, Prob) for each of the nine transitions
gen_state(W, t(E,I,s(MM,MI,MD,IM,II,DM,DD,BM,ME)),StateNo,PsCPT) :-
	format(W, '~n%State ~d.~n',[StateNo]),
	Norm =.. [e|PsCPT],
	normalizell(E,PsCPT,ECPT),
	E0s =.. [e|E],
	Es =.. [e|ECPT],
	find_consensus(W,StateNo,ECPT),
	sum(ECPT,0,TE),
	format(W, 'me_cpt(~d,~n    ~w,~n    ~w,~n    ~w). % ~w~n',[StateNo,E0s,Norm,Es,TE]),
	normalizell(I,PsCPT,ICPT),
	% raw scores go first, as in me_cpt; the probabilities follow in Is
	I0s =.. [e|I],
	Is =.. [e|ICPT],
	sum(ICPT,0,TI),
	format(W, 'ie_cpt(~d,~n    ~w,~n    ~w,~n    ~w). %~w~n',[StateNo,I0s,Norm,Is,TI]),
	% transitions: raw score, null probability 1.0, resulting probability
	normalize(MM,1.0,MMCPT),
	format(W, 'm_m_cpt(~d,~w,~w,~w).~n',[StateNo,MM,1.0,MMCPT]),
	normalize(MI,1.0,MICPT),
	format(W, 'm_i_cpt(~d,~w,~w,~w).~n',[StateNo,MI,1.0,MICPT]),
	normalize(MD,1.0,MDCPT),
	format(W, 'm_d_cpt(~d,~w,~w,~w).~n',[StateNo,MD,1.0,MDCPT]),
	normalize(II,1.0,IICPT),
	format(W, 'i_i_cpt(~d,~w,~w,~w).~n',[StateNo,II,1.0,IICPT]),
	normalize(IM,1.0,IMCPT),
	format(W, 'i_m_cpt(~d,~w,~w,~w).~n',[StateNo,IM,1.0,IMCPT]),
	normalize(DM,1.0,DMCPT),
	format(W, 'd_m_cpt(~d,~w,~w,~w).~n',[StateNo,DM,1.0,DMCPT]),
	normalize(DD,1.0,DDCPT),
	format(W, 'd_d_cpt(~d,~w,~w,~w).~n',[StateNo,DD,1.0,DDCPT]),
	normalize(BM,1.0,BMCPT),
	format(W, 'b_m_cpt(~d,~w,~w,~w).~n',[StateNo,BM,1.0,BMCPT]),
	normalize(ME,1.0,MECPT),
	format(W, 'm_e_cpt(~d,~w,~w,~w).~n',[StateNo,ME,1.0,MECPT]).

% gen_map(+W, +MAP)
%
% Placeholder: nothing is written for the map; always succeeds.
gen_map(_W, _MAP).

% normalize_bd(+BD, +NBD, -BDCPT)
%
% Currently the identity on its first argument: BDCPT is BD itself and
% NBD is ignored.
normalize_bd(BD, _NBD, BD).

% sum(+Numbers, +Acc0, -Total)
%
% Total is Acc0 plus the sum of all elements of Numbers.
sum([], Total, Total).
sum([X|Xs], Acc, Total) :-
	Acc1 is Acc+X,
	sum(Xs, Acc1, Total).

% find_consensus(+W, +StateNo, +ECPT)
%
% Writes consensus(StateNo, Index) to stream W, where Index is the
% 1-based position of the largest value in ECPT, or -1 when no value
% is greater than 0.
find_consensus(W, StateNo, Probs) :-
	max_index(Probs, 1, 0, -1, _Largest, Position),
	format(W, 'consensus(~d,~d).~n',[StateNo,Position]).

% max_index(+List, +I0, +Max0, +MaxIndex0, -Max, -MaxIndex)
%
% Scans List, whose first element has index I0, keeping the largest
% value seen so far (initially Max0) and its index (initially
% MaxIndex0). An element replaces the current maximum only when it is
% strictly greater, so on ties the earliest index wins.
max_index([], _, Max, MaxIndex, Max, MaxIndex).
max_index([H|T], I0, Max0, MaxIndex0, Max, MaxIndex) :-
	(   H > Max0
	->  Best = H,    BestIndex = I0
	;   Best = Max0, BestIndex = MaxIndex0
	),
	I1 is I0+1,
	max_index(T, I1, Best, BestIndex, Max, MaxIndex).