% Viterbi: Matlab implementation of the Viterbi % algorithm for use in finding the best % fit of a protein sequence to an HMM. % % Scott F. Smith % Department of Electrical and Computer Engineering % Boise State University % SFSmith@BoiseState.edu % % 12 March 2004 % Filename to read (fname.txt) % Should contain a single sequence in FASTA format fname = 'test'; % Specify HMM to fit %NullModel; GoodModel % Calculate natural logs of HMM probabilities DDln = log(DDtrans); MMln = log(MMtrans); IIln = log(IItrans); DMln = log(DMtrans); MDln = log(MDtrans); IDln = log(IDtrans); DIln = log(DItrans); IMln = log(IMtrans); MIln = log(MItrans); Acidln = log(AcidProbs); % List of allowed protein characters ProtChars = 'ACDEFGHIKLMNPQRSTVWY'; % Read FASTA format input file PString = []; fid = fopen([fname '.txt']); % Throw away first (comment) line FileLine = fgetl(fid); % Read sequence data and make sure it is upper case while 1 FileLine = fgetl(fid); if ~ischar(FileLine) break end PString = [PString upper(FileLine)]; end fclose(fid); % Convert protein sequence ASCII characters to amino acid index numbers PIndex = zeros(size(PString)); for i = 1:length(PString) if length(find(ProtChars == PString(i))) > 0 PIndex(i) = find(ProtChars == PString(i)); end end % Do not allow outputs longer than the input sting length plus model length MaxT = length(PString) + N + 1; % Initialize accumulated score (delta), low-cost predecessor (psi), and % current input character index (char) % Psi codes: D(n): -1 if M(n-1), 0 if I(n-1), 1 if D(n-1) % I(n): -1 if M(n), 0 if I(n), 1 if D(n) % M(n): -1 if M(n-1), 0 if I(n-1), 1 if D(n-1) deltaD = NaN * ones(MaxT,N); deltaI = NaN * ones(MaxT,N+1); deltaM = NaN * ones(MaxT,N+1); % Excluding BEG, but including END=M(N+1) psiD = NaN * ones(MaxT,N); psiI = NaN * ones(MaxT,N+1); psiM = NaN * ones(MaxT,N+1); % Excluding BEG, but including END=M(N+1) charD = NaN * ones(MaxT,N); charI = NaN * ones(MaxT,N+1); charM = NaN * ones(MaxT,N+1); % Excluding BEG, but including END=M(N+1) %%% Calculate deltas, psis, and chars % Special case for t=1 deltaD(1,1) = -MDln(1); deltaI(1,1) = -MIln(1); deltaM(1,1) = -MMln(1) - Acidln(1,PIndex(1)); psiD(1,1) = -1; psiI(1,1) = -1; psiM(1,1) = -1; charD(1,1) = 0; charI(1,1) = 1; charM(1,1) = 1; for t = 2:MaxT % Special case for i=1 -- only possible path is from I1 if t < length(PIndex)+2 deltaD(t,1) = deltaI(t-1,1) - IDln(1); psiD(t,1) = 0; charD(t,1) = t-1; end if t < length(PIndex)+1 deltaI(t,1) = deltaI(t-1,1) - IIln(1); deltaM(t,1) = deltaI(t-1,1) - IMln(1) - Acidln(1,PIndex(t)); psiI(t,1) = 0; psiM(t,1) = 0; charI(t,1) = t; charM(t,1) = t; end for i = 2:N Dlist = [deltaM(t-1,i-1)-MDln(i) deltaI(t-1,i)-IDln(i) deltaD(t-1,i-1)-DDln(i-1)]; Ilist = [deltaM(t-1,i-1)-MIln(i) deltaI(t-1,i)-IIln(i) deltaD(t-1,i-1)-DIln(i-1)]; Mlist = [deltaM(t-1,i-1)-MMln(i) deltaI(t-1,i)-IMln(i) deltaD(t-1,i-1)-DMln(i-1)]; if charM(t-1,i-1) == length(PIndex) Ilist(1) = NaN; Mlist(1) = NaN; end if charI(t-1,i) == length(PIndex) Ilist(2) = NaN; Mlist(2) = NaN; end if charD(t-1,i-1) == length(PIndex) Ilist(3) = NaN; Mlist(3) = NaN; end deltaD(t,i) = min(Dlist); deltaI(t,i) = min(Ilist); deltaM(t,i) = min(Mlist); if length(find(Dlist == min(Dlist))) > 0 temp = find(Dlist == min(Dlist)) - 2; psiD(t,i) = temp(1); if temp(1) == 1 charD(t,i) = charD(t-1,i-1); end if temp(1) == 0 charD(t,i) = charI(t-1,i); end if temp(1) == -1 charD(t,i) = charM(t-1,i-1); end end if length(find(Ilist == min(Ilist))) > 0 temp = find(Ilist == min(Ilist)) - 2; psiI(t,i) = temp(1); if temp(1) == 1 charI(t,i) = charD(t-1,i-1) + 1; end if temp(1) == 0 charI(t,i) = charI(t-1,i) + 1; end if temp(1) == -1 charI(t,i) = charM(t-1,i-1) + 1; end end if length(find(Mlist == min(Mlist))) > 0 temp = find(Mlist == min(Mlist)) - 2; psiM(t,i) = temp(1); if temp(1) == 1 charM(t,i) = charD(t-1,i-1) + 1; end if temp(1) == 0 charM(t,i) = charI(t-1,i) + 1; end if temp(1) == -1 charM(t,i) = charM(t-1,i-1) + 1; end deltaM(t,i) = deltaM(t,i) - Acidln(i,PIndex(charM(t,i))); end end % Special case for i=N+1 i = N+1; Ilist = [deltaM(t-1,i-1)-MIln(i) deltaI(t-1,i)-IIln(i) deltaD(t-1,i-1)-DIln(i-1)]; Mlist = [deltaM(t-1,i-1)-MMln(i) deltaI(t-1,i)-IMln(i) deltaD(t-1,i-1)-DMln(i-1)]; if charM(t-1,i-1) == length(PIndex) Ilist(1) = NaN; end if charM(t-1,i-1) ~= length(PIndex) Mlist(1) = NaN; end if charI(t-1,i) == length(PIndex) Ilist(2) = NaN; end if charI(t-1,i) ~= length(PIndex) Mlist(2) = NaN; end if charD(t-1,i-1) == length(PIndex) Ilist(3) = NaN; end if charD(t-1,i-1) ~= length(PIndex) Mlist(3) = NaN; end deltaI(t,i) = min(Ilist); deltaM(t,i) = min(Mlist); if length(find(Ilist == min(Ilist))) > 0 temp = find(Ilist == min(Ilist)) - 2; psiI(t,i) = temp(1); if temp(1) == 1 charI(t,i) = charD(t-1,i-1) + 1; end if temp(1) == 0 charI(t,i) = charI(t-1,i) + 1; end if temp(1) == -1 charI(t,i) = charM(t-1,i-1) + 1; end end if length(find(Mlist == min(Mlist))) > 0 temp = find(Mlist == min(Mlist)) - 2; psiM(t,i) = temp(1); if temp(1) == 1 charM(t,i) = charD(t-1,i-1) + 1; end if temp(1) == 0 charM(t,i) = charI(t-1,i) + 1; end if temp(1) == -1 charM(t,i) = charM(t-1,i-1) + 1; end deltaM(t,i) = deltaM(t,i); end end % Find the output string length which minimizes cost function. % This will be the minimum delta in the END-state over time % such that there are no left over characters. temp = find(charM(:,N+1) > length(PString)); Tout = temp(1); % Backtrace from END-state at Tout to BEG-state at t=0 % state_type: -1 for M, 0 for I, 1 for D OutString = []; ModelString = []; state_type = psiM(Tout,N+1); % Type of predecessor to END i = N; charLoc = length(PString); for t = Tout-1:-1:1 if state_type == -1 state_type = psiM(t,i); bestChar = ProtChars(find(Acidln(i,:) == max(Acidln(i,:)))); i = i - 1; OutString = [PString(charLoc) OutString]; charLoc = charLoc - 1; ModelString = [bestChar ModelString]; elseif state_type == 0 state_type = psiI(t,i+1); OutString = [PString(charLoc) OutString]; charLoc = charLoc - 1; ModelString = ['-' ModelString]; else state_type = psiD(t,i); skipChar = ProtChars(find(Acidln(i,:) == max(Acidln(i,:)))); i = i - 1; OutString = ['-' OutString]; ModelString = [skipChar ModelString]; end end OutString ModelString % Output aligned sequence fidOut = fopen([fname '_aligned.txt'],'w'); StartLoc = 0; while StartLoc < length(OutString) EndLoc = min([StartLoc+50 length(OutString)]); fprintf(fid,'%.0f to %.0f\r\n',StartLoc+1,EndLoc); fprintf(fidOut,'Target: %s\r\n',OutString(StartLoc+1:EndLoc)); fprintf(fidOut,'Model: %s\r\n\r\n',ModelString(StartLoc+1:EndLoc)); StartLoc = StartLoc + 50; end fclose(fidOut);