% FuzzyOther: Test fuzzy, linear, and unweighted % conservation estimates against proteins % not in the TPR family. % % Scott F. Smith % Department of Electrical and Computer Engineering % Boise State University % SFSmith@BoiseState.edu % % 10 April 2004 % Specify number of sequences with know structure NumbStruct = 12; % Read in sequences Sequences = []; fileIn = fopen('TPR.txt'); while 1 FileLine = fgetl(fileIn); if ~ischar(FileLine) break end Sequences = [Sequences; FileLine(24:length(FileLine))]; end fclose(fileIn); CoreSeq = (Sequences == lower(Sequences)) & (Sequences ~= '.'); Sequences = upper(Sequences); % Read test sequences fileIn = fopen('Other.txt'); TestSeq = []; while 1 FileLine = fgetl(fileIn); if ~ischar(FileLine) break end TestSeq = [TestSeq FileLine]; end fclose(fileIn); % Determine number of sequences and number of positions [NumbSeq, NumbPos] = size(Sequences); % Initialize fractions matrix Fractions = NaN * ones(4,NumbPos); % Calculate fraction of Glycine (G) at each position for i = 1:NumbPos Fractions(1,i) = length(find(Sequences(:,i) == 'G'))/NumbSeq; end % Calculate fraction of Hydrophobic (A, V, L, I, M, F, or P) at each position for i = 1:NumbPos Temp = length(find(Sequences(:,i) == 'A')); Temp = Temp + length(find(Sequences(:,i) == 'V')); Temp = Temp + length(find(Sequences(:,i) == 'L')); Temp = Temp + length(find(Sequences(:,i) == 'I')); Temp = Temp + length(find(Sequences(:,i) == 'M')); Temp = Temp + length(find(Sequences(:,i) == 'F')); Temp = Temp + length(find(Sequences(:,i) == 'P')); Fractions(2,i) = Temp/NumbSeq; end % Calculate fraction of Polar (S, T, Y, H, C, N, Q, W) at each position for i = 1:NumbPos Temp = length(find(Sequences(:,i) == 'S')); Temp = Temp + length(find(Sequences(:,i) == 'T')); Temp = Temp + length(find(Sequences(:,i) == 'Y')); Temp = Temp + length(find(Sequences(:,i) == 'H')); Temp = Temp + length(find(Sequences(:,i) == 'C')); Temp = Temp + length(find(Sequences(:,i) == 'N')); Temp = Temp + length(find(Sequences(:,i) == 'Q')); Temp = Temp + length(find(Sequences(:,i) == 'W')); Fractions(3,i) = Temp/NumbSeq; end % Calculate fraction of Charged (D, E, R, K) at each position for i = 1:NumbPos Temp = length(find(Sequences(:,i) == 'D')); Temp = Temp + length(find(Sequences(:,i) == 'E')); Temp = Temp + length(find(Sequences(:,i) == 'R')); Temp = Temp + length(find(Sequences(:,i) == 'K')); Fractions(4,i) = Temp/NumbSeq; end % Find maximum fractions at each position = mQ(x) mQ = max(Fractions); mQ = mQ / mean(mQ); mVQ = mQ .* mQ; mVQ = mVQ / mean(mVQ); % Calculate fraction of structural core at each position = mT(x) mT = NaN * ones(1,NumbPos); for i = 1:NumbPos mT(i) = length(find(CoreSeq(1:NumbStruct,i) == 1))/NumbStruct; end mT = mT / mean(mT); mVT = mT .* mT; mVT = mVT / mean(mVT); % Calculate combined conservation membership function = mC(x) mC = min([mQ; mT]); mC = max([mC; mVQ; mVT]); mC = mC / mean(mC); % Calculate combined conservation using linear average aC = (mQ + mT)/2; % Calculate weighted PSSM PSSM = NaN * ones(21,NumbPos); for i = 1:NumbPos PSSM(1,i) = (1 + length(find(Sequences(:,i) == 'G')))/NumbSeq; PSSM(2,i) = (1 + length(find(Sequences(:,i) == 'A')))/NumbSeq; PSSM(3,i) = (1 + length(find(Sequences(:,i) == 'V')))/NumbSeq; PSSM(4,i) = (1 + length(find(Sequences(:,i) == 'L')))/NumbSeq; PSSM(5,i) = (1 + length(find(Sequences(:,i) == 'I')))/NumbSeq; PSSM(6,i) = (1 + length(find(Sequences(:,i) == 'M')))/NumbSeq; PSSM(7,i) = (1 + length(find(Sequences(:,i) == 'F')))/NumbSeq; PSSM(8,i) = (1 + length(find(Sequences(:,i) == 'P')))/NumbSeq; PSSM(9,i) = (1 + length(find(Sequences(:,i) == 'S')))/NumbSeq; PSSM(10,i) = (1 + length(find(Sequences(:,i) == 'T')))/NumbSeq; PSSM(11,i) = (1 + length(find(Sequences(:,i) == 'Y')))/NumbSeq; PSSM(12,i) = (1 + length(find(Sequences(:,i) == 'H')))/NumbSeq; PSSM(13,i) = (1 + length(find(Sequences(:,i) == 'C')))/NumbSeq; PSSM(14,i) = (1 + length(find(Sequences(:,i) == 'N')))/NumbSeq; PSSM(15,i) = (1 + length(find(Sequences(:,i) == 'Q')))/NumbSeq; PSSM(16,i) = (1 + length(find(Sequences(:,i) == 'W')))/NumbSeq; PSSM(17,i) = (1 + length(find(Sequences(:,i) == 'D')))/NumbSeq; PSSM(18,i) = (1 + length(find(Sequences(:,i) == 'E')))/NumbSeq; PSSM(19,i) = (1 + length(find(Sequences(:,i) == 'R')))/NumbSeq; PSSM(20,i) = (1 + length(find(Sequences(:,i) == 'K')))/NumbSeq; PSSM(21,i) = (1 + length(find(Sequences(:,i) == '.')))/NumbSeq; PSSM(:,i) = log2(PSSM(:,i)); PSSMFlat(:,i) = PSSM(:,i); PSSMAvg(:,i) = PSSM(:,i) * aC(i); PSSM(:,i) = PSSM(:,i) * mC(i); end % ******* Fuzzy ******************************** ProtChars = 'GAVLIMFPSTYHCNQWDERK.'; % Calculate maximum score of test sequence TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i)); if length(CharNumb) == 1 TestScore = TestScore + PSSM(CharNumb,i); end end MaxScore = TestScore; Maxj = 0; for j = 1:length(TestSeq)-34 TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i + j)); if length(CharNumb) == 1 TestScore = TestScore + PSSM(CharNumb,i); end end if TestScore > MaxScore MaxScore = TestScore; Maxj = j; end end % Calculate score of randomized test sequence SumRandScore = 0; for j = 1:1000 RandScore = 0; for i = 1:NumbPos RandChar = TestSeq(Maxj + ceil(NumbPos*rand(1))); CharNumb = find(ProtChars == RandChar); if length(CharNumb) == 1 RandScore = RandScore + PSSM(CharNumb,i); end end SumRandScore = SumRandScore + RandScore; end RandScore = SumRandScore / 1000; % Calculate significance of result Significance = MaxScore - RandScore; % ******* Linear ********************************* % Calculate score of test sequence using average conservation TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i)); if length(CharNumb) == 1 TestScore = TestScore + PSSMAvg(CharNumb,i); end end MaxScore = TestScore; Maxj = 0; for j = 1:length(TestSeq)-34 TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i + j)); if length(CharNumb) == 1 TestScore = TestScore + PSSMAvg(CharNumb,i); end end if TestScore > MaxScore MaxScore = TestScore; Maxj = j; end end % Calculate average-conservation score of randomized test sequence SumRandScore = 0; for j = 1:1000 RandScore = 0; for i = 1:NumbPos RandChar = TestSeq(ceil(NumbPos*rand(1))); CharNumb = find(ProtChars == RandChar); if length(CharNumb) == 1 RandScore = RandScore + PSSMAvg(CharNumb,i); end end SumRandScore = SumRandScore + RandScore; end RandScore = SumRandScore / 1000; % Calculate significance of sequence-only result AvgSig = MaxScore - RandScore; % ******* Flat (unweighted) ************************* % Calculate score of test sequence using unweighted PSSM TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i)); if length(CharNumb) == 1 TestScore = TestScore + PSSMFlat(CharNumb,i); end end MaxScore = TestScore; Maxj = 0; for j = 1:length(TestSeq)-34 TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i + j)); if length(CharNumb) == 1 TestScore = TestScore + PSSMFlat(CharNumb,i); end end if TestScore > MaxScore MaxScore = TestScore; Maxj = j; end end % Calculate average-conservation score of randomized test sequence SumRandScore = 0; for j = 1:1000 RandScore = 0; for i = 1:NumbPos RandChar = TestSeq(ceil(NumbPos*rand(1))); CharNumb = find(ProtChars == RandChar); if length(CharNumb) == 1 RandScore = RandScore + PSSMFlat(CharNumb,i); end end SumRandScore = SumRandScore + RandScore; end RandScore = SumRandScore / 1000; % Calculate significance of sequence-only result FlatSig = MaxScore - RandScore; % Show results [Significance AvgSig FlatSig] / log10(length(TestSeq))