% Fuzzy: Use fuzzy logic to determine if a protein % sequence belongs to a protein family. The % fuzzy-based score significance is given along % with equally-weighted average and flat weighting % for comparison. % % Scott F. Smith % Department of Electrical and Computer Engineering % Boise State University % SFSmith@BoiseState.edu % % 10 April 2004 % % The input file should contain one row for each protein domain % sequence within the family. Columns 1-23 contain the protein % name (this information is discarded). The one-letter amino acid % codes start in column 24 (a '.' is used for gaps). The % known-structure proteins should be placed at the top of the input % file and the 'NumbStruct' variable below set to the number of % sequences with surface/interior coding (upper/lower case). The % sequence number to be removed for testing should be specified by % the 'TestSeqNumber' variable below. % % The program result is four numbers: % First number - The test sequence number % Second number - The significance of the score using fuzzy % Third number - The significance of the score using linear weights % Fourth number - The significance of the score using no weights % Specify number of sequences with know structure NumbStruct = 36; % Specify test sequence TestSeqNumber = 58; % Read in sequences Sequences = []; fileIn = fopen('Ank.txt'); while 1 FileLine = fgetl(fileIn); if ~ischar(FileLine) break end Sequences = [Sequences; FileLine(24:length(FileLine))]; end fclose(fileIn); CoreSeq = (Sequences == lower(Sequences)) & (Sequences ~= '.'); Sequences = upper(Sequences); % Remove the test sequence from the input data TestSeq = Sequences(TestSeqNumber,:); Sequences(TestSeqNumber,:) = []; % Determine number of sequences and number of positions [NumbSeq, NumbPos] = size(Sequences); % Initialize fractions matrix Fractions = NaN * ones(4,NumbPos); % Calculate fraction of Glycine (G) at each position for i = 1:NumbPos Fractions(1,i) = length(find(Sequences(:,i) == 'G'))/NumbSeq; end % Calculate fraction of Hydrophobic (A, V, L, I, M, F, or P) at each position for i = 1:NumbPos Temp = length(find(Sequences(:,i) == 'A')); Temp = Temp + length(find(Sequences(:,i) == 'V')); Temp = Temp + length(find(Sequences(:,i) == 'L')); Temp = Temp + length(find(Sequences(:,i) == 'I')); Temp = Temp + length(find(Sequences(:,i) == 'M')); Temp = Temp + length(find(Sequences(:,i) == 'F')); Temp = Temp + length(find(Sequences(:,i) == 'P')); Fractions(2,i) = Temp/NumbSeq; end % Calculate fraction of Polar (S, T, Y, H, C, N, Q, W) at each position for i = 1:NumbPos Temp = length(find(Sequences(:,i) == 'S')); Temp = Temp + length(find(Sequences(:,i) == 'T')); Temp = Temp + length(find(Sequences(:,i) == 'Y')); Temp = Temp + length(find(Sequences(:,i) == 'H')); Temp = Temp + length(find(Sequences(:,i) == 'C')); Temp = Temp + length(find(Sequences(:,i) == 'N')); Temp = Temp + length(find(Sequences(:,i) == 'Q')); Temp = Temp + length(find(Sequences(:,i) == 'W')); Fractions(3,i) = Temp/NumbSeq; end % Calculate fraction of Charged (D, E, R, K) at each position for i = 1:NumbPos Temp = length(find(Sequences(:,i) == 'D')); Temp = Temp + length(find(Sequences(:,i) == 'E')); Temp = Temp + length(find(Sequences(:,i) == 'R')); Temp = Temp + length(find(Sequences(:,i) == 'K')); Fractions(4,i) = Temp/NumbSeq; end % Find maximum fractions at each position = mQ(x) mQ = max(Fractions); mQ = mQ / mean(mQ); mVQ = mQ .* mQ; mVQ = mVQ / mean(mVQ); % Calculate fraction of structural core at each position = mT(x) mT = NaN * ones(1,NumbPos); for i = 1:NumbPos mT(i) = length(find(CoreSeq(1:NumbStruct,i) == 1))/NumbStruct; end mT = mT / mean(mT); mVT = mT .* mT; mVT = mVT / mean(mVT); % Calculate combined conservation membership function = mC(x) mC = min([mQ; mT]); mC = max([mC; mVQ; mVT]); mC = mC / mean(mC); % Calculate combined conservation using linear average aC = (mQ + mT)/2; % Calculate weighted PSSM PSSM = NaN * ones(21,NumbPos); for i = 1:NumbPos PSSM(1,i) = (1 + length(find(Sequences(:,i) == 'G')))/NumbSeq; PSSM(2,i) = (1 + length(find(Sequences(:,i) == 'A')))/NumbSeq; PSSM(3,i) = (1 + length(find(Sequences(:,i) == 'V')))/NumbSeq; PSSM(4,i) = (1 + length(find(Sequences(:,i) == 'L')))/NumbSeq; PSSM(5,i) = (1 + length(find(Sequences(:,i) == 'I')))/NumbSeq; PSSM(6,i) = (1 + length(find(Sequences(:,i) == 'M')))/NumbSeq; PSSM(7,i) = (1 + length(find(Sequences(:,i) == 'F')))/NumbSeq; PSSM(8,i) = (1 + length(find(Sequences(:,i) == 'P')))/NumbSeq; PSSM(9,i) = (1 + length(find(Sequences(:,i) == 'S')))/NumbSeq; PSSM(10,i) = (1 + length(find(Sequences(:,i) == 'T')))/NumbSeq; PSSM(11,i) = (1 + length(find(Sequences(:,i) == 'Y')))/NumbSeq; PSSM(12,i) = (1 + length(find(Sequences(:,i) == 'H')))/NumbSeq; PSSM(13,i) = (1 + length(find(Sequences(:,i) == 'C')))/NumbSeq; PSSM(14,i) = (1 + length(find(Sequences(:,i) == 'N')))/NumbSeq; PSSM(15,i) = (1 + length(find(Sequences(:,i) == 'Q')))/NumbSeq; PSSM(16,i) = (1 + length(find(Sequences(:,i) == 'W')))/NumbSeq; PSSM(17,i) = (1 + length(find(Sequences(:,i) == 'D')))/NumbSeq; PSSM(18,i) = (1 + length(find(Sequences(:,i) == 'E')))/NumbSeq; PSSM(19,i) = (1 + length(find(Sequences(:,i) == 'R')))/NumbSeq; PSSM(20,i) = (1 + length(find(Sequences(:,i) == 'K')))/NumbSeq; PSSM(21,i) = (1 + length(find(Sequences(:,i) == '.')))/NumbSeq; PSSM(:,i) = log2(PSSM(:,i)); PSSMFlat(:,i) = PSSM(:,i); PSSMAvg(:,i) = PSSM(:,i) * aC(i); PSSM(:,i) = PSSM(:,i) * mC(i); end % ******* Fuzzy ******************************** % Calculate score of test sequence ProtChars = 'GAVLIMFPSTYHCNQWDERK.'; TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i)); if length(CharNumb) == 1 TestScore = TestScore + PSSM(CharNumb,i); end end % Calculate score of randomized test sequence SumRandScore = 0; for j = 1:1000 RandScore = 0; for i = 1:NumbPos RandChar = TestSeq(ceil(NumbPos*rand(1))); CharNumb = find(ProtChars == RandChar); if length(CharNumb) == 1 RandScore = RandScore + PSSM(CharNumb,i); end end SumRandScore = SumRandScore + RandScore; end RandScore = SumRandScore / 1000; % Calculate significance of result Significance = TestScore - RandScore; % ******* Linear ********************************* % Calculate score of test sequence using average conservation TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i)); if length(CharNumb) == 1 TestScore = TestScore + PSSMAvg(CharNumb,i); end end % Calculate average-conservation score of randomized test sequence SumRandScore = 0; for j = 1:1000 RandScore = 0; for i = 1:NumbPos RandChar = TestSeq(ceil(NumbPos*rand(1))); CharNumb = find(ProtChars == RandChar); if length(CharNumb) == 1 RandScore = RandScore + PSSMAvg(CharNumb,i); end end SumRandScore = SumRandScore + RandScore; end RandScore = SumRandScore / 1000; % Calculate significance of sequence-only result AvgSig = TestScore - RandScore; % ******* Flat (unweighted) ************************* % Calculate score of test sequence using unweighted PSSM TestScore = 0; for i = 1:NumbPos CharNumb = find(ProtChars == TestSeq(i)); if length(CharNumb) == 1 TestScore = TestScore + PSSMFlat(CharNumb,i); end end % Calculate average-conservation score of randomized test sequence SumRandScore = 0; for j = 1:1000 RandScore = 0; for i = 1:NumbPos RandChar = TestSeq(ceil(NumbPos*rand(1))); CharNumb = find(ProtChars == RandChar); if length(CharNumb) == 1 RandScore = RandScore + PSSMFlat(CharNumb,i); end end SumRandScore = SumRandScore + RandScore; end RandScore = SumRandScore / 1000; % Calculate significance of sequence-only result FlatSig = TestScore - RandScore; % Show results [TestSeqNumber Significance AvgSig FlatSig]