/* A program for taking a (possibly arbitrary) alignment

advertisement
Supplementary Notes
/* A program for taking a (possibly arbitrary) alignment score matrix
* and back-calculating the implied target frequencies p_ab.
*
* Doing this requires solving for a nonzero lambda in:
*
\sum_ab f_a f_b e^{\lambda s_ab} = 1
* and this is a good excuse to demo two methods of root-finding:
* bisection search and the Newton/Raphson method.
*
* The program is ANSI C, and should compile on any machine with a
* C compiler:
*
% cc -o lambda lambda.c -lm
*
* You need two datafiles. The <backgroundfile> lists the background
* residue frequencies; it is also used to determine what alphabet
* you're using (DNA, 4 letters; or amino acids, 20 letters). The
* <matrixfile> is the score matrix. The format of this is the same as
* any BLAST scoring matrix. (Any letters that aren't in your
* backgroundfile are ignored; this lets you read matrices that
* include scores for IUPAC ambiguity codes.)
*
* Example formats:
*
a <backgroundfile>:
*
A 0.25
*
C 0.25
*
G 0.25
*
T 0.25
*
*
a <matrixfile>:
*
A
C
G
T
*
A +1 -2 -2 -2
*
C -2 +1 -2 -2
*
G -2 -2 +1 -2
*
T -2 -2 -2 +1
*
* See the bottom of this file for amino acid examples you can
* cut out and save to files.
*
* To run the program:
*
% ./lambda <backgroundfile> <matrixfile>
*
* The program prints the implied target frequencies for the matrix,
* and some other statistics.
*
*-------------------------------------------------------------------*
* Organization of the code:
*
* section 1: main(), the main body of the program.
*
* section 2: The root-finding methods, bisection() and
*
newtonraphson(). Look here for an example of
*
how these methods work.
*
* section 3: output routines. Boring. But if you want to
*
hack something else into the output, either hack
*
main(), or hack one of the routines here.
*
* section 4: a couple of allocation routines.
*
* section 5: file format parsers. Equally boring. But if you
*
are having trouble with file formats, look here
*
for more format documentation.
*
* section 6: example file formats. Copies of the BLOSUM62 score
*
matrix, and the background frequencies used in
*
constructing BLOSUM62.
*
*-------------------------------------------------------------------*
* SRE, Sat Jun 26 09:47:59 2004
* This code is explicitly assigned to the public domain. You may
* reuse it as you wish.
* CVS $Id: lambda.c,v 1.7 2004/07/01 13:44:33 eddy Exp $
*/
#include
#include
#include
#include
#include
<stdlib.h>
<stdio.h>
<ctype.h>
<string.h>
<math.h>
#define MAXABET
#define TOLERANCE
20
1e-6
/* sets maximum alphabet size
*/
/* how close to 0 root-finders must get */
/* Function declarations (in order of appearance)
*/
static float val(float **s, float *f, int K, float lambda);
static float deriv(float **s, float *f, int K, float lambda);
static float bisection(float **s, float *f, int K);
static float newtonraphson(float **s, float *f, int K);
static void output_alphalabeled_matrix(float **mx, int K, char *alphabet);
static void output_alphalabeled_vector(float *f, int K, char *alphabet);
static float **FMX2Alloc(int rows, int cols);
static void
FMX2Free(float **mx);
static int ParseBackgroundFile(FILE *bfp, char *alphabet, int *ret_K, float
*fq);
static int ParseScoreMatrixFile(FILE *mfp, char *alphabet, int K, float **mx);
int
main(int argc, char **argv)
{
char *backgroundfile;
char *matrixfile;
FILE *fp;
char
alphabet[MAXABET+1]; /* alphabet order (e.g. ACGT)
int
K;
/* size of alphabet (4 or 20)
*/
int
a, b;
/* indices of two residues
float lambda;
/* the lambda we've solved for
*/
float **s;
/* score matrix s_ab
*/
*/
*/
float *f;
float **pab;
float
x;
float *vec;
float **mx;
/* background frequencies f_a, f_b */
/* solved target frequencies p_ab */
/* generic single number solution */
/* generic vector solution
*/
/* generic matrix solution
*/
/* Parse the command line.
*/
if (argc != 3) {
fprintf(stderr, "Usage: ./lambda <backgroundfile> <matrixfile>\n");
exit (1);
}
backgroundfile = argv[1];
matrixfile
= argv[2];
/* Allocations.
*/
if ((f
= malloc(sizeof(float) * MAXABET)) == NULL)
{ fprintf(stderr, "malloc failed\n"); exit(1); }
if ((s
= FMX2Alloc(MAXABET, MAXABET)) == NULL)
{ fprintf(stderr, "malloc failed\n"); exit(1); }
if ((pab = FMX2Alloc(MAXABET, MAXABET)) == NULL)
{ fprintf(stderr, "malloc failed\n"); exit(1); }
if ((vec = malloc(sizeof(float) * MAXABET)) == NULL)
{ fprintf(stderr, "malloc failed\n"); exit(1); }
if ((mx = FMX2Alloc(MAXABET, MAXABET)) == NULL)
{ fprintf(stderr, "malloc failed\n"); exit(1); }
/* Read in the background frequencies.
* This also sets K (alphabet size; 4 or 20) and
* the order of the alphabet (maybe ACGT or ACDEFGHIKLMNPQRSTVWY).
*/
if ((fp = fopen(backgroundfile, "r")) == NULL)
{ fprintf(stderr, "Failed to open background file %s for reading\n",
backgroundfile); exit(1); }
if (! ParseBackgroundFile(fp, alphabet, &K, f))
{ fprintf(stderr, "Failed to parse background file %s\n", backgroundfile);
exit(1); }
fclose(fp);
printf("Your input background frequencies:\n");
output_alphalabeled_vector(f, K, alphabet);
printf("\n");
/* Read in the scoring matrix, which must be KxK.
* Print it out (so user can make sure it's what they thought.)
*/
if ((fp = fopen(matrixfile, "r")) == NULL)
{ fprintf(stderr, "Failed to open matrix file %s for reading\n",
matrixfile); exit(1);}
if (! ParseScoreMatrixFile(fp, alphabet, K, s))
{ fprintf(stderr, "Failed to parse matrix file %s\n", matrixfile); exit(1);
}
fclose(fp);
printf("Your input scoring matrix:\n");
output_alphalabeled_matrix(s, K, alphabet);
printf("\n");
/* Verify the two necessary conditions for the matrix:
* 1. There must be at least one positive score.
* 2. The expected score must be negative.
*/
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
if (s[a][b] > 0.) goto HAVE_POSITIVE;
fprintf(stderr, "Score matrix has no positive score: can't solve lambda\n");
exit(1);
HAVE_POSITIVE: /* one of few excuses for a goto: break out of nested loop. */
x = 0.;
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
x += f[a] * f[b] * s[a][b];
if (x >= 0.) {
fprintf(stderr, "Score matrix has nonnegative expectation: can't solve
lambda\n");
exit(1);
}
/* Find lambda by Newton/Raphson algorithm.
* Replace w/ bisection(s,f,K) if you want to play with the other
* rootfinding method.
*/
lambda = newtonraphson(s, f, K);
printf("Solve for lambda (Newton/Raphson method):\n");
printf("lambda
= %f\n", lambda);
printf("f(lambda) = %f ~= 0\n", val(s, f, K, lambda));
printf("\n");
/* Use lambda to backcalculate target frequencies p_ab:
*
p_ab = f_a f_b e^{lambda s_ab}
*/
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
pab[a][b] = f[a]*f[b]* exp(lambda * s[a][b]);
printf("The implied target probabilities p_ab:\n");
output_alphalabeled_matrix(pab, K, alphabet);
printf("\n");
/* The marginal probabilities of residues p_a in
* the query sequence: not necessarily the same as
* the background frequencies.
*
p_a = \sum_b p_ab
*/
for (a = 0; a < K; a++) {
vec[a] = 0.;
for (b = 0; b < K; b++)
vec[a] += pab[a][b];
}
printf("The implied marginal query composition p_a:\n");
output_alphalabeled_vector(vec, K, alphabet);
printf("\n");
/* The conditional probabilities p(b | a) for
* probability of target residue b given query residue a:
*
p(b | a) = p_ab / p_a
*/
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
mx[a][b] = pab[a][b] / vec[a];
printf("Implied conditional probabilities p(b|a):\n");
output_alphalabeled_matrix(mx, K, alphabet);
printf("\n");
/* Find the expected score of the score matrix:
*
\sum_ab f_a f_b s_ab
* Report in bits: so, x lambda / log 2.
* (We already made sure this was negative, but now that we
* know lambda, we can report it in units of bits.)
*
* Note: for BLOSUM62, result is -.46 bits; but header of BLOSUM62
* file says expect = -.52. Discrepancy arises from a roundoff
* error. matblas (the program that constructed BLOSUM62) calculated
* the expected score using an unrounded version of s_ab, but later
* scaled and rounded to integers to get the BLOSUM62 we know and
* love. Same applies for discrepancy in lambda: should be .3466
* (log 2/2), but the actual integer BLOSUM62 gives .3208. The
* original unrounded blosum62.sij can be found on the Net.
*/
x = 0.;
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
x += f[a] * f[b] * s[a][b];
x *= lambda / log(2.);
printf("Expected score = %.4f bits\n", x);
/* Find the relative entropy (average score in true alignments)
*
H = \sum_ab p_ab s_ab
* Report in bits.
*/
x = 0.;
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
x += pab[a][b] * s[a][b];
x *= lambda / log(2.);
/* convert to bits */
printf("Relative entropy = %.4f bits\n", x);
/* Find the expected % sequence identity in homologous
* alignments:
*/
x = 0.;
for (a = 0; a < K; a++)
x += pab[a][a];
printf("Target %% identity = %.1f\n", x*100.);
FMX2Free(s);
FMX2Free(pab);
FMX2Free(mx);
free(f);
free(vec);
exit(0);
}
/* ================================================================
* Section 2: Solving for lambda with rootfinding methods.
* ================================================================
*
* We're looking for the nonzero \lambda that satisfies:
*
\sum_ab f_a f_b e^{\lambda s_ab} - 1 = 0
*
* And if s_ab has at least one positive score, and has a negative
* expectation, there is guaranteed to be one and only one nonzero
* solution.
*
* This is a rootfinding problem (find x that satisfies f(x) = 0).
* It sets up a nice, simple example for seeing how the bisection and
* Newton/Raphson methods work.
*
* val() calculates f(lambda) for a given lambda. This is the
*
function that we're trying to find a root for.
*
* deriv() is the first derivative f'(lambda) at a given lambda.
*
We need first derivative info for the Newton/Raphson
*
method.
*
* bisection() implements the bisection search, and
* newtonraphson() implements Newton/Raphson.
*
*
* A good general reference on numerical root finding methods is
* _Numerical Recipes in C_, WH Press et al., Cambridge Univ Press 1993.
*/
/* Calculate and return f(lambda): \sum_ab f_a f_b e^{lambda s_ab} - 1
*/
static float
val(float **s, float *f, int K, float lambda)
{
int
a,b;
float total = -1.;
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
total += f[a] * f[b] * exp(lambda * s[a][b]);
return total;
}
/* First derivative is:
*/
\sum_ab f_a f_b s_ab e^{lambda s_ab}
static float
deriv(float **s, float *f, int K, float lambda)
{
int
a,b;
float deriv = 0.;
for (a = 0; a < K; a++)
for (b = 0; b < K; b++)
deriv += f[a] * f[b] * s[a][b] * exp(lambda * s[a][b]);
return deriv;
}
/* The bisection method.
*
* The key idea of bisection is this: you bracket the root, finding an
* x1 for which f(x1) < 0, and an x2 for which f(x2) > 0 -- so the
* solution where f(x)=0 must lie in the interval (x1,x2). (Here, we
* also happen to know that x1 < x2; f(x) is an increasing function
* around the root we care about.) Then you guess a point x3 in
* between, x1 < x3 < x2. If f(x3) > 0, then the solution is still to
* our left; we reset the bracketed interval to (x1,x3). If f(x3) < 0,
* the solution is to our right, and we reset the interval to
* (x3,x2). We iterate this until we reach a suitably close f(x) ~ 0.
*
* Bisection is very robust; it is guaranteed to find at least one
* root, if at least one root exists.
*
* Numerically, we can't expect to reach f(x) = 0 exactly.
* We may reach two "adjacent" x's in the computer's floating point
* representation that bracket the root. We can either check
* that f(x) is "close enough" (within some TOLERANCE, say 1e-5, of 0)
* or we can directly test that x1 and x2 are adjacent floating
* point numbers: if (x1+x2)/2 = x2 || (x1+x2)/2 == x2, this is
* the case. We use both tests here.
*/
static float
bisection(float **s, float *f, int K)
{
float initial_guess;
float left_lambda, right_lambda, mid_lambda;
float fx;
int
iter;
/* Our initial guess can be anything, but it doesn't
* hurt to be close.
*/
initial_guess = 0.2;
/* Find lambda that gives a result f(lambda) < 0;
* this is the left side of our initial bracket.
*/
left_lambda = initial_guess;
for (iter = 0; iter < 100; iter++) {
if (val(s, f, K, left_lambda) < 0) break;
left_lambda /= 2.;
}
if (iter == 100)
{ fprintf(stderr, "Failed to find left bracket\n"); exit(1); }
/* Find lambda that gives a result of f(lambda) > 0;
* this is the right side of our initial bracket.
*/
right_lambda = initial_guess;
for (iter = 0; iter < 100; iter++) {
if (val(s, f, K, right_lambda) > 0) break;
right_lambda *= 2.;
}
if (iter == 100)
{ fprintf(stderr, "Failed to find right bracket\n"); exit(1); }
/* Now, the iterative bisection search.
*/
for (iter = 0; iter < 100; iter++) {
mid_lambda = (left_lambda + right_lambda) / 2;
/* check if interval is as tight as numerically possible: */
if (mid_lambda == left_lambda || mid_lambda == right_lambda) break;
fx = val(s, f, K, mid_lambda);
/* check if we're suitably close to f(lambda) == 0 */
if (fabs(fx) < TOLERANCE) break;
/* narrow the bracket: */
if (fx > 0) right_lambda = mid_lambda;
else
left_lambda = mid_lambda;
}
if (iter == 100)
{ fprintf(stderr, "Bisection search failed\n"); exit(1); }
return mid_lambda;
}
/* The Newton/Raphson method.
*
* The basic idea is that for any guess of x, the first derivative f'x
* should tell us which direction the root is in; and we can even make
* a good guess where, by assuming f(x) is a straight line defined by
* the slope f'(x). Our new guess for x is x - f(x)/f'(x). (If you
* don't see why, draw the situation on a graph; it should be easy to
* see geometrically.) We can iterate this until we find the root.
*
* Newton/Raphson is not robust in general; there are pathological
* f(x)'s that make it blow up. We have to be careful that our
* f(x) is amenable to Newton/Raphson. Here, f(lambda) is well behaved.
* The only trick is that we have two roots (one of them is lambda=0,
* and one of them is the lambda we want to find), so we don't
* want Newton/Raphson to find the lambda=0 solution. It's sufficient
* to make sure that we start with an initial guess to the right
* of our desired root, where f(lambda) > 0.
*
* Again, numerical convergence must be tested, finding when
* f(lambda) is within TOLERANCE (1e-5 or so) of 0.
*/
static float
newtonraphson(float **s, float *f, int K)
{
float lambda, newlambda;
int
iter;
float fx, dfx;
/* There's two zeros, not one: lambda=0 is a solution.
* To find the nonzero zero (ahem), we've got to make sure
* we start to the right of it: where the function is positive.
* Start with anything, then move right until fx > 0.
*/
lambda = 0.1;
for (iter = 0; iter < 100; iter++) {
if (val(s, f, K, lambda) > 0.) break;
lambda *= 2.;
}
if (iter == 100) {
fprintf(stderr, "failed to find a start pt for newton/raphson\n");
exit(1);
}
/* Now, starting from there, come back in with the Newton/Raphson
* algorithm. At least in theory, because we know what f(x) looks
* like, this should be well-behaved, smoothly converging to the
* solution.
*/
for (iter = 0; iter < 100; iter++) {
fx = val(s, f, K, lambda);
if (fabs(fx) < TOLERANCE) break;
/* success! */
dfx = deriv(s, f, K, lambda);
newlambda = lambda - fx / dfx; /* This update defines Newton/Raphson */
if (newlambda <= 0) newlambda = 0.000001; /* this shouldn't happen */
lambda = newlambda;
}
if (iter == 100)
{ fprintf(stderr, "Newton/Raphson search failed\n"); exit(1); }
return lambda;
}
/* ================================================================
* Section 3:
Output routines.
* ================================================================
*/
/* Output an alphabet-labeled square matrix of floating point #'s.
*/
static void
output_alphalabeled_matrix(float **mx, int K, char *alphabet)
{
int a,b;
printf(" ");
for (b = 0; b < K; b++)
printf("
%c ", alphabet[b]);
printf("\n");
for (a = 0; a < K; a++) {
printf("%c ", alphabet[a]);
for (b = 0; b < K; b++)
printf("%8.4f ", mx[a][b]);
printf("\n");
}
}
/* Output an alphabet-labeled vector of floating point #'s.
*/
static void
output_alphalabeled_vector(float *f, int K, char *alphabet)
{
int a;
printf(" ");
for (a = 0; a < K; a++) printf("
%c ", alphabet[a]);
printf("\n");
printf(" ");
for (a = 0; a < K; a++) printf("%8.4f ", f[a]);
printf("\n");
}
/* ================================================================
* Section 4:
Allocation routines.
* ================================================================
*/
static float **
FMX2Alloc(int rows, int cols)
{
float **mx;
int
r;
if ((mx = (float **) malloc(sizeof(float *) * rows)) == NULL)
return NULL;
if ((mx[0] = (float *) malloc(sizeof(float) * rows * cols)) == NULL)
{ free(mx); return NULL; }
for (r = 1; r < rows; r++)
mx[r] = mx[0] + r*cols;
return mx;
}
static void
FMX2Free(float **mx)
{
free(mx[0]);
free(mx);
}
/* ================================================================
* Section 5:
File parsers.
* ================================================================
*/
/* Function: ParseBackgroundFile()
* Incept:
SRE, Tue Jun 22 15:03:47 2004 [St. Louis]
*
* Purpose:
Parse a file of background residue frequencies.
*
* File format:
*
Blank lines are ignored. Lines w/ # as their
*
first non-whitespace are comments, and are also
*
ignored.
*
*
Expects lines in format:
*
<residue>
<frequency of residue>
*
*
Example:
*
----------------------*
A
0.33
*
C
0.17
*
G
0.17
*
T
0.33
*
----------------------*
*
This gets parsed into fq[], in the residue
*
order specified by alphabet. The parser also
*
stores the alphabet[], and the size of the
*
alphabet K.
*
*
Example:
*
----------------------*
float fq[4];
*
char alphabet[MAXABET+1];
*
int
K;
*
FILE *bfp;
*
*
bfp = fopen("datafile", "r");
*
ParseBackgroundFile(bfp, alphabet, &K, fq);
*
fclose(bfp);
*
----------------------*
*
after this, fq[0] is f_A; fq[1] is f_C; etc;
*
K=4; and alphabet="ACGT".
*
* Args:
bfp
- open file ptr for reading.
*
alphabet - RETURN: residue alphabet, in order of storage in fq;
*
caller allocates for MAXABET+1, MAXABET is the maximum
*
alphabet size (20).
*
ret_K
- RETURN: # of residues in alphabet
*
fq
- RETURN: frequencies, 0..K-1
*
Caller allocates this, and is responsible
*
for free'ing it.
*
* Returns:
1 on success; 0 on failure.
*
* Xref:
STL8 p.50
*/
static int
ParseBackgroundFile(FILE *bfp, char *alphabet, int *ret_K, float *fq)
{
char buffer[512];
/* input buffer for lines from fp */
char *s1;
/* tmp ptr into buffer, field 1 (residue) */
char *s2;
/* tmp ptr into buffer, field 2 (frequency) */
int
i;
int
n;
float sum;
n = 0;
while (fgets(buffer, 512, bfp) != NULL)
{
if ((s1 = strtok(buffer, " \t\n")) == NULL) continue; /* blank */
if (*s1 == '#')
continue; /* comment*/
if ((s2 = strtok(NULL,
" \t\n")) == NULL) return 0; /* oops, eol*/
if (n >= MAXABET)
return 0; /* out of space */
alphabet[n] = *s1;
fq[n]
= atof(s2);
n++;
}
alphabet[n] = '\0';
*ret_K = n;
/* Make sure it looks like a remotely plausible frequency
* distribution (it should sum to one); if it's at all close,
* normalize it.
*/
sum = 0.;
for (i = 0; i < n; i++) sum += fq[i];
if (sum < 0.9 || sum > 1.1) return 0;
for (i = 0; i < n; i++) fq[i] /= sum;
return 1;
}
/*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
*
Function:
Incept:
ParseScoreMatrixFile()
SRE, Tue Jun 22 13:17:16 2004 [St. Louis]
Purpose:
Takes an open score matrix file, and parses
it into an allocated matrix "mx", in the
order given by symbols in "alphabet".
Scores for symbols other than those in "alphabet"
are ignored. (For instance, score matrices often
include scores for IUPAC ambiguity codes and '*',
and maybe we only want to keep unambiguous standard
symbols like ACGT.)
Allows up to 30 rows/cols in the matrix.
Format of datafile:
Lines starting with '#' as their first non-blank
character are considered to be comments, and are
ignored. Blank lines are also ignored.
First data line is a header with residue symbols,
labeling the columns. The number of symbols in
the score matrix is determined from this line.
Next "nsymbols" valid lines are the data lines
of the matrix. They contain either nsymbols or
nsymbols+1 fields, because they are allowed to
have a leading residue label for the row (this
label, if present, is ignored). These rows
must be in the same order that the columns were
in.
*
*
Example:
*
*
-------------------------------*
A
C
G
T
*
*
A
4
-2
-2 -2 -4
*
C -2
4
-2 -2 -4
*
G -2
-2
4 -2 -4
*
T -2
-2
-2
4 -4
*
* -4
-4
-4 -4 -4
*
-------------------------------*
*
This has nsymbols=5.
*
*
Example of parsing it is:
*
*
--------------------------------*
float mx[4][4];
*
FILE *mfp;
*
*
mfp = fopen("datafile", "r");
*
ParseScoreMatrixFile(mfp, "ATGC", 4, mx);
*
fclose(mfp);
*
--------------------------------*
*
now mx[1][1] is the score for T/T;
*
mx[2][2] is the score for G/G; etc.
*
(that is, the rows/cols are in the order
*
specified by "ATGC", rearranged from how
*
they were in the file.)
*
* Args:
mfp
- open score matrix file for reading;
*
Caller must open; caller must close.
*
alphabet - which letters are to be parsed, in what order;
*
e.g. "ACGT"
*
K
- # of residues in the parsed alphabet.
*
mx
- RETURN: the score matrix.
*
Caller must allocate this KxK; and caller
*
is responsible for free'ing it.
*
* Returns:
1 on success; 0 on failure.
*
* Xref:
STL8 p.50.
*/
static int
ParseScoreMatrixFile(FILE *mfp, char *alphabet, int K, float **mx)
{
char buffer[512];
/* input buffer for lines from fp */
int
order[30];
/* order of fields, from header
*/
char *sptr;
/* tmp ptr into the buffer
*/
char *s2;
/* tmp ptr into alphabet
*/
int
nsymbols;
/* total # of symbols in the mx
*/
int
row,col;
/* counters for rows, columns
*/
int
i,n;
for (i = 0; i < 30; i++)
order[i] = -1;
/* Look for the first non-blank, non-comment line in the file.
* It gives single-letter codes in the order the PAM matrix
* is arrayed in the file.
* (fgets() and strtok() are both deprecated in real C
*
applications; only used here for a simple ANSI C example.)
*/
do {
if (fgets(buffer, 512, mfp) == NULL) return 0;
} while ((sptr = strtok(buffer, " \t\n")) == NULL || *sptr == '#');
/* Parse that line; figure out which rows/columns we're going
* to store, and in what order. (This lets the caller request only
* unambiguous, standard codes like a 4x4 matrix indexed for "ACGT",
* even if the matrix file contains scores for IUPAC ambiguity
* codes, and/or w/ symbols in a different order.)
*/
i = 0;
do {
s2 = strchr(alphabet, *sptr); /* is this a residue we'll store? */
if (s2 != NULL) order[i] = (int) (s2-alphabet); /* yes */
i++;
} while ((sptr = strtok(NULL, " \t\n")) != NULL);
nsymbols = i;
/* Doublecheck that we're going to store a KxK matrix.
*/
n=0;
for (i = 0; i < 30; i++) if (order[i] != -1) n++;
if (n != K) return 0;
/* The next nsymbols rows are the score matrix.
* The first field of each may or may not be a letter, depending
* on what format the matrix is in; if it's a letter, ignore it.
*/
row = 0;
do {
if (fgets(buffer, 512, mfp) == NULL)
return 0; /* oops, eof */
if ((sptr = strtok(buffer, " \t\n")) == NULL) continue; /* blank line */
if (*sptr == '#')
continue; /* comment
*/
if (isalpha(*sptr)) {
/* skip char */
if ((sptr = strtok(NULL, " \t\n")) == NULL) return 0; /* bad line
*/
}
col = 0;
do {
/* parse each field on the line */
if (sptr == NULL) return 0;
/* oops, eol */
if (order[row] != -1 && order[col] != -1)
/* a keeper? */
mx[order[row]][order[col]] = atof(sptr);
/* yes.
*/
sptr = strtok(NULL, " \t\n");
col++;
} while (col < nsymbols);
row++;
} while (row < nsymbols);
return 1;
}
/* ================================================================
* Section 6:
Example data files.
* ================================================================
*/
/*
*
*
*
*
#
#
#
#
#
#
---------------------------------------------------------------Example amino acid score matrix file (BLOSUM62)
Cut and save in your own file.
---------------------- <snip> ---------------------------------Matrix made by matblas from blosum62.iij
* column uses minimum score
BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
Blocks Database = /data/blocks_5.0/blocks.dat
Cluster Percentage: >= 62
Entropy =
0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4
* ---------------------- <snip> ---------------------------------*
*/
X
0
-1
-1
-1
-2
-1
-1
-1
-1
-1
-1
-1
-1
-1
-2
0
0
-2
-1
-1
-1
-1
-1
-4
/* Example background frequency file.
* [These were the background frequencies used to construct BLOSUM62;
* from blosum62.out, the matblas output file from 1992.]
* Cut and save to a file.
* ---------------------- <snip> ---------------------------------A 0.074
*
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
-4
1
C
D
E
F
G
H
I
L
K
M
N
P
Q
R
S
T
V
W
Y
0.025
0.054
0.054
0.047
0.074
0.026
0.068
0.099
0.058
0.025
0.045
0.039
0.034
0.052
0.057
0.051
0.073
0.013
0.034
* ---------------------- <snip> ---------------------------------*
*/
Download