extern void PrintErrorAndQuit(char* message); /************************************************************************ * - function prototypes - *************************************************************************/ /* added by Gary Benson 9/2003 */ /***********************************************************************/ COMPOSITIONALIGNPAIR* GetKBestCompositionLocalAlignPairComplexMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1, int KBest); /* Produces the KBest (>=1) local composition alignments for two sequences. The aligned pair has an extra string (matchboundaries) which marks the boundaries of the shortest composition matching substrings. M[i][j] is a matrix (len1+1)x(len2+1) which holds the length of the shortest composition matching substrings (with zero< length < limit) ending at seq1[i] and seq2[j] or zero if no such matching substrings exist. The limit is the longest of these short substring that can be included in the alignment. M is computed with the function GetCompositionSubstringMatchLengths which includes the limit as a parameter. This function must be passed M as a parameter. In the alignment, a substring composition match is scored with a complex match function. KBest modification Gary Benson 9/2003: KBest is the count of local alignments that are to be computed. The recursion for this is outside of this function. Modification of M is made within this function. Zero is placed in M->substring_match_length for every index pair where a match (diagonal) or composition match (longdiagonal) occurs in the alignment. This prevents future alignments from using the same matches. */ COMPOSITIONALIGNPAIR* GetKBestCompositionLocalAlignPairSimpleMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, int KBest); /*********************************************************************** * - Function Definitions - ************************************************************************/ /***********************************************************************/ COMPOSITIONALIGNPAIR* GetKBestCompositionLocalAlignPairComplexMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1, int KBest) /* Produces the KBest (>=1) local composition alignments for two sequences. The aligned pair has an extra string (matchboundaries) which marks the boundaries of the shortest composition matching substrings. M[i][j] is a matrix (len1+1)x(len2+1) which holds the length of the shortest composition matching substrings (with zero< length < limit) ending at seq1[i] and seq2[j] or zero if no such matching substrings exist. The limit is the longest of these short substring that can be included in the alignment. M is computed with the function GetCompositionSubstringMatchLengths which includes the limit as a parameter. This function must be passed M as a parameter. In the alignment, a substring composition match is scored with a complex match function. KBest modification Gary Benson 9/2003: KBest is the count of local alignments that are to be computed. The recursion for this is outside of this function. Modification of M is made within this function. Zero is placed in M->substring_match_length for every index pair where a match (diagonal) or composition match (longdiagonal) occurs in the alignment. This prevents future alignments from using the same matches. */ { int col, row,length; int eb; /* equals e+indel */ int fb; /* equals f+indel */ int sa; /* equals s+match or mismatch */ int stemp; SD* S=NULL; int Slen = 0; SD *Sp, *Srm1p, *Scm1p, *Srcm1p; /* temporary pointers */ __int64 SlenCheck; //to check for overflow COMPOSITIONALIGNPAIR* ap; char *seq1pos,*seq2pos,*seq1sidepos,*seq2sidepos; int substringlen, totalsubstringlen, bestsubstringlen, matchscore, satype, i; char *mbpos; SUBSTRINGMATCHELEMENT *Mp; int bestscore,bestscorerow,bestscorecol; int k; /* check if the function has been set */ if (pf==NULL) return NULL; /* matchscore for substring composition match is the score of A vs A */ matchscore=submatrix[26*('A'-'A')+('A'-'A')]; /* this check was put here because of a problem of overflow when calculating the Slen and malloc() was just using the smaller number that came out as a result. A very hard and pesky error. Gelfand. */ SlenCheck=(__int64)(len1+1)*(__int64)(len2+1)*(__int64)(sizeof(SD)); if (SlenCheck>(__int64)INT_MAX) { return NULL; } /* allocate the alignment matrix */ Slen = (len1+1)*(len2+1); S = (SD*) malloc(sizeof(SD)*Slen); if(S==NULL) { return NULL; /* in case memory allocation fails */ } /* compute SDD Matrix */ /* set up for finding best score */ bestscore=0; bestscorerow=0; bestscorecol=0; /* first element */ Sp=S; Sp->score = 0; Sp->direction = START; /* across the top */ for(col=1; col<=len2; col++) { Sp++; Sp->score = 0; Sp->direction = START; } /* Down the left side */ Sp=S; for(row=1; row<=len1; row++) { Sp+=(len2+1); Sp->score = 0; Sp->direction = START; } /* body of matrix */ for(row=1;row<=len1;row++) { /* set all pointer around column one of current row */ Sp = S+1+row*(len2+1); Scm1p = Sp-1; Srm1p = Scm1p-len2; Srcm1p = Srm1p-1; Mp = M+1+row*(len2+1); for(col=1;col<=len2;col++) { /* E */ eb = Scm1p->score+indel; /* F */ fb = Srm1p->score+indel; /* S */ /* best substring match length is not necessarily the shortest with complex match scoring functions, so we have to do a search for the best score */ substringlen=Mp->substring_match_length; totalsubstringlen=0; /* test for mismatch and compute */ if(substringlen!=1) { sa = Srcm1p->score+submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; bestsubstringlen=1; /* Gary Benson, 10/13/03 for kbest, the substring length might have been reset to zero, even though the letters match, so reset sa to a dummy value in this case so sa is not selected in the max switch below */ if(seq1[row-1]==seq2[col-1]) { sa = -1; /* dummy value will not be chosen for local alignment */ bestsubstringlen=0; } } /* compute first extended match */ if((substringlen!=0)&&(substringlen<=limit)) { totalsubstringlen=substringlen; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ /* printf("\nrow=%d, totallen=%d ",row, totalsubstringlen);*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if((totalsubstringlen==1)||(stemp>sa)) /* better than mismatch score */ { sa=stemp; bestsubstringlen=totalsubstringlen; if(totalsubstringlen==1) satype=DIAGONAL; else satype=LONGDIAGONAL; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } while(substringlen!=0) { /* check that length isn't too long */ totalsubstringlen+=substringlen; if(totalsubstringlen>limit) break; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if(stemp>sa) { sa=stemp; satype=LONGDIAGONAL; bestsubstringlen=totalsubstringlen; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } /* store best align length */ Mp->align_match_length=bestsubstringlen; /* if((substringlen=*Mp)>1) { sa=(Sp-substringlen*(len2+2))->score+substringlen*matchscore; satype=LONGDIAGONAL; } else { sa = Srcm1p->score+ submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; } */ switch(max4switch(0,sa,eb,fb)) { case 1: Sp->score=0; Sp->direction=START; break; case 2: Sp->score = sa; Sp->direction = satype; /* DIAGONAL or LONGDIAGONAL */ break; case 3: Sp->score = eb; Sp->direction = RIGHT; break; case 4: Sp->score = fb; Sp->direction = DOWN; break; } if(Sp->score>bestscore) { bestscore=Sp->score; bestscorerow=row; bestscorecol=col; } /* update pointers */ Sp++; Srm1p++; Scm1p++; Srcm1p++; Mp++; } } /* get the length of the alignment */ Sp = S+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ Mp = M+bestscorerow*(len2+1)+bestscorecol; /* find out how long the alignment is */ length = 0; while(Sp->direction!=START) { switch(Sp->direction) { case START: break; case DIAGONAL: Sp = Sp -(len2+2); Mp = Mp -(len2+2); length++; break; case LONGDIAGONAL: substringlen=Mp->align_match_length; Sp=Sp-substringlen*(len2+2); Mp=Mp-substringlen*(len2+2); length+=substringlen; break; case RIGHT: Sp = Sp - 1; Mp = Mp - 1; length++; break; case DOWN: Sp = Sp - (len2+1); Mp = Mp - (len2+1); length++; break; } } /* allocate memory */ ap = (COMPOSITIONALIGNPAIR*) malloc(sizeof(COMPOSITIONALIGNPAIR)); if(ap==NULL) return NULL; ap->sequence1side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence1side==NULL) { free(ap); return NULL;} ap->sequence2side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence2side==NULL) { free(ap->sequence1side); free(ap); return NULL; } ap->matchboundaries = (char*) malloc((length+1)*sizeof(char)); if(ap->matchboundaries==NULL) { free(ap->sequence2side); free(ap->sequence1side); free(ap); return NULL; } /*************************** * trace back and fill in ap ****************************/ ap->length = length; Sp = S+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ Mp = M+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ ap->score = Sp->score; ap->sequence1end = bestscorerow; /* was len1 */ ap->sequence2end = bestscorecol; /* was len2 */ seq1pos = seq1+bestscorerow-1; /* position last character */ seq2pos = seq2+bestscorecol-1; seq1sidepos = ap->sequence1side+length; /* position at termination */ seq2sidepos = ap->sequence2side+length; *seq1sidepos=*seq2sidepos='\0'; /* terminate strings */ seq1sidepos--; /* move down to last char */ seq2sidepos--; mbpos=ap->matchboundaries+length; /* matchboundaries string */ *mbpos='\0'; mbpos--; /* Sp has been set to bestscore element above */ while(Sp->direction!=START) { switch(Sp->direction) { case DIAGONAL: *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(*seq1sidepos==*seq2sidepos) *mbpos='|'; else *mbpos=' '; /* KBest modification Gary Benson 9/2003 Next line zeros out the substring match length for a diagonal match included in the alignment. Prevents using this diagonal match again. */ if((KBest>1)&&(*mbpos == '|')) /* second condition tests for match not mismatch */ Mp->substring_match_length=0; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); break; case LONGDIAGONAL: substringlen=Mp->align_match_length; /* KBest modification Gary Benson 9/2003 Next line zeros out the substring match length for a diagonal match included in the alignment. Prevents using this diagonal match again. */ if(KBest>1) Mp->substring_match_length=0; for(i=1;i<=substringlen;i++) { *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(i==1) *mbpos='>'; else if (i==substringlen) *mbpos='<'; else *mbpos='-'; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); } break; case RIGHT: *seq1sidepos='-'; *seq2sidepos=*seq2pos; *mbpos=' '; seq1sidepos--; seq2sidepos--; seq2pos--; mbpos--; Sp = Sp - 1; Mp = Mp - 1; break; case DOWN: *seq1sidepos=*seq1pos; *seq2sidepos='-'; *mbpos=' '; seq1sidepos--; seq1pos--; seq2sidepos--; mbpos--; Sp = Sp - (len2+1); Mp = Mp - (len2+1); break; } } /* count characters in alignment to find start */ length=0; for(row=0;rowlength;row++) if(ap->sequence1side[row]!='-') length++; ap->sequence1start = ap->sequence1end-length+1; length=0; for(col=0;collength;col++) if(ap->sequence2side[col]!='-') length++; ap->sequence2start = ap->sequence2end-length+1; /* free alignment matrix */ free(S); return ap; } /***********************************************************************/ /***********************************************************************/ COMPOSITIONALIGNPAIR* GetKBestCompositionLocalAlignPairSimpleMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, int KBest) /* Produces a local composition alignment for two sequences. The aligned pair has an extra string (matchboundaries) which marks the boundaries of the shortest composition matching substrings. M[i][j] is a matrix (len1+1)x(len2+1) which holds the length of the shortest composition matching substrings (with zero< length < limit) ending at seq1[i] and seq2[j] or zero if no such matching substrings exist. The limit is the longest of these short substring that can be included in the alignment. M is computed with the function GetCompositionSubstringMatchLengths which includes the limit as a parameter. This function must be passed M as a parameter. In the alignment, a substring composition match is scored with a simple match function. KBest modification Gary Benson 9/2003: KBest is the count of local alignments that are to be computed. The recursion for this is outside of this function. Modification of M is made within this function. Zero is placed in M->substring_match_length for every index pair where a match (diagonal) or composition match (longdiagonal) occurs in the alignment. This prevents future alignments from using the same matches. */ { int col, row,length; int eb; /* equals e+indel */ int fb; /* equals f+indel */ int sa; /* equals s+match or mismatch */ int stemp; SD* S=NULL; int Slen = 0; SD *Sp, *Srm1p, *Scm1p, *Srcm1p; /* temporary pointers */ __int64 SlenCheck; //to check for overflow COMPOSITIONALIGNPAIR* ap; char *seq1pos,*seq2pos,*seq1sidepos,*seq2sidepos; int substringlen, matchscore, satype, i; char *mbpos; SUBSTRINGMATCHELEMENT *Mp; int bestscore,bestscorerow,bestscorecol; /* check if the function has been set */ if (pf==NULL) return NULL; /* matchscore for substring composition match is the score of A vs A */ matchscore=submatrix[26*('A'-'A')+('A'-'A')]; /* this check was put here because of a problem of overflow when calculating the Slen and malloc() was just using the smaller number that came out as a result. A very hard and pesky error. Gelfand. */ SlenCheck=(__int64)(len1+1)*(__int64)(len2+1)*(__int64)(sizeof(SD)); if (SlenCheck>(__int64)INT_MAX) { return NULL; } /* allocate the alignment matrix */ Slen = (len1+1)*(len2+1); S = (SD*) malloc(sizeof(SD)*Slen); if(S==NULL) { return NULL; /* in case memory allocation fails */ } /* compute SDD Matrix */ /* set up for finding best score */ bestscore=0; bestscorerow=0; bestscorecol=0; /* first element */ Sp=S; Sp->score = 0; Sp->direction = START; /* across the top */ for(col=1; col<=len2; col++) { Sp++; Sp->score = 0; Sp->direction = START; } /* Down the left side */ Sp=S; for(row=1; row<=len1; row++) { Sp+=(len2+1); Sp->score = 0; Sp->direction = START; } /* body of matrix */ for(row=1;row<=len1;row++) { /* set all pointer around column one of current row */ Sp = S+1+row*(len2+1); Scm1p = Sp-1; Srm1p = Scm1p-len2; Srcm1p = Srm1p-1; Mp = M+1+row*(len2+1); for(col=1;col<=len2;col++) { /* E */ eb = Scm1p->score+indel; /* F */ fb = Srm1p->score+indel; /* S */ /* changed S computation, Gary Benson, 10/13/03 to allow comparison of mismatch and composition match greater than length 1 */ /* test for mismatch and compute */ if((substringlen=Mp->substring_match_length)!=1) { sa = Srcm1p->score+submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; Mp->align_match_length=1; /* Gary Benson 10/13/03 for kbest, the substring length might have been reset to zero, even though the letters match, so reset sa to a dummy value in this case so sa is not selected in the max switch below */ if(seq1[row-1]==seq2[col-1]) { sa = -1; /* dummy value will not be chosen for local alignment */ Mp->align_match_length=0; } } else /* substring_match_length == 1 */ { /* calculate match score */ sa = Srcm1p->score+ submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; Mp->align_match_length=1; } if(substringlen>1) { stemp=pf((Sp-substringlen*(len2+2))->score,row,col, substringlen,matchscore,matchscore); if(stemp>sa) { sa=stemp; satype=LONGDIAGONAL; Mp->align_match_length=substringlen; } } switch(max4switch(0,sa,eb,fb)) { case 1: Sp->score=0; Sp->direction=START; break; case 2: Sp->score = sa; Sp->direction = satype; /* DIAGONAL or LONGDIAGONAL */ break; case 3: Sp->score = eb; Sp->direction = RIGHT; break; case 4: Sp->score = fb; Sp->direction = DOWN; break; } if(Sp->score>bestscore) { bestscore=Sp->score; bestscorerow=row; bestscorecol=col; } /* update pointers */ Sp++; Srm1p++; Scm1p++; Srcm1p++; Mp++; } } /* get the length of the alignment */ Sp = S+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ Mp = M+bestscorerow*(len2+1)+bestscorecol; /* find out how long the alignment is */ length = 0; while(Sp->direction!=START) { switch(Sp->direction) { case START: break; case DIAGONAL: Sp = Sp -(len2+2); Mp = Mp -(len2+2); length++; break; case LONGDIAGONAL: substringlen=Mp->align_match_length; Sp=Sp-substringlen*(len2+2); Mp=Mp-substringlen*(len2+2); length+=substringlen; break; case RIGHT: Sp = Sp - 1; Mp = Mp - 1; length++; break; case DOWN: Sp = Sp - (len2+1); Mp = Mp - (len2+1); length++; break; } } /* allocate memory */ ap = (COMPOSITIONALIGNPAIR*) malloc(sizeof(COMPOSITIONALIGNPAIR)); if(ap==NULL) return NULL; ap->sequence1side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence1side==NULL) { free(ap); return NULL;} ap->sequence2side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence2side==NULL) { free(ap->sequence1side); free(ap); return NULL; } ap->matchboundaries = (char*) malloc((length+1)*sizeof(char)); if(ap->matchboundaries==NULL) { free(ap->sequence2side); free(ap->sequence1side); free(ap); return NULL; } /*************************** * trace back and fill in ap ****************************/ ap->length = length; Sp = S+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ Mp = M+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ ap->score = Sp->score; ap->sequence1end = bestscorerow; /* was len1 */ ap->sequence2end = bestscorecol; /* was len2 */ seq1pos = seq1+bestscorerow-1; /* position last character */ seq2pos = seq2+bestscorecol-1; seq1sidepos = ap->sequence1side+length; /* position at termination */ seq2sidepos = ap->sequence2side+length; *seq1sidepos=*seq2sidepos='\0'; /* terminate strings */ seq1sidepos--; /* move down to last char */ seq2sidepos--; mbpos=ap->matchboundaries+length; /* matchboundaries string */ *mbpos='\0'; mbpos--; /* Sp has been set to bestscore element above */ while(Sp->direction!=START) { switch(Sp->direction) { case DIAGONAL: *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(*seq1sidepos==*seq2sidepos) *mbpos='|'; else *mbpos=' '; /* KBest modification Gary Benson 9/2003 Next line zeros out the substring match length for a diagonal match included in the alignment. Prevents using this diagonal match again. */ if((KBest>1)&&(*mbpos == '|')) /* second condition tests for match not mismatch */ Mp->substring_match_length=0; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); break; case LONGDIAGONAL: substringlen=Mp->align_match_length; /* KBest modification Gary Benson 9/2003 Next line zeros out the substring match length for a diagonal match included in the alignment. Prevents using this diagonal match again. */ if(KBest>1) Mp->substring_match_length=0; for(i=1;i<=substringlen;i++) { *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(i==1) *mbpos='>'; else if (i==substringlen) *mbpos='<'; else *mbpos='-'; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); } break; case RIGHT: *seq1sidepos='-'; *seq2sidepos=*seq2pos; *mbpos=' '; seq1sidepos--; seq2sidepos--; seq2pos--; mbpos--; Sp = Sp - 1; Mp = Mp - 1; break; case DOWN: *seq1sidepos=*seq1pos; *seq2sidepos='-'; *mbpos=' '; seq1sidepos--; seq1pos--; seq2sidepos--; mbpos--; Sp = Sp - (len2+1); Mp = Mp - (len2+1); break; } } /* count characters in alignment to find start */ length=0; for(row=0;rowlength;row++) if(ap->sequence1side[row]!='-') length++; ap->sequence1start = ap->sequence1end-length+1; length=0; for(col=0;collength;col++) if(ap->sequence2side[col]!='-') length++; ap->sequence2start = ap->sequence2end-length+1; /* free alignment matrix */ free(S); return ap; }