/************************************************************************* * -type definitions- *************************************************************************/ typedef struct { int substring_match_length; /* normally the shortest match length */ int align_match_length; /* match length chosen by alignment algorithm */ } SUBSTRINGMATCHELEMENT; typedef struct { double F[MAX_ALPHABETSIZ]; } COMPOSITIONVECTORFLOAT; /************************************************************************ * - Globals - *************************************************************************/ /* Composition vectors for entropy calculations on string composition */ COMPOSITIONVECTORFLOAT Background; /* a background composition for relative entropy */ COMPOSITIONVECTORFLOAT Common; /* the identical composition of a matching substrings */ /************************************************************************ * - function prototypes - *************************************************************************/ /* added by Gary Benson 3/2003 */ COMPOSITIONALIGNPAIR* GetCompositionGlobalAlignPairComplexMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1); /* added by Gary Benson 3/2003 */ COMPOSITIONALIGNPAIR* GetCompositionLocalAlignPairComplexMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1); /* added by Gary Benson 8/2003 */ COMPOSITIONALIGNPAIR* GetCompositionPatternGlobalTextLocalAlignPairComplexMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1); /* added by Gary Benson 3/2003 */ SUBSTRINGMATCHELEMENT* GetCompositionSubstringMatchLengthsComplexMatchFunction(char* seq1, char* seq2, int len1, int len2, int limit,int ALPHABETSIZE); /* added by Gelfand 3/2003 */ static int (*pf)(int,int,int,int,int, int)=NULL; static void setMatchFunction(int (*newPf)(int,int,int,int,int, int)) /* assinging the matching */ /* function pointer */ { pf=newPf; } static void* getMatchFunction() /* return the function pointer */ { return (void*)pf; } /* added by Gary Benson 3/2003 */ int functionextendedmatch1(int score, int row, int col, int length, int constant, int match); /* length of extended match times a constant */ /* added by Gary Benson 3/2003 */ int functionextendedmatch2(int score, int row, int col, int length, int constant, int match); /* square root of length of extended match times a constant */ /* added by Gary Benson 3/2003 */ int functionextendedmatch3(int score, int row, int col, int length, int constant, int match); /* log base 2 of length+1 of extended match times a constant */ /* added by Gary Benson 3/2003 */ int functionextendedmatch4(int score, int row, int col, int length, int constant, int match); /* relative entropy of substring composition with respect to background compositon */ /* times length of entended match times a constant */ /* added by Gary Benson 3/2003 */ int functionextendedmatch5(int score, int row, int col, int length, int constant, int match); /* relative entropy of substring composition with respect to background compositon */ /* times length of entended match times a constant. For length =1, normal match value */ /* prevents two identical sequences composed of only one letter from scoring zero if */ /* the background is the same letter. */ /* added by Gary Benson 3/2003 */ int functionextendedmatch6(int score, int row, int col, int length, int constant, int match); /* Shannon-Jensen entropy of substring composition versus the background composition */ /* times length of extended match times a constant. */ COMPOSITIONVECTOR* CreateSequenceCompositionVectorArray(char *sequence, int length, int ALPHABETSIZE); /* Creates an array of composition vectors, one for each prefix of the sequence. Note: Composition vector is zero based. Note Array is one based. the zero element is the composition of the null sequence */ void CalculateBackgroundCompositionFromSequenceCompVecs (COMPOSITIONVECTOR *V1, COMPOSITIONVECTOR *V2, int len1, int len2, int Alphabetsize); /* calculate the background composition (sum of the two sequence compositions) */ void CalculateCommonCompositionFromSequenceCompVecs (COMPOSITIONVECTOR *V1, COMPOSITIONVECTOR *V2, int len1, int len2, int Alphabetsize); /* calculate the composition (sum of the two sequence compositions) */ /* Common is a global parameter */ void CalculateCompositionOfCompositionAlignment(char *seq1, char *seq2, COMPOSITIONALIGNPAIR *ap, int Alphabetsize); /* puts composition in Common */ void CalculateBackgroundComposition(char *seq1, char *seq2, int len1, int len2, int Alphabetsize); void PrintBackgroundAndCommonComposition(int Alphabetsize); /*********************************************************************** * - Function Definitions - ************************************************************************/ int functionextendedmatch1(int score, int row, int col, int length, int constant, int match) /* the simplest function */ /* a constant times the length of the match */ { return(score+constant*length); } int functionextendedmatch2(int score, int row, int col, int length, int constant, int match) /* square root of length of extended match times a constant */ { return(score+(int)floor(sqrt((double)length)*(double)constant)); } int functionextendedmatch3(int score, int row, int col, int length, int constant, int match) /* log base 2 of length+1 of extended match times a constant */ { return(score+(int)floor((log((double)length+1.0)/log(2.0))*(double)constant)); } int functionextendedmatch4(int score, int row, int col, int length, int constant, int match) /* relative entropy of substring composition with respect to background compositon */ /* times length of entended match times a constant */ { double H,f,g; int k; /* compute relative entropy */ H=0.0; for(k=0;k(__int64)INT_MAX) { return NULL; } /* allocate the substring composition match array */ Mlen = (len1+1)*(len2+1); M = (SUBSTRINGMATCHELEMENT*) malloc(sizeof(SUBSTRINGMATCHELEMENT)*Mlen); if(M==NULL) { return NULL; /* in case memory allocation fails */ } /* fill M */ /* first fill zero row and zero column */ for(g=0;g<=len2;g++) { (M+g)->substring_match_length=0; (M+g)->align_match_length=0; } for(g=1;g<=len1;g++) { (M+g*(len2+1))->substring_match_length=0; (M+g*(len2+1))->align_match_length=0; } size=min(len1,len2); Diffvectors=(COMPOSITIONVECTOR *)malloc(sizeof(COMPOSITIONVECTOR)*(size+1)); if(Diffvectors==NULL) { return NULL; /* in case memory allocation fails */ } Radsort1=(int *)malloc(sizeof(int)*(size+1)); if(Radsort1==NULL) { return NULL; /* in case memory allocation fails */ } Radsort2=(int *)malloc(sizeof(int)*(size+1)); if(Radsort2==NULL) { return NULL; /* in case memory allocation fails */ } Counts=(int *)malloc(sizeof(int)*((2*size)+1)); if(Counts==NULL) { return NULL; /* in case memory allocation fails */ } Index=(int *)calloc(256,sizeof(int)); if(Index==NULL){return NULL;} if(ALPHABETSIZE == 5) { Index['A']=0; Index['C']=1; Index['G']=2; Index['T']=3; Index['X']=4; } else { Index['A']=0; Index['B']=1; Index['C']=2; Index['D']=3; Index['E']=4; Index['F']=5; Index['G']=6; Index['H']=7; Index['I']=8; Index['J']=9; Index['K']=10; Index['L']=11; Index['M']=12; Index['N']=13; Index['O']=14; Index['P']=15; Index['Q']=16; Index['R']=17; Index['S']=18; Index['T']=19; Index['U']=20; Index['V']=21; Index['W']=22; Index['X']=23; Index['Y']=24; } /******/ m=len1; n=len2; /* compute once for each diagonal of M */ for(d=-m+1;d<=n-1;d++) { /* calculate starting locations in X and Y (strings start at index zero) and the length of diagonal d */ if(d<0) { xstart=0; ystart=-d; length=min(n,m+d); } else { xstart=d; ystart=0; length=min(n-d,m); } /* compute substring differences for diagonal d and store in Diffvectors */ dfi=dfic=Diffvectors; Y=seq1+ystart; X=seq2+xstart; /* zeroth vector is all zeros */ for(k=0;kC[k]=0; /* step through the substrings computing the difference X-Y at each length */ for(g=0;gC[k]=dfi->C[k]; } /* and then adjusted for +X[g] and -Y[g] */ dfic->C[Index[X[g]]]++; dfic->C[Index[Y[g]]]--; dfi++; } /* radixsort the composition vectors in Diffvectors */ /* sort into Radsort1 using Radsort2 for swapping */ /* length+1 is number of vectors to sort */ /*********************/ /* initialize Radsort1 to the order of the vectors in Diffvectors */ for(g=0;g<=length;g++) Radsort1[g]=g; /* sort on each component starting with last */ for(k=ALPHABETSIZ-1;k>=0;k--) { /* get counts for counting sort */ /* first zero counts */ for(g=-length;g<=length;g++) Counts[g+length]=0; /* +length so 0<= index <= 2*length */ /* get counts in each component of vector */ dfi=Diffvectors; minval=0; maxval=0; for(g=0;g<=length;g++) { val=dfi->C[k]; Counts[val+length]++; if(val>maxval) maxval=val; if(vallimit) substring_length=0; /* store a default length of zero */ else { match=1; for(k=0;k=0) { j=current+d; i=current; } else { j=current; i=current-d; } /* store substring length at M[i][j].substring_match_length */ (M+i*(len2+1)+j)->substring_match_length=substring_length; (M+i*(len2+1)+j)->align_match_length=0; /* update pointers */ previous=current; /* uncomment for shortest substring */ rsi++; current=(*rsi); /**********************************/ /******/ } } free(Index); free(Diffvectors); free(Radsort1); free(Radsort2); free(Counts); return(M); } /***********************************************************************/ COMPOSITIONALIGNPAIR* GetCompositionGlobalAlignPairComplexMatchFunction (char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1) /* Produces a composition alignment for two sequences. The aligned pair has an extra string (matchboundaries) which marks the boundaries of the shortest composition matching substrings. M[i][j] is a matrix (len1+1)x(len2+1) which holds the length of the shortest composition matching substrings (with zero< length < limit) ending at seq1[i] and seq2[j] or zero if no such matching substrings exist. The limit is the longest of these short substring that can be included in the alignment. M is computed with the function GetCompositionSubstringMatchLengths which includes the limit as a parameter. This function must be passed M as a parameter. In this version, the substring composition match has one of a number of complex scoring functions. Note that the composition vector V1 must be from sequence seq1. */ { int col, row,length; int eb; /* equals e+indel */ int fb; /* equals f+indel */ int sa; /* equals s+match or mismatch */ int stemp; SD* S=NULL; int Slen = 0; SD *Sp, *Srm1p, *Scm1p, *Srcm1p; /* temporary pointers */ __int64 SlenCheck; //to check for overflow COMPOSITIONALIGNPAIR* ap; char *seq1pos,*seq2pos,*seq1sidepos,*seq2sidepos; int substringlen, totalsubstringlen, bestsubstringlen, matchscore, satype, i; char *mbpos; SUBSTRINGMATCHELEMENT *Mp; int k; /* check if the function has been set */ if (pf==NULL) return NULL; /* matchscore for substring composition match is the score of A vs A */ matchscore=submatrix[26*('A'-'A')+('A'-'A')]; /* this check was put here because of a problem of overflow when calculating the Slen and malloc() was just using the smaller number that came out as a result. A very hard and pesky error. Gelfand. */ SlenCheck=(__int64)(len1+1)*(__int64)(len2+1)*(__int64)(sizeof(SD)); if (SlenCheck>(__int64)INT_MAX) { return NULL; } /* allocate the alignment matrix */ Slen = (len1+1)*(len2+1); S = (SD*) malloc(sizeof(SD)*Slen); if(S==NULL) { return NULL; /* in case memory allocation fails */ } /* compute SDD Matrix */ /* first element */ Sp=S; Sp->score = 0; Sp->direction = START; /* across the top */ for(col=1; col<=len2; col++) { Sp++; Sp->score = col*indel; Sp->direction = RIGHT; } /* Down the left side */ Sp=S; for(row=1; row<=len1; row++) { Sp+=(len2+1); Sp->score = row*indel; Sp->direction = DOWN; } /* body of matrix */ for(row=1;row<=len1;row++) { /* set all pointer around column one of current row */ Sp = S+1+row*(len2+1); Scm1p = Sp-1; Srm1p = Scm1p-len2; Srcm1p = Srm1p-1; Mp = M+1+row*(len2+1); for(col=1;col<=len2;col++) { /* E */ eb = Scm1p->score+indel; /* F */ fb = Srm1p->score+indel; /* S */ /* best substring match length is not necessarily the shortest with complex match scoring functions, so we have to do a search for the best score */ substringlen=Mp->substring_match_length; totalsubstringlen=0; /* test for mismatch and compute */ if(substringlen!=1) { sa = Srcm1p->score+submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; bestsubstringlen=1; } /* compute first extended match */ if((substringlen!=0)&&(substringlen<=limit)) { totalsubstringlen=substringlen; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if((totalsubstringlen==1)||(stemp>sa)) /* better than mismatch score */ { sa=stemp; bestsubstringlen=totalsubstringlen; if(totalsubstringlen==1) satype=DIAGONAL; else satype=LONGDIAGONAL; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } while(substringlen!=0) { /* check that length isn't too long */ totalsubstringlen+=substringlen; if(totalsubstringlen>limit) break; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if(stemp>sa) { sa=stemp; satype=LONGDIAGONAL; bestsubstringlen=totalsubstringlen; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } /* store best align length */ Mp->align_match_length=bestsubstringlen; /* if((substringlen=*Mp)>1) { sa=(Sp-substringlen*(len2+2))->score+substringlen*matchscore; satype=LONGDIAGONAL; } else { sa = Srcm1p->score+ submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; } */ switch(max3switch(sa,eb,fb)) { case 1: Sp->score = sa; Sp->direction = satype; /* DIAGONAL or LONGDIAGONAL */ break; case 2: Sp->score = eb; Sp->direction = RIGHT; break; case 3: Sp->score = fb; Sp->direction = DOWN; break; } /* update pointers */ Sp++; Srm1p++; Scm1p++; Srcm1p++; Mp++; } } /* get the length of the alignment */ Sp = S+(len2+1)*len1+len2; /* set at last element */ Mp = M+(len2+1)*len1+len2; /* find out how long the alignment is */ length = 0; while(Sp->direction!=START) { /* length ++; */ switch(Sp->direction) { case START: break; case DIAGONAL: Sp = Sp -(len2+2); Mp = Mp -(len2+2); length++; break; case LONGDIAGONAL: substringlen=Mp->align_match_length; Sp=Sp-substringlen*(len2+2); Mp=Mp-substringlen*(len2+2); length+=substringlen; break; case RIGHT: Sp = Sp - 1; Mp = Mp - 1; length++; break; case DOWN: Sp = Sp - (len2+1); Mp = Mp - (len2+1); length++; break; } } /* allocate memory */ ap = (COMPOSITIONALIGNPAIR*) malloc(sizeof(COMPOSITIONALIGNPAIR)); if(ap==NULL) return NULL; ap->sequence1side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence1side==NULL) { free(ap); return NULL;} ap->sequence2side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence2side==NULL) { free(ap->sequence1side); free(ap); return NULL; } ap->matchboundaries = (char*) malloc((length+1)*sizeof(char)); if(ap->matchboundaries==NULL) { free(ap->sequence2side); free(ap->sequence1side); free(ap); return NULL; } /*************************** * trace back and fill in ap ****************************/ ap->length = length; Sp = S+(len2+1)*len1+len2; /* set at last element */ Mp = M+(len2+1)*len1+len2; /* set at last element */ ap->score = Sp->score; ap->sequence1end = len1; ap->sequence2end = len2; ap->sequence1start = 1; ap->sequence2start = 1; seq1pos = seq1+len1-1; /* position last character */ seq2pos = seq2+len2-1; seq1sidepos = ap->sequence1side+length; /* position at termination */ seq2sidepos = ap->sequence2side+length; *seq1sidepos=*seq2sidepos='\0'; /* terminate strings */ seq1sidepos--; /* move down to last char */ seq2sidepos--; mbpos=ap->matchboundaries+length; /* matchboundaries string */ *mbpos='\0'; mbpos--; /* Sp has ben set to last element above */ while(Sp->direction!=START) { switch(Sp->direction) { case DIAGONAL: *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(*seq1sidepos==*seq2sidepos) *mbpos='|'; else *mbpos=' '; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); break; case LONGDIAGONAL: substringlen=Mp->align_match_length; for(i=1;i<=substringlen;i++) { *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(i==1) *mbpos='>'; else if (i==substringlen) *mbpos='<'; else *mbpos='-'; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); } break; case RIGHT: *seq1sidepos='-'; *seq2sidepos=*seq2pos; *mbpos=' '; seq1sidepos--; seq2sidepos--; seq2pos--; mbpos--; Sp = Sp - 1; Mp = Mp - 1; break; case DOWN: *seq1sidepos=*seq1pos; *seq2sidepos='-'; *mbpos=' '; seq1sidepos--; seq1pos--; seq2sidepos--; mbpos--; Sp = Sp - (len2+1); Mp = Mp - (len2+1); break; } } /* free alignment matrix */ free(S); return ap; } /***********************************************************************/ /***********************************************************************/ COMPOSITIONALIGNPAIR* GetCompositionLocalAlignPairComplexMatchFunction(char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1) /* Produces a local composition alignment for two sequences. The aligned pair has an extra string (matchboundaries) which marks the boundaries of the shortest composition matching substrings. M[i][j] is a matrix (len1+1)x(len2+1) which holds the length of the shortest composition matching substrings (with zero< length < limit) ending at seq1[i] and seq2[j] or zero if no such matching substrings exist. The limit is the longest of these short substring that can be included in the alignment. M is computed with the function GetCompositionSubstringMatchLengths which includes the limit as a parameter. This function must be passed M as a parameter. In the alignment, a substring composition match is scored with a complex match function. */ { int col, row,length; int eb; /* equals e+indel */ int fb; /* equals f+indel */ int sa; /* equals s+match or mismatch */ int stemp; SD* S=NULL; int Slen = 0; SD *Sp, *Srm1p, *Scm1p, *Srcm1p; /* temporary pointers */ __int64 SlenCheck; //to check for overflow COMPOSITIONALIGNPAIR* ap; char *seq1pos,*seq2pos,*seq1sidepos,*seq2sidepos; int substringlen, totalsubstringlen, bestsubstringlen, matchscore, satype, i; char *mbpos; SUBSTRINGMATCHELEMENT *Mp; int bestscore,bestscorerow,bestscorecol; int k; /* check if the function has been set */ if (pf==NULL) return NULL; /* matchscore for substring composition match is the score of A vs A */ matchscore=submatrix[26*('A'-'A')+('A'-'A')]; /* this check was put here because of a problem of overflow when calculating the Slen and malloc() was just using the smaller number that came out as a result. A very hard and pesky error. Gelfand. */ SlenCheck=(__int64)(len1+1)*(__int64)(len2+1)*(__int64)(sizeof(SD)); if (SlenCheck>(__int64)INT_MAX) { return NULL; } /* allocate the alignment matrix */ Slen = (len1+1)*(len2+1); S = (SD*) malloc(sizeof(SD)*Slen); if(S==NULL) { return NULL; /* in case memory allocation fails */ } /* compute SDD Matrix */ /* set up for finding best score */ bestscore=0; bestscorerow=0; bestscorecol=0; /* first element */ Sp=S; Sp->score = 0; Sp->direction = START; /* across the top */ for(col=1; col<=len2; col++) { Sp++; Sp->score = 0; Sp->direction = START; } /* Down the left side */ Sp=S; for(row=1; row<=len1; row++) { Sp+=(len2+1); Sp->score = 0; Sp->direction = START; } /* body of matrix */ for(row=1;row<=len1;row++) { /* set all pointer around column one of current row */ Sp = S+1+row*(len2+1); Scm1p = Sp-1; Srm1p = Scm1p-len2; Srcm1p = Srm1p-1; Mp = M+1+row*(len2+1); for(col=1;col<=len2;col++) { /* E */ eb = Scm1p->score+indel; /* F */ fb = Srm1p->score+indel; /* S */ /* best substring match length is not necessarily the shortest with complex match scoring functions, so we have to do a search for the best score */ substringlen=Mp->substring_match_length; totalsubstringlen=0; /* test for mismatch and compute */ if(substringlen!=1) { sa = Srcm1p->score+submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; bestsubstringlen=1; } /* compute first extended match */ if((substringlen!=0)&&(substringlen<=limit)) { totalsubstringlen=substringlen; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ /* printf("\nrow=%d, totallen=%d ",row, totalsubstringlen);*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if((totalsubstringlen==1)||(stemp>sa)) /* better than mismatch score */ { sa=stemp; bestsubstringlen=totalsubstringlen; if(totalsubstringlen==1) satype=DIAGONAL; else satype=LONGDIAGONAL; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } while(substringlen!=0) { /* check that length isn't too long */ totalsubstringlen+=substringlen; if(totalsubstringlen>limit) break; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if(stemp>sa) { sa=stemp; satype=LONGDIAGONAL; bestsubstringlen=totalsubstringlen; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } /* store best align length */ Mp->align_match_length=bestsubstringlen; /* if((substringlen=*Mp)>1) { sa=(Sp-substringlen*(len2+2))->score+substringlen*matchscore; satype=LONGDIAGONAL; } else { sa = Srcm1p->score+ submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; } */ switch(max4switch(0,sa,eb,fb)) { case 1: Sp->score=0; Sp->direction=START; break; case 2: Sp->score = sa; Sp->direction = satype; /* DIAGONAL or LONGDIAGONAL */ break; case 3: Sp->score = eb; Sp->direction = RIGHT; break; case 4: Sp->score = fb; Sp->direction = DOWN; break; } if(Sp->score>bestscore) { bestscore=Sp->score; bestscorerow=row; bestscorecol=col; } /* update pointers */ Sp++; Srm1p++; Scm1p++; Srcm1p++; Mp++; } } /* get the length of the alignment */ Sp = S+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ Mp = M+bestscorerow*(len2+1)+bestscorecol; /* find out how long the alignment is */ length = 0; while(Sp->direction!=START) { switch(Sp->direction) { case START: break; case DIAGONAL: Sp = Sp -(len2+2); Mp = Mp -(len2+2); length++; break; case LONGDIAGONAL: substringlen=Mp->align_match_length; Sp=Sp-substringlen*(len2+2); Mp=Mp-substringlen*(len2+2); length+=substringlen; break; case RIGHT: Sp = Sp - 1; Mp = Mp - 1; length++; break; case DOWN: Sp = Sp - (len2+1); Mp = Mp - (len2+1); length++; break; } } /* allocate memory */ ap = (COMPOSITIONALIGNPAIR*) malloc(sizeof(COMPOSITIONALIGNPAIR)); if(ap==NULL) return NULL; ap->sequence1side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence1side==NULL) { free(ap); return NULL;} ap->sequence2side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence2side==NULL) { free(ap->sequence1side); free(ap); return NULL; } ap->matchboundaries = (char*) malloc((length+1)*sizeof(char)); if(ap->matchboundaries==NULL) { free(ap->sequence2side); free(ap->sequence1side); free(ap); return NULL; } /*************************** * trace back and fill in ap ****************************/ ap->length = length; Sp = S+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ Mp = M+bestscorerow*(len2+1)+bestscorecol; /* set at bestscore element */ ap->score = Sp->score; ap->sequence1end = bestscorerow; /* was len1 */ ap->sequence2end = bestscorecol; /* was len2 */ seq1pos = seq1+bestscorerow-1; /* position last character */ seq2pos = seq2+bestscorecol-1; seq1sidepos = ap->sequence1side+length; /* position at termination */ seq2sidepos = ap->sequence2side+length; *seq1sidepos=*seq2sidepos='\0'; /* terminate strings */ seq1sidepos--; /* move down to last char */ seq2sidepos--; mbpos=ap->matchboundaries+length; /* matchboundaries string */ *mbpos='\0'; mbpos--; /* Sp has been set to bestscore element above */ while(Sp->direction!=START) { switch(Sp->direction) { case DIAGONAL: *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(*seq1sidepos==*seq2sidepos) *mbpos='|'; else *mbpos=' '; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); break; case LONGDIAGONAL: substringlen=Mp->align_match_length; for(i=1;i<=substringlen;i++) { *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(i==1) *mbpos='>'; else if (i==substringlen) *mbpos='<'; else *mbpos='-'; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); } break; case RIGHT: *seq1sidepos='-'; *seq2sidepos=*seq2pos; *mbpos=' '; seq1sidepos--; seq2sidepos--; seq2pos--; mbpos--; Sp = Sp - 1; Mp = Mp - 1; break; case DOWN: *seq1sidepos=*seq1pos; *seq2sidepos='-'; *mbpos=' '; seq1sidepos--; seq1pos--; seq2sidepos--; mbpos--; Sp = Sp - (len2+1); Mp = Mp - (len2+1); break; } } /* count characters in alignment to find start */ length=0; for(row=0;rowlength;row++) if(ap->sequence1side[row]!='-') length++; ap->sequence1start = ap->sequence1end-length+1; length=0; for(col=0;collength;col++) if(ap->sequence2side[col]!='-') length++; ap->sequence2start = ap->sequence2end-length+1; /* free alignment matrix */ free(S); return ap; } /***********************************************************************/ COMPOSITIONALIGNPAIR* GetCompositionPatternGlobalTextLocalAlignPairComplexMatchFunction (char* seq1, char* seq2, int len1, int len2, int alpha, int indel, int* submatrix, int limit, SUBSTRINGMATCHELEMENT* M, COMPOSITIONVECTOR *V1) /* Produces a composition alignment for two sequences. Global for pattern, local for text. PATTERN MUST BE SEQUENCE 2. The aligned pair has an extra string (matchboundaries) which marks the boundaries of the shortest composition matching substrings. M[i][j] is a matrix (len1+1)x(len2+1) which holds the length of the shortest composition matching substrings (with zero< length < limit) ending at seq1[i] and seq2[j] or zero if no such matching substrings exist. The limit is the longest of these short substring that can be included in the alignment. M is computed with the function GetCompositionSubstringMatchLengths which includes the limit as a parameter. This function must be passed M as a parameter. In this version, the substring composition match has one of a number of complex scoring functions. Note that the composition vector V1 must be from sequence seq1. */ { int col, row,length; int eb; /* equals e+indel */ int fb; /* equals f+indel */ int sa; /* equals s+match or mismatch */ int stemp; SD* S=NULL; int Slen = 0; SD *Sp, *Srm1p, *Scm1p, *Srcm1p; /* temporary pointers */ __int64 SlenCheck; //to check for overflow COMPOSITIONALIGNPAIR* ap; char *seq1pos,*seq2pos,*seq1sidepos,*seq2sidepos; int substringlen, totalsubstringlen, bestsubstringlen, matchscore, satype, i; char *mbpos; SUBSTRINGMATCHELEMENT *Mp; int k; int bestscore,bestscorerow; /* check if the function has been set */ if (pf==NULL) return NULL; /* matchscore for substring composition match is the score of A vs A */ matchscore=submatrix[26*('A'-'A')+('A'-'A')]; /* this check was put here because of a problem of overflow when calculating the Slen and malloc() was just using the smaller number that came out as a result. A very hard and pesky error. Gelfand. */ SlenCheck=(__int64)(len1+1)*(__int64)(len2+1)*(__int64)(sizeof(SD)); if (SlenCheck>(__int64)INT_MAX) { return NULL; } /* allocate the alignment matrix */ Slen = (len1+1)*(len2+1); S = (SD*) malloc(sizeof(SD)*Slen); if(S==NULL) { return NULL; /* in case memory allocation fails */ } /* set up to find bestscore in last column */ bestscore=0; bestscorerow=0; /* compute SDD Matrix */ /* first element */ Sp=S; Sp->score = 0; Sp->direction = START; /* across the top */ for(col=1; col<=len2; col++) { Sp++; Sp->score = col*indel; Sp->direction = RIGHT; } /* Down the left side */ Sp=S; for(row=1; row<=len1; row++) { Sp+=(len2+1); Sp->score = 0; Sp->direction = START; } /* body of matrix */ for(row=1;row<=len1;row++) { /* set all pointer around column one of current row */ Sp = S+1+row*(len2+1); Scm1p = Sp-1; Srm1p = Scm1p-len2; Srcm1p = Srm1p-1; Mp = M+1+row*(len2+1); for(col=1;col<=len2;col++) { /* E */ eb = Scm1p->score+indel; /* F */ fb = Srm1p->score+indel; /* S */ /* best substring match length is not necessarily the shortest with complex match scoring functions, so we have to do a search for the best score */ substringlen=Mp->substring_match_length; totalsubstringlen=0; /* test for mismatch and compute */ if(substringlen!=1) { sa = Srcm1p->score+submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; bestsubstringlen=1; } /* compute first extended match */ if((substringlen!=0)&&(substringlen<=limit)) { totalsubstringlen=substringlen; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if((totalsubstringlen==1)||(stemp>sa)) /* better than mismatch score */ { sa=stemp; bestsubstringlen=totalsubstringlen; if(totalsubstringlen==1) satype=DIAGONAL; else satype=LONGDIAGONAL; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } while(substringlen!=0) { /* check that length isn't too long */ totalsubstringlen+=substringlen; if(totalsubstringlen>limit) break; /* some complex match scoring functions require the Common composition of the matching substrings. Note that since the substrings match, we calculate the composition of only one*/ for(k=0;kscore, row,col,totalsubstringlen,matchscore,matchscore); if(stemp>sa) { sa=stemp; satype=LONGDIAGONAL; bestsubstringlen=totalsubstringlen; } /* get next substring length looking backwards along diagonal */ substringlen=(Mp-totalsubstringlen*(len2+2))->substring_match_length; } /* store best align length */ Mp->align_match_length=bestsubstringlen; /* if((substringlen=*Mp)>1) { sa=(Sp-substringlen*(len2+2))->score+substringlen*matchscore; satype=LONGDIAGONAL; } else { sa = Srcm1p->score+ submatrix[26*(seq1[row-1]-'A')+(seq2[col-1]-'A')]; satype=DIAGONAL; } */ switch(max3switch(sa,eb,fb)) { case 1: Sp->score = sa; Sp->direction = satype; /* DIAGONAL or LONGDIAGONAL */ break; case 2: Sp->score = eb; Sp->direction = RIGHT; break; case 3: Sp->score = fb; Sp->direction = DOWN; break; } /* update pointers */ Sp++; Srm1p++; Scm1p++; Srcm1p++; Mp++; } /* store bestscore in final column */ if( (Sp-1)->score > bestscore ) { bestscore=(Sp-1)->score; bestscorerow=row; } } /* get the length of the alignment */ Sp = S+(len2+1)*bestscorerow+len2; /* set at bestscore element in final column */ Mp = M+(len2+1)*bestscorerow+len2; /* find out how long the alignment is */ length = 0; while(Sp->direction!=START) { /* length ++; */ switch(Sp->direction) { case START: break; case DIAGONAL: Sp = Sp -(len2+2); Mp = Mp -(len2+2); length++; break; case LONGDIAGONAL: substringlen=Mp->align_match_length; Sp=Sp-substringlen*(len2+2); Mp=Mp-substringlen*(len2+2); length+=substringlen; break; case RIGHT: Sp = Sp - 1; Mp = Mp - 1; length++; break; case DOWN: Sp = Sp - (len2+1); Mp = Mp - (len2+1); length++; break; } } /* allocate memory */ ap = (COMPOSITIONALIGNPAIR*) malloc(sizeof(COMPOSITIONALIGNPAIR)); if(ap==NULL) return NULL; ap->sequence1side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence1side==NULL) { free(ap); return NULL;} ap->sequence2side = (char*) malloc((length+1)*sizeof(char)); if(ap->sequence2side==NULL) { free(ap->sequence1side); free(ap); return NULL; } ap->matchboundaries = (char*) malloc((length+1)*sizeof(char)); if(ap->matchboundaries==NULL) { free(ap->sequence2side); free(ap->sequence1side); free(ap); return NULL; } /*************************** * trace back and fill in ap ****************************/ ap->length = length; Sp = S+(len2+1)*bestscorerow+len2; /* set at bestscore element in final column */ Mp = M+(len2+1)*bestscorerow+len2; ap->score = Sp->score; ap->sequence1end = bestscorerow; ap->sequence2end = len2; ap->sequence2start = 1; seq1pos = seq1+bestscorerow-1; /* position at bestscore character */ seq2pos = seq2+len2-1; seq1sidepos = ap->sequence1side+length; /* position at termination */ seq2sidepos = ap->sequence2side+length; *seq1sidepos=*seq2sidepos='\0'; /* terminate strings */ seq1sidepos--; /* move down to last char */ seq2sidepos--; mbpos=ap->matchboundaries+length; /* matchboundaries string */ *mbpos='\0'; mbpos--; /* Sp has ben set to bestscore element above */ while(Sp->direction!=START) { switch(Sp->direction) { case DIAGONAL: *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(*seq1sidepos==*seq2sidepos) *mbpos='|'; else *mbpos=' '; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); break; case LONGDIAGONAL: substringlen=Mp->align_match_length; for(i=1;i<=substringlen;i++) { *seq1sidepos = *seq1pos; *seq2sidepos = *seq2pos; if(i==1) *mbpos='>'; else if (i==substringlen) *mbpos='<'; else *mbpos='-'; seq1pos--;seq2pos--;seq1sidepos--;seq2sidepos--; mbpos--; Sp = Sp -(len2+2); Mp = Mp -(len2+2); } break; case RIGHT: *seq1sidepos='-'; *seq2sidepos=*seq2pos; *mbpos=' '; seq1sidepos--; seq2sidepos--; seq2pos--; mbpos--; Sp = Sp - 1; Mp = Mp - 1; break; case DOWN: *seq1sidepos=*seq1pos; *seq2sidepos='-'; *mbpos=' '; seq1sidepos--; seq1pos--; seq2sidepos--; mbpos--; Sp = Sp - (len2+1); Mp = Mp - (len2+1); break; } } /* count characters in alignment to find start for sequence1 */ length=0; for(row=0;rowlength;row++) if(ap->sequence1side[row]!='-') length++; ap->sequence1start = ap->sequence1end-length+1; /* free alignment matrix */ free(S); return ap; } /***********************************************************************/ COMPOSITIONVECTOR* CreateSequenceCompositionVectorArray(char *sequence, int length, int ALPHABETSIZE) /* Creates an array of composition vectors, one for each prefix of the sequence. Note: Composition vector is zero based. Note Array is one based. the zero element is the composition of the null sequence */ { __int64 VlenCheck; //to check for overflow int g,k; COMPOSITIONVECTOR *Vprev, *Vcurr, *V; int *Index; /* this check was put here because of a problem of overflow when calculating the Vlen and malloc() was just using the smaller number that came out as a result. A very hard and pesky error. Gelfand. */ VlenCheck=(__int64)(length+1)*(__int64)(sizeof(COMPOSITIONVECTOR)); if (VlenCheck>(__int64)INT_MAX) { return NULL; } /* allocate the composition vector array */ V = (COMPOSITIONVECTOR*) malloc( sizeof(COMPOSITIONVECTOR) * (length+1) ); if(V==NULL) { return NULL; /* in case memory allocation fails */ } /* create index */ Index=(int *)calloc(256,sizeof(int)); if(Index==NULL){return NULL;} if(ALPHABETSIZE == 5) { Index['A']=0; Index['C']=1; Index['G']=2; Index['T']=3; Index['X']=4; } else { Index['A']=0; Index['B']=1; Index['C']=2; Index['D']=3; Index['E']=4; Index['F']=5; Index['G']=6; Index['H']=7; Index['I']=8; Index['J']=9; Index['K']=10; Index['L']=11; Index['M']=12; Index['N']=13; Index['O']=14; Index['P']=15; Index['Q']=16; Index['R']=17; Index['S']=18; Index['T']=19; Index['U']=20; Index['V']=21; Index['W']=22; Index['X']=23; Index['Y']=24; } /* fill zero entry */ for(k=0;kC[k]=Vprev->C[k]; } /* and then adjusted for new character */ Vcurr->C[Index[sequence[g-1]]]++; } /* test print vector array */ /* printf("\n\n"); for(k=0;ksequence1end-ap->sequence1start+1; len2=ap->sequence2end-ap->sequence2start+1; V1=CreateSequenceCompositionVectorArray(seq1 + ap->sequence1start - 1,len1,Alphabetsize); V2=CreateSequenceCompositionVectorArray(seq2 + ap->sequence2start - 1,len2,Alphabetsize); CalculateCommonCompositionFromSequenceCompVecs(V1,V2,len1,len2,Alphabetsize); free(V1); free(V2); } void CalculateBackgroundComposition(char *seq1, char *seq2, int len1, int len2, int Alphabetsize) /* puts composition in Common */ { COMPOSITIONVECTOR *V1,*V2; /* compostitional vector */ V1=CreateSequenceCompositionVectorArray(seq1,len1,Alphabetsize); V2=CreateSequenceCompositionVectorArray(seq2,len2,Alphabetsize); CalculateBackgroundCompositionFromSequenceCompVecs(V1,V2,len1,len2,Alphabetsize); free(V1); free(V2); } void PrintBackgroundAndCommonComposition(int Alphabetsize) { int i; printf("\nBackground: "); for(i=0;i