#include #include #include #include #include #include #include #include //#include "memorycheck.h" /************************************************************************* * * cau.c : Compostitional Alignment Utility * Copyright (c) 2003, Dr. Gary Benson * All rights reserved. * * This program takes a multiple sequence FASTA * file and aligns either pairs of seqs (odds to evens), * or everything against each other (based on flags) * using different kinds of compositional alignments. * * Note: each sequence is uppercased and all leading * and trailing Ns are removed. * * Created On : April 28, 2003. * Project Development : Gary Benson, Yevgeniy Gelfand, Alfredo Rodriguez, Madhu * Last Revision Date : October 21, 2003. * *************************************************************************/ #define VSNN -999999 /* for compatability */ void LBI_error(char *in) { printf("\n%s",in); } void LBI_exit(int code) { exit(code); } void PrintErrorAndQuit(char* in) { printf("\n%s",in); exit(0); } #include "lbisequence.h" #include "align.h" #include "comp.alignment.h" #include "complex.match.function.h" #include "kbest.local.composition.alignment.h" /* for speed reasons */ #include "simple.match.function.h" void compareTwoSequences( LBISEQUENCE *seq1, LBISEQUENCE *seq2 , int isDiNuc, int limit, int alpha,int gapextend,int *sm, int matchFunc, int isLocal, int minScore, int ratio, double ratioDouble ); /* used as arrays to hold resulting dna sequences converted from dinuc seqs */ char *Afirst; char *Asecond; char *Bfirst; char *Bsecond; /*******************************************************************************************************/ int main( int argc, char *argv[] ) { LBISEQUENCE *seqs, *curr, *curr2; char *pbuff, *src, *src2; int *sm; /* must be recieved from the command line */ int gapstart = 0; /* not used in this program */ int isDiNuc; int match, mismatch, gapextend, limit; int matchFunc; int isLocal; int isPaired; int minscore = VSNN; int ratio = 0; double ratioDouble; char minscoreString[100]="", ratioString[100]=""; /* check if all command line vars are there */ if ( argc < 10 ) { printf("\n\nPlease use: %s File Match Mismatch Delta Limit isDiNuc MatchFunc IsLocal IsPaired [-m (minscore)] [-r (scoreratio)] \n", argv[0]); printf("\nWhere: (all weights, penalties, and scores are positive)"); printf("\n File = multiple sequences input file"); printf("\n Match = matching weight"); printf("\n Mismatch = mismatching penalty"); printf("\n Delta = indel penalty"); printf("\n Limit = how many characters can be scrambled at a time"); printf("\n isDiNuc = 1 for nucleartide, 2 for dinucleartide"); printf("\n MatchFunc = function number between 1 and 6"); printf("\n IsLocal = 0 for Global, 1 for Local, 2 (or anything else) for PatternGlobalTextLocal "); printf("\n IsPaired = 0 to align everything against everything, 1 (or anything else) to align pairs from the input file,"); printf("\n -m (minscore) = use this switch to provide a minimum compositional score to report an alignment"); printf("\n -r (scoreratio) = use this switch to indicate a minimum composition/basic alignment score ratio to report an alignment"); printf("\n"); printf("\nNote the sequence file should be in FASTA format:"); printf("\n"); printf("\n>Name of sequence1"); printf("\n aggaaacctg ccatggcctc ctggtgagct gtcctcatcc actgctcgct gcctctccag"); printf("\n atactctgac ccatggatcc cctgggtgca gccaagccac aatggccatg gcgccgctgt"); printf("\n actcccaccc gccccaccct cctgatcctg ctatggacat ggcctttcca catccctgtg..."); printf("\n>Name of sequence2"); printf("\n aggaaacctg ccatggcctc ctggtgagct gtcctcatcc actgctcgct gcctctccag"); printf("\n atactctgac ccatggatcc cctgggtgca gccaagccac aatggccatg gcgccgctgt"); printf("\n actcccaccc gccccaccct cctgatcctg ctatggacat ggcctttcca catccctgtg..."); printf("\n"); printf("\nNote for PatternGlobalTextLocal alignment, pattern is the second sequence:"); printf("\n"); printf("\nNote match functions are as follows:"); printf("\n"); printf("\n Function 1:"); printf("\n The simplest function, a constant times the length of the match."); printf("\n Function 2:"); printf("\n Square root of length of extended match times a constant."); printf("\n Function 3:"); printf("\n Log base 2 of length+1 of extended match times a constant."); printf("\n Function 4:"); printf("\n Relative entropy of substring composition with respect to background"); printf("\n composition times length of extended match times a constant."); printf("\n Function 5:"); printf("\n Relative entropy of substring composition with respect to background"); printf("\n composition times length of extended match times a constant. For"); printf("\n length =1, normal match value prevents two identical sequences"); printf("\n composed of only one letter from scoring zero if the background is"); printf("\n the same letter. "); printf("\n Function 6:"); printf("\n Shannon-Jensen entropy of substring composition versus the"); printf("\n background composition times length of extended match times a"); printf("\n constant."); printf("\n"); exit(0); } if (argc >= 12 ) { if (0 == stricmp( argv[10], "-m")) { minscore = atoi( argv[11] ); sprintf(minscoreString,"minscore=%d ", minscore); } if (0 == stricmp( argv[10], "-r")) { ratioDouble = atof( argv[11] ); ratio = 1; sprintf(ratioString,"min_c/b_ratio=%.2lf ", ratioDouble); } } if (argc >= 14 ) { if (0 == stricmp( argv[10], "-m")) { minscore = atoi( argv[11] ); sprintf(minscoreString,"minscore=%d ", minscore); } if (0 == stricmp( argv[10], "-r")) { ratioDouble = atof( argv[11] ); ratio = 1; sprintf(ratioString,"min_c/b_ratio=%.2lf ", ratioDouble); } if (0 == stricmp( argv[12], "-m")) { minscore = atoi( argv[13] ); sprintf(minscoreString,"minscore=%d ", minscore); } if (0 == stricmp( argv[12], "-r")) { ratioDouble = atof( argv[13] ); ratio = 1; sprintf(ratioString,"min_c/b_ratio=%.2lf ", ratioDouble); } } /* get the variables from the commandline */ match = atoi( argv[2] ); mismatch = atoi( argv[3] ); gapstart = 0; gapextend = atoi( argv[4] ); limit = atoi( argv[5] ); isDiNuc = atoi( argv[6] ); if ( limit < 1 ) PrintErrorAndQuit("\nERROR: Limit must be larger than or equal to 1. Aborting!"); if ( isDiNuc != 1 && isDiNuc != 2 ) PrintErrorAndQuit("\nERROR: Invalid entry for isDiNuc. Aborting!"); matchFunc = atoi( argv[7] ); isLocal = atoi( argv[8] ); isPaired = atoi( argv[9] ); /* set the correct match function */ printf( "Compostition Alignment Utility V1.00\n" ); printf( "Copyright (c) 2003, Dr. Gary Benson\n" ); printf( "All rights reserved.\n\n" ); printf( "Using the following parameters: match=%d mismatch=%d indel=%d limit=%d isDiNuc=%d matchfunc=%d %s%s%s.", match, mismatch, gapextend, limit, isDiNuc, matchFunc, minscoreString, ratioString, isLocal ? "Using Local alignment" : "Using Global alignment" ); /* correct some values */ if( match < 0 ) match = -match; if( mismatch > 0 ) mismatch = -mismatch; if( gapstart > 0 ) gapstart = -gapstart; if( gapextend > 0 ) gapextend = -gapextend; /* create the substitution matrix */ sm = CreateSubstitutionMatrix(match,mismatch); if ( NULL == sm ) PrintErrorAndQuit("\nERROR: Could not get the similarity matrix. Aborting!"); /* parse the sequence file */ seqs = ReadSequencesFile( argv[1], &pbuff ); if( NULL == seqs ) { /* if no valid sequences in buffer, print error */ PrintErrorAndQuit( "\nERROR: Invalid format detected in sequence file! Make sure all sequences " "in the file contain a FASTA header. Aborting!"); } /* at least 2 must exist */ if ( NULL == seqs->next ) { /* if only one sequence, output an errer */ PrintErrorAndQuit( "\nERROR: You need more than one sequence inside your file to use this tool. Aborting!"); } /* perform some sequence modifications */ for ( curr = seqs; NULL != curr; curr = curr->next ) { src = curr->sequence; /* uppercase sequence and replace wierd chars with Ns */ while ( 0 != *src ) { *src = toupper ( *src ); if ( strchr( "ACGTN", *src ) == NULL ) *src = 'N'; src++; } src2 = src - 1; src = curr->sequence; /* remove leading Ns */ while ( 'N' == *src && src != src2 ) { src++; curr->length--; } /* remove trailing Ns */ while ( 'N' == *src2 && src != src2 ) { src2--; curr->length--; } *(src2+1) = 0; curr->sequence = src; if ( curr->length >= MAXSEQSIZE ) { /* If one of the sequences is too large. */ PrintErrorAndQuit( "\nERROR: One of the sequences is too large. Aborting!"); } } /* I don't know yet how large these must be, so let's allocate the max possible */ Afirst = (char *)calloc( MAXSEQSIZE + 1,sizeof(char)); Asecond = (char *)calloc( MAXSEQSIZE + 1,sizeof(char)); Bfirst = (char *)calloc( MAXSEQSIZE + 1,sizeof(char)); Bsecond = (char *)calloc( MAXSEQSIZE + 1,sizeof(char)); if ( NULL == Afirst || NULL == Asecond || NULL == Bfirst || NULL == Bsecond ) { PrintErrorAndQuit("Memory error."); } /* perform the comparison for pairs */ if (isPaired) { for ( curr = seqs; curr != NULL; curr = curr->next ) { curr2 = curr->next; if ( NULL == curr2 ) PrintErrorAndQuit( "\nCompStudy: The number of sequences in the input file must be ODD. Aborting!"); // printf("\ncomparing: %s (%d chars) and %s (%d chars)", curr->header, curr->length, curr2->header, curr2->length ); fflush(stdout); compareTwoSequences( curr, curr2, isDiNuc, limit, gapstart, gapextend, sm, matchFunc, isLocal, minscore, ratio, ratioDouble ); curr = curr2; } /* else do everything again everything */ } else { for ( curr = seqs->next; curr != NULL; curr = curr->next ) { for ( curr2 = seqs; curr2 != curr; curr2 = curr2->next ) { // printf("\ncomparing: %s (%d chars) and %s (%d chars)", curr->header, curr->length, curr2->header, curr2->length ); fflush(stdout); compareTwoSequences( curr, curr2, isDiNuc, limit, gapstart, gapextend, sm, matchFunc, isLocal, minscore, ratio, ratioDouble ); } } } /* cleanup */ if ( NULL != pbuff ) free( pbuff ); if ( NULL != seqs ) FreeSequencesOB( seqs ); return 1; } /*******************************************************************************************************/ void compareTwoSequences( LBISEQUENCE *seq1, LBISEQUENCE *seq2 , int isDiNuc, int limit, int alpha,int gapextend,int *sm, int matchfunc, int isLocal, int minScore, int ratio, double ratioDouble ) { SUBSTRINGMATCHELEMENT *Mcomp, *McompForBasic; /* for substring composition matches */ COMPOSITIONVECTOR *Vcomp,*Vcomp2; /* compostitional vecor */ SEQUENCES myseqs, myseqs2; /* holds initial dna and dinuc seqs */ SEQUENCES *pseqs, *pseqs2; /* to hold dinuc representations and their backward translations */ /* 2 for Dinuceatide, 1 for regular DNA */ COMPOSITIONALIGNPAIR *cmpap = NULL,*basap = NULL; int i; int compScore; int dseqlength, dseq2length; /* lengths of aligments */ char typeStr[100]; int (*tempFunc)(int,int,int,int,int, int) = NULL; /* get the dinucleatide sequences */ pseqs = &myseqs; pseqs2 = &myseqs2; if (GetDinSeq (pseqs, seq1->sequence, seq1->length) == -1) { PrintErrorAndQuit("compareTwoSequences: Could not get the dinucleartide sequence for first sequence."); } if (GetDinSeq (pseqs2, seq2->sequence, seq2->length) == -1) { PrintErrorAndQuit("compareTwoSequences: Could not get the dinucleartide sequence for second sequence."); } /* get compositional substring matches */ if (isDiNuc==2) { // dinuc ALPHABETSIZ = DIALPHSIZE; Mcomp=GetCompositionSubstringMatchLengthsComplexMatchFunction(pseqs->dinucseq,pseqs2->dinucseq, pseqs->dinuclength, pseqs2->dinuclength,limit,ALPHABETSIZ); } else if (isDiNuc==1) { // regular ALPHABETSIZ = DNAALPHSIZE; Mcomp=GetCompositionSubstringMatchLengthsComplexMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,limit,ALPHABETSIZ); } else PrintErrorAndQuit("isDiNuc input error"); if (Mcomp==NULL) PrintErrorAndQuit("compareTwoSequences: Could not get the compositional match lengths. Aborting!"); /* create composition vector from the sequences */ if (isDiNuc==2) { Vcomp=CreateSequenceCompositionVectorArray( pseqs->dinucseq, pseqs->dinuclength, ALPHABETSIZ ); Vcomp2=CreateSequenceCompositionVectorArray( pseqs2->dinucseq, pseqs2->dinuclength, ALPHABETSIZ ); } else { Vcomp=CreateSequenceCompositionVectorArray( seq1->sequence, seq1->length, ALPHABETSIZ); Vcomp2=CreateSequenceCompositionVectorArray( seq2->sequence, seq2->length, ALPHABETSIZ); } if (Vcomp==NULL||Vcomp2==NULL) PrintErrorAndQuit("compareTwoSequences: Could not get the compositional vecotr arrays. Aborting!"); /* calculate background (needed for functions 4-6), but lets do it for all to display for the user */ //if ( matchfunc >= 4 ) { if (isDiNuc==2) CalculateBackgroundCompositionFromSequenceCompVecs(Vcomp,Vcomp2,pseqs->dinuclength,pseqs2->dinuclength,ALPHABETSIZ); else CalculateBackgroundCompositionFromSequenceCompVecs(Vcomp,Vcomp2,seq1->length,seq2->length,ALPHABETSIZ); //} /* get compositional alignment */ { /* set the correct function */ if ( 1 == matchfunc ) setMatchFunction(functionextendedmatch1); else if ( 2 == matchfunc ) setMatchFunction(functionextendedmatch2); else if ( 3 == matchfunc) setMatchFunction(functionextendedmatch3); else if ( 4 == matchfunc) setMatchFunction(functionextendedmatch4); else if ( 5 == matchfunc) setMatchFunction(functionextendedmatch5); else if ( 6 == matchfunc) setMatchFunction(functionextendedmatch6); else PrintErrorAndQuit("compareTwoSequences: Illegal function detected. Aborting!"); /* DINUC */ if (isDiNuc==2) { /* local */ if (isLocal) { if (isLocal==1) { if ( matchfunc <= 3) { cmpap = GetCompositionLocalAlignPairSimpleMatchFunction(pseqs->dinucseq,pseqs2->dinucseq, pseqs->dinuclength, pseqs2->dinuclength,alpha,gapextend,sm, limit, Mcomp); } else { cmpap = GetCompositionLocalAlignPairComplexMatchFunction(pseqs->dinucseq,pseqs2->dinucseq, pseqs->dinuclength, pseqs2->dinuclength,alpha,gapextend,sm, limit, Mcomp, Vcomp); } /* PatternGlobalTextLocal */ } else { if ( matchfunc <= 3) { cmpap = GetCompositionPatternGlobalTextLocalAlignPairSimpleMatchFunction(pseqs->dinucseq,pseqs2->dinucseq, pseqs->dinuclength, pseqs2->dinuclength,alpha,gapextend,sm, limit, Mcomp); } else { cmpap = GetCompositionPatternGlobalTextLocalAlignPairComplexMatchFunction(pseqs->dinucseq,pseqs2->dinucseq, pseqs->dinuclength, pseqs2->dinuclength,alpha,gapextend,sm, limit, Mcomp, Vcomp); } } /* global */ } else { if ( matchfunc <= 3) { cmpap = GetCompositionGlobalAlignPairSimpleMatchFunction(pseqs->dinucseq,pseqs2->dinucseq, pseqs->dinuclength, pseqs2->dinuclength,alpha,gapextend,sm, limit, Mcomp); } else { cmpap = GetCompositionGlobalAlignPairComplexMatchFunction(pseqs->dinucseq,pseqs2->dinucseq, pseqs->dinuclength, pseqs2->dinuclength,alpha,gapextend,sm, limit, Mcomp, Vcomp); } } /* only needed for display, remove if speed is important */ CalculateCompositionOfCompositionAlignment(pseqs->dinucseq,pseqs2->dinucseq,cmpap,DIALPHSIZE); /* DNA */ } else { /* local */ if (isLocal) { if (isLocal==1) { if ( matchfunc <= 3) { cmpap = GetCompositionLocalAlignPairSimpleMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, limit, Mcomp); } else { cmpap = GetCompositionLocalAlignPairComplexMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, limit, Mcomp, Vcomp); } /* PatternGlobalTextLocal */ } else { if ( matchfunc <= 3) { cmpap = GetCompositionPatternGlobalTextLocalAlignPairSimpleMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, limit, Mcomp); } else { cmpap = GetCompositionPatternGlobalTextLocalAlignPairComplexMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, limit, Mcomp, Vcomp); } } /* global */ } else { if ( matchfunc <= 3) { cmpap = GetCompositionGlobalAlignPairSimpleMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, limit, Mcomp); } else { cmpap = GetCompositionGlobalAlignPairComplexMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, limit, Mcomp, Vcomp); } } /* only needed for display, remove if speed is important */ CalculateCompositionOfCompositionAlignment(seq1->sequence,seq2->sequence,cmpap,DNAALPHSIZE); } if (cmpap==NULL) PrintErrorAndQuit("compareTwoSequences: Could not get the compositional alignment. Aborting!"); /* remember the score */ compScore = cmpap->score; } /* get the regular Global or Local alignment too (which is equivalent to running the comp alignment with function one and limit of 1) */ printf("\n"); // set new match function to 1 setMatchFunction(functionextendedmatch1); // change alphabetsize ALPHABETSIZ = DNAALPHSIZE; // get a new mcomp with limit of 1 McompForBasic = GetCompositionSubstringMatchLengthsComplexMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,1,DNAALPHSIZE); if (isLocal) { if (isLocal==1) strcpy(typeStr,"Composition Local"); else strcpy(typeStr,"Composition PatternGlobalTextLocal"); basap = GetCompositionLocalAlignPairSimpleMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, 1, McompForBasic); if ( NULL == basap ) PrintErrorAndQuit("Could not get the regular Local alignment. Aborting!"); } else { strcpy(typeStr,"Composition Global"); basap = GetCompositionGlobalAlignPairSimpleMatchFunction(seq1->sequence,seq2->sequence, seq1->length, seq2->length,alpha,gapextend,sm, 1, McompForBasic); if ( NULL == basap ) PrintErrorAndQuit("Could not get the regular Global alignment. Aborting!"); } /* print the output info (note: no flushing, speed is important )*/ //printf("%d %d %s %s\n",basap->score, compScore, seq1->header, seq2->header ); /* print the alignments */ if (minScore == VSNN || cmpap->score >= minScore) { /* minscore check */ if (!ratio || (( basap->score * ratioDouble) <= cmpap->score)) { /* ratio check */ dseqlength = strlen(cmpap->sequence1side); dseq2length = strlen(cmpap->sequence2side); if (isDiNuc==2) { for( i = 0; i < dseqlength; i++) //finds the two letters that represent a dinuc letter and puts it into a point in an array { findLetters(cmpap->sequence1side[i],Afirst,Asecond,i); } for (i = 0; i < dseq2length; i++) { findLetters(cmpap->sequence2side[i],Bfirst,Bsecond,i); } Afirst[i]=Bfirst[i]=Asecond[i]=Bsecond[i]=0; } if (isDiNuc==2) { printf("\n%s (dn):\n%s\n%s\n%s\nComposition score=%d\n", // print the dinucleartine representation typeStr,cmpap->matchboundaries,cmpap->sequence1side,cmpap->sequence2side,cmpap->score); printf("\n%s DNA:\n%s\n%s\n%s\n%s\n%s\nComposition score=%d (basic=%d)\n" // print the DNA representation "sequence 1: start=%4d end=%4d;\nsequence 2: start=%4d end=%4d", typeStr,Afirst,Asecond,cmpap->matchboundaries,Bfirst,Bsecond,cmpap->score,basap->score, cmpap->sequence1start,cmpap->sequence1end,cmpap->sequence2start,cmpap->sequence2end); PrintBackgroundAndCommonComposition(DIALPHSIZE); } else { printf("\n%s DNA:\n%s\n%s\n%s\nComposition score=%d (basic=%d)\n" // print the DNA representation "sequence 1: start=%4d end=%4d;\nsequence 2: start=%4d end=%4d", typeStr,cmpap->sequence1side,cmpap->matchboundaries,cmpap->sequence2side,cmpap->score,basap->score, cmpap->sequence1start,cmpap->sequence1end,cmpap->sequence2start,cmpap->sequence2end); PrintBackgroundAndCommonComposition(DNAALPHSIZE); } fflush(stdout); } } /* cleanup*/ FreeCompositionAlignPair(cmpap); FreeCompositionAlignPair(basap); free(pseqs->dinucseq); free(pseqs2->dinucseq); free(Mcomp); free(McompForBasic); free(Vcomp); free(Vcomp2); }