#ifndef COMP_ALIGN_H #define COMP_ALIGN_H /* This file contains some untility functions for compositional alignment. * The real alignment routines are in the align.h */ /* originally started 1/10/03 */ //This program given two regular DNA sequences, transforms them into DiNucleotide sequences //of an alphabet of size 25 and then performs alignments on them. It then reconverts the //DiNucleotide sequence into a DNA sequence with each DiNucleotide letter represented as //two DNA letters stacked on top of each other. i.e. A in dinuc is "AA" in DNA and //Q in dinuc is the same as "TC" in DNA. #include #include #include #include #include #define ROWS 5 #define COLS 5 //#define MAXSEQLEN 200 #define DIALPHSIZE 25 //size of dinucleotide alphabet #define DNAALPHSIZE 5 //size of regular dna alphabet including 'X' /*Note -- the lengths in this structure refer to string lengths without nulls*/ typedef struct { //Holds dna and dinuc sequences and their lengths int dinuclength; int dnatranslength; char *dinucseq; char *dnatransseq; }SEQUENCES; /*function prototypes*/ int GetDinSeq (SEQUENCES *pseqs, char dnastr[], int dnastrlen);//calculates dinuc seq int GetDnaSeq (SEQUENCES *pseqs);//calculates dna seq based on dinuc seq int GetIndex(char seqletter);//helper function char GetSeqLetter(int indexnum);//helper function void findLetters(char letter, char first[], char second[], int index);//finds 2 letter dna sequence given a dinuc letter /********************************************************************************* * Transforms a dna sequence into a dinucleotide sequence, and then transforms * the dinucleotide sequence back into the dna sequence. * Note -- the dna sequence that is returned may differ from the original input because: * 1) All letters returned are capitalized. * 2) All 'bad' nucleotides (i.e. not 'A', 'C', 'G', 'T') are returned as the letter 'X'. * Example: aCTrAG (dna) --> BITUC (dinuc) --> ACTXAG (dna). * Memory allocated for BOTH sequences must be freed after use. *********************************************************************************/ int GetDinSeq (SEQUENCES *ptseqs, char dnastr[], int dnastrlen) { /*Given an input dna string, convert to a dinucleotide string. *Returns -1 if error in memory allocation occurs. */ char first, second; char *ptdinseq; char tabtransform[ROWS][COLS] = { {'A', 'B', 'C', 'D', 'E'}, {'F', 'G', 'H', 'I', 'J'}, {'K', 'L', 'M', 'N', 'O'}, {'P', 'Q', 'R', 'S', 'T'}, {'U', 'V', 'W', 'X', 'Y'} }; int rownum = 0, colnum =0, i=0, j=0; /*assign dinucleotide length to structure *dinucleotide length is always 1 less than its corresponding dna string length */ ptseqs ->dinuclength = dnastrlen-1; /*allocate memory for the dinucleotide sequence including space for null character*/ ptseqs->dinucseq = (char *)malloc(sizeof(char) * (dnastrlen)); if (ptseqs->dinucseq == NULL) { printf("Could not allocate memory for dinucleotide sequence\n"); return -1; } /*convert dna sequence to a dinucleotide sequence*/ for (i=0, ptdinseq = ptseqs->dinucseq; i<(ptseqs->dinuclength); i++, ptdinseq++) { first = toupper(dnastr[i]); second = toupper(dnastr[i+1]); rownum = GetIndex(first); colnum = GetIndex(second); *ptdinseq = tabtransform[rownum][colnum]; } /*add null to string*/ *ptdinseq = '\0'; return 0; } int GetDnaSeq (SEQUENCES *ptseqs) { /*Converts a dinucleotide string from the SEQUENCES structure to a dna string. *Returns -1 if error in memory allocation occurs. */ int i=0, j=0; int rownumber = 0; int dnalength =0; char *ptdnaseq; /*assign dna length to structure *dna length is always dinucleotide length +1 */ dnalength = ptseqs->dinuclength +1; ptseqs->dnatranslength = dnalength; /*allocate memory for the dna sequence (including space for null character)*/ ptseqs->dnatransseq = (char *)malloc(sizeof(char) * (dnalength+1)); if (ptseqs->dnatransseq == NULL) { printf("Could not allocate memory for dna sequence\n"); return -1; } ptdnaseq = ptseqs->dnatransseq; rownumber = (int)(floor((ptseqs->dinucseq[i] - 'A')/COLS)); ptdnaseq[0] = GetSeqLetter(rownumber); for(i=0, j=1; i<(ptseqs->dinuclength); i++, j++) { ptdnaseq[j] = GetSeqLetter((ptseqs->dinucseq[i] - 'A')%COLS); } /*add null to string*/ ptdnaseq[j] = '\0'; return 0; } int GetIndex(char seqletter) { /*Transforms a nucleotide letter to its corresponding index number*/ int indexnum; switch (seqletter) { case 'A': indexnum = 0; break; case 'C': indexnum = 1; break; case 'G': indexnum = 2; break; case 'T': indexnum = 3; break; default: indexnum = 4; break; } return indexnum; } char GetSeqLetter(int indexnum) { /*Transforms an index number to its corresponding sequence letter*/ char seqletter[5] = {'A', 'C', 'G', 'T', 'X'}; return seqletter[indexnum]; } void findLetters(char letter, char first[], char second[], int index) { int rownumber; if (letter == '-') { first[index] = '-'; second[index] = '-'; } else { rownumber = (int)(floor((letter - 'A')/COLS));//look at tabtransform matrix at GetDinSeq function to understand this first[index] = GetSeqLetter(rownumber); second[index] = GetSeqLetter((letter - 'A')%COLS); } //dinuc to dna looks like this basically: //A in dinuc == "AA" in dna //B in dinuc == "AC" in dna //C in dinuc == "AG" in dna //D in dinuc == "AT" in dna //E in dinuc == "AX" in dna....where X is any character other than ATCG //.....this goes all the way up to Y which represents "XX" } #endif