#ifndef COMP_ALIGN_H
#define COMP_ALIGN_H

/* This file contains some untility functions for compositional alignment.
 * The real alignment routines are in the align.h */

/* originally started 1/10/03 */
//This program given two regular DNA sequences, transforms them into DiNucleotide sequences
//of an alphabet of size 25 and then performs alignments on them.  It then reconverts the 
//DiNucleotide sequence into a DNA sequence with each DiNucleotide letter represented as 
//two DNA letters stacked on top of each other.  i.e. A in dinuc is "AA" in DNA and 
//Q in dinuc is the same as "TC" in DNA.
#include<stdlib.h>
#include<stdio.h>
#include<math.h>
#include <ctype.h>
#include <string.h>


#define ROWS 5
#define COLS 5
//#define MAXSEQLEN 200
#define DIALPHSIZE 25 //size of dinucleotide alphabet
#define DNAALPHSIZE 5 //size of regular dna alphabet including 'X'

/*Note -- the lengths in this structure refer to string lengths without nulls*/

typedef struct { //Holds dna and dinuc sequences and their lengths
	int dinuclength;
	int dnatranslength;
	char *dinucseq;
	char *dnatransseq;
}SEQUENCES;

/*function prototypes*/
int GetDinSeq (SEQUENCES *pseqs, char dnastr[], int dnastrlen);//calculates dinuc seq
int GetDnaSeq (SEQUENCES *pseqs);//calculates dna seq based on dinuc seq
int GetIndex(char seqletter);//helper function
char GetSeqLetter(int indexnum);//helper function
void findLetters(char letter, char first[], char second[], int index);//finds 2 letter dna sequence given a dinuc letter
/*********************************************************************************
 * Transforms a dna sequence into a dinucleotide sequence, and then transforms 
 * the dinucleotide sequence back into the dna sequence.
 * Note -- the dna sequence that is returned may differ from the original input because:
 * 1) All letters returned are capitalized.
 * 2) All 'bad' nucleotides (i.e. not 'A', 'C', 'G', 'T') are returned as the letter 'X'.
 * Example:   aCTrAG (dna) --> BITUC (dinuc) --> ACTXAG (dna).
 * Memory allocated for BOTH sequences must be freed after use.
 *********************************************************************************/

int GetDinSeq (SEQUENCES *ptseqs, char dnastr[], int dnastrlen) {
	
	/*Given an input dna string, convert to a dinucleotide string.
	 *Returns -1 if error in memory allocation occurs.
	 */
	char first, second;
	char *ptdinseq;
	char tabtransform[ROWS][COLS] = {
		{'A', 'B', 'C', 'D', 'E'},
		{'F', 'G', 'H', 'I', 'J'},
		{'K', 'L', 'M', 'N', 'O'},
		{'P', 'Q', 'R', 'S', 'T'},
		{'U', 'V', 'W', 'X', 'Y'}
	};
	int rownum = 0, colnum =0, i=0, j=0;

	/*assign dinucleotide length to structure
	 *dinucleotide length is always 1 less than its corresponding dna string length
	 */
	ptseqs ->dinuclength = dnastrlen-1;

	/*allocate memory for the dinucleotide sequence including space for null character*/
	ptseqs->dinucseq = (char *)malloc(sizeof(char) * (dnastrlen));
	if (ptseqs->dinucseq == NULL) {
		printf("Could  not allocate memory for dinucleotide sequence\n");
		return -1;			
	}
	
	/*convert dna sequence to a dinucleotide sequence*/
	for (i=0, ptdinseq = ptseqs->dinucseq; i<(ptseqs->dinuclength); i++, ptdinseq++) {
		first = toupper(dnastr[i]);
		second = toupper(dnastr[i+1]);
		rownum = GetIndex(first);
		colnum = GetIndex(second);
		*ptdinseq = tabtransform[rownum][colnum];		
	}

	/*add null to string*/
	*ptdinseq = '\0';
	
	return 0;
}

int GetDnaSeq (SEQUENCES *ptseqs) {
	/*Converts a dinucleotide string from the SEQUENCES structure to a dna string. 
	 *Returns -1 if error in memory allocation occurs.
	 */
	int i=0, j=0;
	int rownumber = 0;
	int dnalength =0;
	char *ptdnaseq;

	/*assign dna length to structure
	 *dna length is always dinucleotide length +1
	 */
	dnalength = ptseqs->dinuclength +1;
	ptseqs->dnatranslength = dnalength;
	
	/*allocate memory for the dna sequence (including space for null character)*/
	ptseqs->dnatransseq = (char *)malloc(sizeof(char) * (dnalength+1));
	if (ptseqs->dnatransseq == NULL) {
		printf("Could  not allocate memory for dna sequence\n");
		return -1;			
	}
	
	ptdnaseq = ptseqs->dnatransseq;
	
	rownumber = (int)(floor((ptseqs->dinucseq[i] - 'A')/COLS));
	ptdnaseq[0] = GetSeqLetter(rownumber);
	
	for(i=0, j=1; i<(ptseqs->dinuclength); i++, j++) {
		ptdnaseq[j] = GetSeqLetter((ptseqs->dinucseq[i] - 'A')%COLS);
	}
	/*add null to string*/
	ptdnaseq[j] = '\0';

	return 0;
}

int GetIndex(char seqletter) {

	/*Transforms a nucleotide letter to its corresponding index number*/
	int indexnum;
	
	switch (seqletter) {

	case 'A':
		indexnum = 0;
		break;
	
	case 'C':
		indexnum = 1;
		break;

	case 'G':
		indexnum = 2;
		break;

	case 'T':
		indexnum = 3;
		break;
	
	default:
		indexnum = 4;
		break;
	}
	
	return indexnum;
}

char GetSeqLetter(int indexnum) {
	
	/*Transforms an index number to its corresponding sequence letter*/
	char seqletter[5] = {'A', 'C', 'G', 'T', 'X'};

	return seqletter[indexnum];
}

void findLetters(char letter, char first[], char second[], int index)
{	
	int rownumber;
		
	if (letter == '-')
	{
		first[index] = '-';
		second[index] = '-';
	}
	else
	{
		rownumber = (int)(floor((letter - 'A')/COLS));//look at tabtransform matrix at GetDinSeq function to understand this
		first[index] = GetSeqLetter(rownumber);
		second[index] = GetSeqLetter((letter - 'A')%COLS);
	}
	//dinuc to dna looks like this basically:
	//A in dinuc == "AA" in dna
	//B in dinuc == "AC" in dna
	//C in dinuc == "AG" in dna
	//D in dinuc == "AT" in dna
	//E in dinuc == "AX" in dna....where X is any character other than ATCG 
	//.....this goes all the way up to Y which represents "XX"

}

#endif