#ifndef _LBI_SEQUENCE
#define _LBI_SEQUENCE

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * One FASTA record, stored as a node of a singly linked list.
 *
 * header and sequence are NOT separately allocated: they point into the
 * text buffer that was handed to GetSequencesOB(), so they stay valid only
 * as long as that buffer does.
 */
typedef struct tagLBISEQUENCE
{
    char *header;                /* text after '>' up to the end of that line */
    int length;                  /* number of letters kept in sequence        */
    char *sequence;              /* NUL-terminated letters, compacted in place */
    struct tagLBISEQUENCE *next; /* next record, or NULL for the last one      */
} LBISEQUENCE;

/*
 * Use this interface when you want to read from a file.
 * Don't forget to free() the storage after you are done with the
 * sequences, and release the list itself with FreeSequencesOB().
 */
LBISEQUENCE *ReadSequencesFile(char *inputFile, char **storage);

/* Use this interface when you want to read from memory. */
LBISEQUENCE *GetSequencesOB(char *pbuff);

/* Releases the list nodes returned by either reader. */
void FreeSequencesOB(LBISEQUENCE *head);

/*******************************************************************************************
 * Reads multiple sequences in FASTA format from a file.
 *
 * The whole file is loaded into one heap buffer, which GetSequencesOB()
 * then parses in place.
 *
 * inputFile  path of the file to read
 * storage    on success receives the heap buffer backing the returned
 *            records; the caller must free() it once the records are no
 *            longer needed. It is left untouched on failure.
 *
 * Returns the head of the record list, or NULL if an argument is NULL, the
 * file cannot be opened, sized or fully read, memory runs out, or the
 * content does not parse (see GetSequencesOB).
 */
LBISEQUENCE *ReadSequencesFile(char *inputFile, char **storage)
{
    LBISEQUENCE *seqs;  /* parsed records */
    char *pbuff;        /* physical sequence storage */
    long fileSize;
    size_t bufferSize;
    FILE *fp;

    if (inputFile == NULL || storage == NULL)
    {
        return NULL;
    }

    /* "rb" gives untranslated bytes on every platform, so the size reported
     * by ftell() matches what fread() delivers. */
    fp = fopen(inputFile, "rb");
    if (fp == NULL)
    {
        return NULL;
    }

    /* Measure the file; any failure here would otherwise turn into a bogus
     * allocation size below. */
    if (fseek(fp, 0, SEEK_END) != 0 ||
        (fileSize = ftell(fp)) < 0 ||
        fseek(fp, 0, SEEK_SET) != 0)
    {
        fclose(fp);
        return NULL;
    }
    bufferSize = (size_t)fileSize;

    /* One extra byte for the terminating NUL. The cast keeps this header
     * usable from C++ translation units as well. */
    pbuff = (char *)malloc(bufferSize + 1);
    if (pbuff == NULL)
    {
        fclose(fp);
        return NULL;
    }

    if (fread(pbuff, 1, bufferSize, fp) != bufferSize)
    {
        fclose(fp);
        free(pbuff);
        return NULL;
    }
    fclose(fp);
    pbuff[bufferSize] = '\0';

    /* Parse the sequence file. */
    seqs = GetSequencesOB(pbuff);
    if (seqs == NULL)
    {
        /* No valid sequences in the buffer. */
        free(pbuff);
        return NULL;
    }

    /* Hand the buffer to the caller so it can be freed later. */
    *storage = pbuff;
    return seqs;
}

/*******************************************************************************************
 * Parses a NUL-terminated FASTA text buffer and returns a list of sequence
 * structures whose pointers are set into the parsed buffer (OB = on buffer).
 *
 * The buffer is modified: each header line is NUL-terminated where its line
 * break was, and each sequence body is compacted in place so that only the
 * letters A-Z / a-z remain, followed by a NUL. Letter case is preserved.
 *
 * Returns NULL if pbuff is NULL, if the first printable character is not
 * '>', if a header has no line break after it, if any record has no
 * letters in its body, or if a node cannot be allocated.
 */
LBISEQUENCE *GetSequencesOB(char *pbuff)
{
    char *src;
    char *dst;
    LBISEQUENCE *head = NULL;
    LBISEQUENCE *tail = NULL;
    LBISEQUENCE *current;
    int atLineStart;

    if (pbuff == NULL)
    {
        return NULL;
    }

    /* Skip until the first printable character. The cast is required:
     * passing a negative char to isprint() is undefined behaviour. */
    src = pbuff;
    while (*src != '\0' && !isprint((unsigned char)*src))
    {
        src++;
    }

    /* If it is not a header marker there is nothing to parse. */
    if (*src != '>')
    {
        return NULL;
    }

    /* Each pass starts with src on a '>' and consumes one record. */
    for (;;)
    {
        current = (LBISEQUENCE *)malloc(sizeof *current);
        if (current == NULL)
        {
            FreeSequencesOB(head);
            return NULL;
        }
        current->header = NULL;
        current->sequence = NULL;
        current->length = 0;
        current->next = NULL;

        /* Append to the list. */
        if (head == NULL)
        {
            head = current;
        }
        else
        {
            tail->next = current;
        }
        tail = current;

        /* The header is the text right after the marker ... */
        src++;
        current->header = src;

        /* ... up to the first CR or LF. */
        while (*src != '\r' && *src != '\n' && *src != '\0')
        {
            src++;
        }

        /* Protect against input that ends inside a header. */
        if (*src == '\0')
        {
            FreeSequencesOB(head);
            return NULL;
        }
        *src = '\0';
        src++;

        /* Compact the body in place: dst trails src and receives letters. */
        current->sequence = src;
        dst = src;

        /* The header's line break has just been overwritten with NUL, so
         * "previous byte was a newline" cannot be read back from the buffer;
         * remember it explicitly instead. */
        atLineStart = 1;
        while (*src != '\0')
        {
            /* NOTE: letters are deliberately not upper-cased, so that
             * lower-case (repeat-masked) regions survive. */
            if ((*src >= 'A' && *src <= 'Z') || (*src >= 'a' && *src <= 'z'))
            {
                *dst = *src;
                dst++;
                current->length++;
            }
            else if (*src == '>' && atLineStart)
            {
                /* A '>' at the start of a line begins the next record. */
                break;
            }
            atLineStart = (*src == '\n' || *src == '\r');
            src++;
        }

        /* Return no sequences at all if a record has an empty body. */
        if (current->length == 0)
        {
            FreeSequencesOB(head);
            return NULL;
        }

        /* Terminate the sequence text. dst is always behind src here when
         * src sits on '>', so the marker is not clobbered. */
        *dst = '\0';

        if (*src != '>')
        {
            break;
        }
    }

    return head;
}

/*******************************************************************************************
 * Frees every node of the list starting at head (NULL is accepted).
 *
 * Only the nodes are released; the header/sequence text lives in the
 * buffer that was parsed and must be freed separately by its owner.
 */
void FreeSequencesOB(LBISEQUENCE *head)
{
    LBISEQUENCE *current;

    while (head != NULL)
    {
        current = head;
        head = head->next;
        free(current);
    }
}

#endif