Main Page   Modules   Namespace List   Class Hierarchy   Compound List   File List   Namespace Members   Compound Members   Examples  

fasta_seq.hpp

#include <cstddef>
#include <fstream>
#include <stdexcept>
#include <string>
#include <strstream>
00004 
00005 namespace std
00006 {
00007   //template <typename CharT, typename CharTraits, typename seqtype>
00008   template <typename seqtype>
00009   ostream&
00010   operator<<(ostream& os, const bioinfo::fasta_seq<seqtype>& fasta_seq) {
00011     os<<">"<<fasta_seq.get_header()<<endl;
00012     const seqtype& seq = fasta_seq.get_seq();
00013     unsigned cnt = 0;
00014     const unsigned num_chars = 60;
00015     while(cnt < seq.size()) {
00016       for(unsigned i = cnt; i < num_chars && i < seq.size(); 
00017           i+=num_chars, cnt+=num_chars) {
00018         const seqtype& sub_seq = seq.substr(i,num_chars);
00019         os<<sub_seq;
00020       }
00021       os<<endl;
00022     }
00023     return os;
00024   }
00025   
00026   
00033   template <typename CharT, typename CharTraits, typename seqtype>
00034   basic_istream<CharT,CharTraits>& 
00035   operator>>(basic_istream<CharT,CharTraits>& is, bioinfo::fasta_seq<seqtype>& fasta_seq) {
00036     string* strPtr = new string;
00037     if( !(is>>*strPtr) ) {
00038       delete strPtr;
00039       return is;
00040     }
00041     if( (*strPtr)[0] != '>' ) {
00042       string tmp(*strPtr);
00043       delete strPtr;
00044       throw runtime_error("invalid fasta_seq format: " + tmp);
00045     }
00046     // not very efficient are we...
00047     // dump the ">"
00048     strPtr->erase(strPtr->begin());
00049     fasta_seq.set_header(strPtr);
00050     string str,sum;
00051     seqtype& seq = fasta_seq.get_seq();
00052     std::istream::pos_type pos = is.tellg(); 
00053     const typename CharTraits::int_type eof = CharTraits::eof();
00054     //it's important to note that at this point there
00055     //has to be at least one line of sequence available
00056     //other wise this isn't a fasta_seq file, in this case
00057     //is should return in a failed state
00058     for(;;) {
00059       if(is>>str) {
00060         if( str[0] == '>' ) {
00061           is.seekg(pos);
00062           break;
00063         }
00064         pos = is.tellg(); 
00065         sum.append(str);
00066         // once we've succesfully gotten at least one line
00067         // we don't want to chomp eof, if there's nothing
00068         // left, leave it for the next call of >>
00069         if( '\n' == CharTraits::to_char_type(is.peek()) ) {
00070           //typename CharTraits::int_type val = is.get();
00071           is.get(); //chomp '\n'
00072           if( is.peek() == eof ) {
00073             //is.putback(val);
00074             is.unget();
00075             break;
00076           }
00077         }
00078       }
00079     }
00080     istrstream istrm(sum.c_str());
00081     istrm>>seq;
00082     return is;
00083   }
00084 }
00085 
00086 namespace bioinfo
00087 {
00088 
00089   using std::string;
00090   
00091   template <typename seqtype>
00092   fasta_seq<seqtype>::fasta_seq() {
00093     _fasta_seq.first.reset(new string);
00094     _fasta_seq.second.reset(new seqtype);
00095   }
00096 
00097   template <typename seqtype>
00098   fasta_seq<seqtype>::~fasta_seq() {
00099     // auto_ptrs get called here
00100   }
00101 
00102   template <typename seqtype>
00103   void 
00104   fasta_seq<seqtype>::write(const std::string& filename) const {
00105     std::ofstream ofs(filename.c_str());
00106     if( !ofs ) {
00107       throw runtime_error("unable to write fasta_seq file: " + filename);
00108     }
00109     ofs<<*this;
00110   }
00111 
00112   template <typename seqtype>
00113   void 
00114   fasta_seq<seqtype>::read(const std::string& filename) {
00115     std::ifstream ifs(filename.c_str());
00116     if( !ofs ) {
00117       throw runtime_error("unable to write fasta_seq file: " + filename);
00118     }
00119     ifs>>*this;
00120   }
00121 }
00122 
00123 #if 0
00124 
00125 void dsu::readFastaFile(string& sequenceResult, const string& fasta_seqFile) {
00126         const int BUFF_SIZE = 512000;
00127    ifstream ifs(fasta_seqFile.c_str());
00128    if(!ifs) {
00129                 ostrstream ostrm;
00130       ostrm<<"Unable to open (for reading): "<<endl;
00131                 throw runtime_error(ostrm.str());
00132    }
00133         sequenceResult = "";
00134    // load dna strings
00135         bool status = ifs.getline(stra,BUFF_SIZE);
00136         if( !status )
00137                 throw runtime_error("unable to read file");
00138         if( stra[0] != '>' ) 
00139                 throw runtime_error("fasta_seq does not start with header");
00140 
00141    while( ifs.getline(stra,BUFF_SIZE) ) {
00142       sequenceResult.append(stra);
00143    }
00144         
00145 }
00146 
00147 void dsu::readFastaHeader(string& headerResult, const string& fasta_seqFile) {
00148    ifstream ifs(fasta_seqFile.c_str());
00149    if(!ifs) {
00150       ostrstream ostrm; 
00151       ostrm<<"Unable to open (for reading): "<<endl;
00152       throw runtime_error(ostrm.str());
00153    }
00154    headerResult = "";
00155    // load dna strings
00156         if( ! (ifs>>headerResult) ) {
00157       throw runtime_error("unable to read file");
00158         } else if( headerResult[0] != '>') {
00159       throw runtime_error("fasta_seq does not start with header");
00160         }
00161 }
00162 
00163 #if 0
00164 void dsu::readMultiFastaFile(list<string>& strlst, const string& filename) {
00165    ifstream ifs(filename.c_str());
00166    if(!ifs) {
00167       throw runtime_error("Unable to read: "+filename);
00168    }
00169    const int bsize = 200;
00170    char buff[bsize];
00171    ifs.getline(buff,bsize);
00172    if( *buff != '>') {
00173      throw runtime_error("not multifasta_seq file");
00174    }
00175    string seq;
00176    while(ifs.getline(buff,bsize)) {
00177      if( *buff == '>') {
00178        strlst.push_back(seq);
00179        seq = "";
00180      } else {
00181        string strbuff = buff;
00182        seq += strbuff;
00183      }
00184    }
00185    strlst.push_back(seq);
00186 }
00187 #endif
00188 
00189 
00190 void readMultiFastaFile(mfasta_seq_t& strlst, const string& filename) {
00191    ifstream ifs(filename.c_str());
00192    if(!ifs) {
00193       throw runtime_error("Unable to read: "+filename);
00194    }
00195    const int bsize = 200;
00196    char buff[bsize];
00197    ifs.getline(buff,bsize);
00198    fasta_seq_t seq;
00199    if( *buff != '>') {
00200      throw runtime_error("not multifasta_seq file");
00201    } else 
00202      seq.first = buff;
00203    while(ifs.getline(buff,bsize)) {
00204      if( *buff == '>') {
00205        strlst.push_back(seq);
00206        seq.first = buff;
00207        seq.second = "";
00208      } else {
00209        string strbuff = buff;
00210        for(unsigned k = 0; k < strbuff.size(); ++k) {
00211          strbuff[k] = tolower(strbuff[k]);
00212        }
00213        seq.second += strbuff;
00214      }
00215    }
00216    strlst.push_back(seq);
00217 }
00218 
00219 
00220 
00221 Strand_t str2strnd(const string& parseStrand) {
00222      dsu::Strand_t theStrand = dsu::eEither;
00223      if(parseStrand == "-")
00224        theStrand = dsu::eNeg;
00225      else if(parseStrand == "+")
00226        theStrand = dsu::ePos;
00227      return theStrand;
00228 }
00229 
00230 dsu::Strand_t dsu::coord2strnd(int end5, int end3) {
00231   return end5 > end3 ? dsu::eNeg : dsu::ePos;
00232 }
00233 
00234 static const string posStr = "+";
00235 static const string negStr = "-";
00236 static const string eitherStr = "either";
00237 
00238 const string& dsu::strnd2str(dsu::Strand_t strnd) {
00239         switch(strnd) {
00240                 case ePos:
00241                         return posStr;
00242                 case eNeg:
00243                         return negStr;
00244                 default:
00245                         return eitherStr;
00246         }
00247 }
00248 
00249 }
00250 
00251 namespace std
00252 {
00253 
00254   ostream& operator<<(ostream& os, const dsu::fasta_seq_t& fasta_seq) {
00255     os<<">"<<fasta_seq.first<<endl;
00256     const unsigned linelen = 60;
00257     for(unsigned i = 0; i < fasta_seq.second.length(); i+=linelen) {
00258       string sub = fasta_seq.second.substr(i,linelen);
00259       os<<sub<<endl;
00260     }
00261     return os;
00262   }
00263 } //namespace std;
00264 #endif