001package com.hfg.bio.seq.format;
002
003import java.io.*;
004import java.util.Collection;
005import java.util.logging.Logger;
006import java.util.regex.Matcher;
007import java.util.regex.Pattern;
008
009import com.hfg.bio.seq.BioSequence;
010import com.hfg.bio.seq.BioSequenceFactory;
011import com.hfg.bio.seq.BioSequencePlus;
012import com.hfg.util.StringUtil;
013
014//------------------------------------------------------------------------------
015/**
016 FASTA sequence format. Allowed sequence characters are upper-case letters,
017 lower-case letters, '*' for stop codons, and '-' for gaps. Numbers and spaces
018 will be silently stripped from the sequence and any other characters will cause
019 a SeqFormatException.
020 <div>
021  @author J. Alex Taylor, hairyfatguy.com
022 </div>
023 */
024//------------------------------------------------------------------------------
025// com.hfg Library
026//
027// This library is free software; you can redistribute it and/or
028// modify it under the terms of the GNU Lesser General Public
029// License as published by the Free Software Foundation; either
030// version 2.1 of the License, or (at your option) any later version.
031//
032// This library is distributed in the hope that it will be useful,
033// but WITHOUT ANY WARRANTY; without even the implied warranty of
034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
035// Lesser General Public License for more details.
036//
037// You should have received a copy of the GNU Lesser General Public
038// License along with this library; if not, write to the Free Software
039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
040//
041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
042// jataylor@hairyfatguy.com
043//------------------------------------------------------------------------------
044
045public class FASTA<T extends BioSequence> extends ReadableSeqFormatBase<T> implements WritableSeqFormat<T>
046{
047   private Integer mLineLength = sDefaultLineLength;
048
049   private int mMaxExceptionsPerRecord = 0;
050
051   private static int sDefaultLineLength = 75;
052
053   private static Pattern sHeaderLinePattern = Pattern.compile(">(\\S+)(?:\\s+(.*?))?");
054
055   private final static Logger LOGGER = Logger.getLogger(FASTA.class.getName());
056
057   //###########################################################################
058   // CONSTRUCTORS
059   //###########################################################################
060
061   //---------------------------------------------------------------------------
062   public FASTA()
063   {
064      super(null);
065   }
066
067   //---------------------------------------------------------------------------
068   public FASTA(BioSequenceFactory<T> inSeqFactory)
069   {
070      super(inSeqFactory);
071   }
072
073   //###########################################################################
074   // PUBLIC METHODS
075   //###########################################################################
076
077   //---------------------------------------------------------------------------
078   public static Logger getLogger()
079   {
080      return LOGGER;
081   }
082
083   //---------------------------------------------------------------------------
084   /**
085    Specify the maximum number of Exceptions to tolerate per record. Defaults to zero.
086    This mechanism will only work with sequences objects that implement the BioSequencePlus interface.
087    If a record produces less than the specified maximum number of Exceptions, the
088    Exceptions can be retrieved via the getParseExceptions() method on the
089    BioSequencePlus sequence object.
090    * @param inValue the maximum number of Exceptions to tolerate per record
091    * @return this format object to facilitate method chaining.
092    */
093   public FASTA<T> setMaxExceptionsPerRecord(int inValue)
094   {
095      mMaxExceptionsPerRecord = inValue;
096      return this;
097   }
098
099   //---------------------------------------------------------------------------
100   public FASTA<T> setLineLength(Integer inValue)
101   {
102      mLineLength = inValue;
103      return this;
104   }
105
106   //---------------------------------------------------------------------------
107   public Integer getLineLength()
108   {
109      return mLineLength;
110   }
111
112   //---------------------------------------------------------------------------
113   public T readRecord(BufferedReader inReader)
114         throws SeqIOException
115   {
116      if (null == getBioSequenceFactory())
117      {
118         throw new SeqIOException("No BioSequence factory has been specified!");
119      }
120
121      int lineCount = 0;
122      T seq = null;
123      try
124      {
125         seq = getBioSequenceFactory().createSeqObj();
126
127         boolean headerLineFound = false;
128
129         String line;
130         while ((line = inReader.readLine()) != null)
131         {
132            lineCount++;
133            
134            // Skip comment lines or blank lines
135            if (line.startsWith("#")
136                || line.startsWith("//")
137                || line.matches("\\s*"))
138            {
139               continue;
140            }
141
142            if (line.startsWith(">"))
143            {
144               headerLineFound = true;
145
146               line = line.trim();
147
148               if (seq.getID() != null)
149               {
150                  throw new SeqFormatException("Line " + lineCount + ": Multiple header lines found in the sequence record!");
151               }
152
153               Matcher m = sHeaderLinePattern.matcher(line);
154               if (m.matches())
155               {
156                  seq.setID(m.group(1));
157                  seq.setDescription(m.group(2));
158               }
159               else
160               {
161                  throw new SeqFormatException("Line " + lineCount + ": The header line" + StringUtil.singleQuote(line) + " is not in proper FASTA format!");
162               }
163
164               break;
165            }
166            else
167            {
168               throw new SeqFormatException("Invalid FASTA Format! Expected header line but found " + StringUtil.singleQuote(line) + "!");
169            }
170         }
171
172         if (! headerLineFound)
173         {
174            throw new SeqFormatException("No FASTA header line found!");
175         }
176
177         // The rest of the record should be sequence
178
179         // Cleanup the sequence to remove spaces and numbers
180         Reader filterReader = new FASTASeqFilterReader(seq, inReader);
181         seq.setSequence(filterReader);
182
183         filterReader.close();
184      }
185      catch (SeqFormatException e)
186      {
187         SeqIOException exception;
188         if (StringUtil.isSet(seq.getID()))
189         {
190            exception = new SeqIOException("Problem encountered while reading sequence "
191                                                 + StringUtil.singleQuote(seq.getID()) + "!", e);
192         }
193         else
194         {
195            exception = e;
196         }
197
198         if (mMaxExceptionsPerRecord > 0
199             && seq instanceof BioSequencePlus
200             && (! ((BioSequencePlus) seq).hadParseExceptions()
201                 || ((BioSequencePlus) seq).getParseExceptions().size() < mMaxExceptionsPerRecord))
202         {
203            ((BioSequencePlus) seq).addParseException(exception);
204            getLogger().warning(exception.getMessage());
205         }
206         else
207         {
208            throw exception;
209         }
210      }
211      catch (SeqIOException e)
212      {
213         throw e;
214      }
215      catch (Exception e)
216      {
217         throw new SeqIOException(e);
218      }
219
220      return seq;
221   }
222
223   //---------------------------------------------------------------------------
224   public boolean isEndOfRecord(String inLine)
225   {
226      return inLine.startsWith(">");
227   }
228
229   //---------------------------------------------------------------------------
230   public boolean hasJanusDelimiter()
231   {
232      return true;
233   }
234
235   //---------------------------------------------------------------------------
236   public String write(Collection<T> inSeqs)
237         throws SeqIOException
238   {
239      StringWriter writer = new StringWriter();
240      for (T seq : inSeqs)
241      {
242         write(seq, writer);
243      }
244
245      return writer.toString();
246   }
247
248   //---------------------------------------------------------------------------
249   public String write(T inSeq)
250         throws SeqIOException
251   {
252      StringWriter writer = new StringWriter();
253
254      write(inSeq, writer);
255
256      return writer.toString();
257   }
258
259   //---------------------------------------------------------------------------
260   public void write(T inSeq, OutputStream inStream)
261         throws SeqIOException
262   {
263      Writer writer = new OutputStreamWriter(inStream);
264      write(inSeq, writer);
265      try
266      {
267         writer.flush();
268      }
269      catch (Exception e)
270      {
271         throw new SeqIOException(e);
272      }
273   }
274
275   //---------------------------------------------------------------------------
276   public void write(T inSeq, Writer inWriter)
277         throws SeqIOException
278   {
279      Reader seqReader = null;
280      BufferedWriter writer = null;
281      try
282      {
283         try
284         {
285            if (inWriter instanceof BufferedWriter)
286            {
287               writer = (BufferedWriter) inWriter;
288            } else
289            {
290               writer = new BufferedWriter(inWriter, 8196);
291            }
292
293            // Write the header line
294            writer.write(">");
295            writer.write(inSeq.getID());
296            if (StringUtil.isSet(inSeq.getDescription()))
297            {
298               writer.write(" " + inSeq.getDescription());
299            }
300
301            writer.write("\n");
302
303            // Write the sequence lines
304
305            seqReader = inSeq.getSequenceReader();
306
307            // A null line length indicates that we should write the whole sequence on one line
308            int bufferSize = (mLineLength != null ? mLineLength : 2048);
309            char[] buffer = new char[bufferSize];
310            int numBytesRead;
311            while ((numBytesRead = seqReader.read(buffer)) != -1)
312            {
313               writer.write(buffer, 0, numBytesRead);
314               if (mLineLength != null)
315               {
316                  writer.write("\n");
317               }
318            }
319
320            if (null == mLineLength)
321            {
322               writer.write("\n");
323            }
324         }
325         finally
326         {
327            if (seqReader != null)
328            {
329               seqReader.close();
330            }
331
332            if (writer != null)
333            {
334               writer.flush();
335            }
336         }
337      }
338      catch (SeqIOException e)
339      {
340         throw e;
341      }
342      catch (Exception e)
343      {
344         throw new SeqIOException(e);
345      }
346   }
347
348   //###########################################################################
349   // INNER CLASS
350   //###########################################################################
351
352   private class FASTASeqFilterReader extends FilterReader
353   {
354      private BioSequence mSeq;
355      private char[]  mBuffer = new char[8196];
356      private int     mBufferLimit;
357      private int     mBufferIndex;
358      private boolean mEndOfStreamReached;
359      private int     mPrevChar = -1;
360      private int     mLineCount = 1;
361      private int     mCharacterCount;
362
363      //---------------------------------------------------------------------------
364      FASTASeqFilterReader(BioSequence inSeq, Reader inReader)
365      {
366         super(inReader);
367         mSeq = inSeq;
368      }
369
370      //---------------------------------------------------------------------------
371      @Override
372      public int read()
373            throws IOException
374      {
375         int returnChar;
376
377         do
378         {
379            returnChar = innerRead();
380            mCharacterCount++;
381
382            if (Character.isWhitespace(returnChar)
383                || Character.isDigit(returnChar))
384            {
385               if ('\n' == returnChar)
386               {
387                  mLineCount++;
388                  mCharacterCount = 1;
389               }
390
391               continue;
392            }
393            else if (! Character.isLetter(returnChar) // Allow letters
394                  && returnChar != '*' // Allow stop codons
395                  && returnChar != '-' // Allow gaps
396                  && returnChar != -1) // Allow EOF
397            {
398               String msg;
399
400               if ('>' == returnChar)
401               {
402                  // This is severe enough that we don't want to continue processing
403                  // the sequence as if it belongs to this record.
404                  throw new SeqFormatException("The FASTA record start character " + StringUtil.singleQuote((char) returnChar) + " following this record must occur as the first character on the line!");
405               }
406               else
407               {
408                  SeqFormatException e = new SeqFormatException("Illegal sequence character " + StringUtil.singleQuote((char) returnChar) + " encountered on sequence line " + mLineCount + " position " + mCharacterCount + "!");
409
410                  if (mMaxExceptionsPerRecord > 0
411                        && mSeq instanceof BioSequencePlus
412                        && (!((BioSequencePlus) mSeq).hadParseExceptions()
413                        || ((BioSequencePlus) mSeq).getParseExceptions()
414                                                   .size() < mMaxExceptionsPerRecord))
415                  {
416                     ((BioSequencePlus) mSeq).addParseException(e);
417                     getLogger().warning(e.getMessage());
418                  }
419                  else
420                  {
421                     throw e;
422                  }
423               }
424            }
425         }
426         while (false);
427
428         return returnChar;
429      }
430
431      //---------------------------------------------------------------------------
432      public int read(char[] inBuffer, int inOffset, int inMaxReadLength)
433            throws IOException
434      {
435         int theChar;
436         int numCharsRead = 0;
437         do
438         {
439            theChar = read();
440            if (theChar > 0)
441            {
442               inBuffer[inOffset++] = (char) theChar;
443               numCharsRead++;
444            }
445         }
446         while (theChar >= 0
447               && numCharsRead < inMaxReadLength);
448
449         return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead);
450      }
451
452      //---------------------------------------------------------------------------
453      protected int innerRead()
454            throws IOException
455      {
456         if (mBufferIndex >= mBufferLimit)
457         {
458            fillBuffer();
459         }
460
461         return (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]);
462      }
463
464      //---------------------------------------------------------------------------
465      private void fillBuffer()
466            throws IOException
467      {
468         mBufferLimit = super.in.read(mBuffer, 0, mBuffer.length);
469
470         if (-1 == mBufferLimit)
471         {
472            mEndOfStreamReached = true;
473         }
474
475         // Reset the index
476         mBufferIndex = 0;
477      }
478   }
479}