001package com.hfg.bio.seq.format; 002 003import java.io.*; 004import java.util.Collection; 005import java.util.logging.Logger; 006import java.util.regex.Matcher; 007import java.util.regex.Pattern; 008 009import com.hfg.bio.seq.BioSequence; 010import com.hfg.bio.seq.BioSequenceFactory; 011import com.hfg.bio.seq.BioSequencePlus; 012import com.hfg.util.StringUtil; 013 014//------------------------------------------------------------------------------ 015/** 016 FASTA sequence format. Allowed sequence characters are upper-case letters, 017 lower-case letters, '*' for stop codons, and '-' for gaps. Numbers and spaces 018 will be silently stripped from the sequence and any other characters will cause 019 a SeqFormatException. 020 <div> 021 @author J. Alex Taylor, hairyfatguy.com 022 </div> 023 */ 024//------------------------------------------------------------------------------ 025// com.hfg Library 026// 027// This library is free software; you can redistribute it and/or 028// modify it under the terms of the GNU Lesser General Public 029// License as published by the Free Software Foundation; either 030// version 2.1 of the License, or (at your option) any later version. 031// 032// This library is distributed in the hope that it will be useful, 033// but WITHOUT ANY WARRANTY; without even the implied warranty of 034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 035// Lesser General Public License for more details. 036// 037// You should have received a copy of the GNU Lesser General Public 038// License along with this library; if not, write to the Free Software 039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 040// 041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 042// jataylor@hairyfatguy.com 043//------------------------------------------------------------------------------ 044 045public class FASTA<T extends BioSequence> extends ReadableSeqFormatBase<T> implements WritableSeqFormat<T> 046{ 047 private Integer mLineLength = sDefaultLineLength; 048 049 private int mMaxExceptionsPerRecord = 0; 050 051 private static int sDefaultLineLength = 75; 052 053 private static Pattern sHeaderLinePattern = Pattern.compile(">(\\S+)(?:\\s+(.*?))?"); 054 055 private final static Logger LOGGER = Logger.getLogger(FASTA.class.getName()); 056 057 //########################################################################### 058 // CONSTRUCTORS 059 //########################################################################### 060 061 //--------------------------------------------------------------------------- 062 public FASTA() 063 { 064 super(null); 065 } 066 067 //--------------------------------------------------------------------------- 068 public FASTA(BioSequenceFactory<T> inSeqFactory) 069 { 070 super(inSeqFactory); 071 } 072 073 //########################################################################### 074 // PUBLIC METHODS 075 //########################################################################### 076 077 //--------------------------------------------------------------------------- 078 public static Logger getLogger() 079 { 080 return LOGGER; 081 } 082 083 //--------------------------------------------------------------------------- 084 /** 085 Specify the maximum number of Exceptions to tolerate per record. Defaults to zero. 086 This mechanism will only work with sequences objects that implement the BioSequencePlus interface. 087 If a record produces less than the specified maximum number of Exceptions, the 088 Exceptions can be retrieved via the getParseExceptions() method on the 089 BioSequencePlus sequence object. 090 * @param inValue the maximum number of Exceptions to tolerate per record 091 * @return this format object to facilitate method chaining. 092 */ 093 public FASTA<T> setMaxExceptionsPerRecord(int inValue) 094 { 095 mMaxExceptionsPerRecord = inValue; 096 return this; 097 } 098 099 //--------------------------------------------------------------------------- 100 public FASTA<T> setLineLength(Integer inValue) 101 { 102 mLineLength = inValue; 103 return this; 104 } 105 106 //--------------------------------------------------------------------------- 107 public Integer getLineLength() 108 { 109 return mLineLength; 110 } 111 112 //--------------------------------------------------------------------------- 113 public T readRecord(BufferedReader inReader) 114 throws SeqIOException 115 { 116 if (null == getBioSequenceFactory()) 117 { 118 throw new SeqIOException("No BioSequence factory has been specified!"); 119 } 120 121 int lineCount = 0; 122 T seq = null; 123 try 124 { 125 seq = getBioSequenceFactory().createSeqObj(); 126 127 boolean headerLineFound = false; 128 129 String line; 130 while ((line = inReader.readLine()) != null) 131 { 132 lineCount++; 133 134 // Skip comment lines or blank lines 135 if (line.startsWith("#") 136 || line.startsWith("//") 137 || line.matches("\\s*")) 138 { 139 continue; 140 } 141 142 if (line.startsWith(">")) 143 { 144 headerLineFound = true; 145 146 line = line.trim(); 147 148 if (seq.getID() != null) 149 { 150 throw new SeqFormatException("Line " + lineCount + ": Multiple header lines found in the sequence record!"); 151 } 152 153 Matcher m = sHeaderLinePattern.matcher(line); 154 if (m.matches()) 155 { 156 seq.setID(m.group(1)); 157 seq.setDescription(m.group(2)); 158 } 159 else 160 { 161 throw new SeqFormatException("Line " + lineCount + ": The header line" + StringUtil.singleQuote(line) + " is not in proper FASTA format!"); 162 } 163 164 break; 165 } 166 else 167 { 168 throw new SeqFormatException("Invalid FASTA Format! Expected header line but found " + StringUtil.singleQuote(line) + "!"); 169 } 170 } 171 172 if (! headerLineFound) 173 { 174 throw new SeqFormatException("No FASTA header line found!"); 175 } 176 177 // The rest of the record should be sequence 178 179 // Cleanup the sequence to remove spaces and numbers 180 Reader filterReader = new FASTASeqFilterReader(seq, inReader); 181 seq.setSequence(filterReader); 182 183 filterReader.close(); 184 } 185 catch (SeqFormatException e) 186 { 187 SeqIOException exception; 188 if (StringUtil.isSet(seq.getID())) 189 { 190 exception = new SeqIOException("Problem encountered while reading sequence " 191 + StringUtil.singleQuote(seq.getID()) + "!", e); 192 } 193 else 194 { 195 exception = e; 196 } 197 198 if (mMaxExceptionsPerRecord > 0 199 && seq instanceof BioSequencePlus 200 && (! ((BioSequencePlus) seq).hadParseExceptions() 201 || ((BioSequencePlus) seq).getParseExceptions().size() < mMaxExceptionsPerRecord)) 202 { 203 ((BioSequencePlus) seq).addParseException(exception); 204 getLogger().warning(exception.getMessage()); 205 } 206 else 207 { 208 throw exception; 209 } 210 } 211 catch (SeqIOException e) 212 { 213 throw e; 214 } 215 catch (Exception e) 216 { 217 throw new SeqIOException(e); 218 } 219 220 return seq; 221 } 222 223 //--------------------------------------------------------------------------- 224 public boolean isEndOfRecord(String inLine) 225 { 226 return inLine.startsWith(">"); 227 } 228 229 //--------------------------------------------------------------------------- 230 public boolean hasJanusDelimiter() 231 { 232 return true; 233 } 234 235 //--------------------------------------------------------------------------- 236 public String write(Collection<T> inSeqs) 237 throws SeqIOException 238 { 239 StringWriter writer = new StringWriter(); 240 for (T seq : inSeqs) 241 { 242 write(seq, writer); 243 } 244 245 return writer.toString(); 246 } 247 248 //--------------------------------------------------------------------------- 249 public String write(T inSeq) 250 throws SeqIOException 251 { 252 StringWriter writer = new StringWriter(); 253 254 write(inSeq, writer); 255 256 return writer.toString(); 257 } 258 259 //--------------------------------------------------------------------------- 260 public void write(T inSeq, OutputStream inStream) 261 throws SeqIOException 262 { 263 Writer writer = new OutputStreamWriter(inStream); 264 write(inSeq, writer); 265 try 266 { 267 writer.flush(); 268 } 269 catch (Exception e) 270 { 271 throw new SeqIOException(e); 272 } 273 } 274 275 //--------------------------------------------------------------------------- 276 public void write(T inSeq, Writer inWriter) 277 throws SeqIOException 278 { 279 Reader seqReader = null; 280 BufferedWriter writer = null; 281 try 282 { 283 try 284 { 285 if (inWriter instanceof BufferedWriter) 286 { 287 writer = (BufferedWriter) inWriter; 288 } else 289 { 290 writer = new BufferedWriter(inWriter, 8196); 291 } 292 293 // Write the header line 294 writer.write(">"); 295 writer.write(inSeq.getID()); 296 if (StringUtil.isSet(inSeq.getDescription())) 297 { 298 writer.write(" " + inSeq.getDescription()); 299 } 300 301 writer.write("\n"); 302 303 // Write the sequence lines 304 305 seqReader = inSeq.getSequenceReader(); 306 307 // A null line length indicates that we should write the whole sequence on one line 308 int bufferSize = (mLineLength != null ? mLineLength : 2048); 309 char[] buffer = new char[bufferSize]; 310 int numBytesRead; 311 while ((numBytesRead = seqReader.read(buffer)) != -1) 312 { 313 writer.write(buffer, 0, numBytesRead); 314 if (mLineLength != null) 315 { 316 writer.write("\n"); 317 } 318 } 319 320 if (null == mLineLength) 321 { 322 writer.write("\n"); 323 } 324 } 325 finally 326 { 327 if (seqReader != null) 328 { 329 seqReader.close(); 330 } 331 332 if (writer != null) 333 { 334 writer.flush(); 335 } 336 } 337 } 338 catch (SeqIOException e) 339 { 340 throw e; 341 } 342 catch (Exception e) 343 { 344 throw new SeqIOException(e); 345 } 346 } 347 348 //########################################################################### 349 // INNER CLASS 350 //########################################################################### 351 352 private class FASTASeqFilterReader extends FilterReader 353 { 354 private BioSequence mSeq; 355 private char[] mBuffer = new char[8196]; 356 private int mBufferLimit; 357 private int mBufferIndex; 358 private boolean mEndOfStreamReached; 359 private int mPrevChar = -1; 360 private int mLineCount = 1; 361 private int mCharacterCount; 362 363 //--------------------------------------------------------------------------- 364 FASTASeqFilterReader(BioSequence inSeq, Reader inReader) 365 { 366 super(inReader); 367 mSeq = inSeq; 368 } 369 370 //--------------------------------------------------------------------------- 371 @Override 372 public int read() 373 throws IOException 374 { 375 int returnChar; 376 377 do 378 { 379 returnChar = innerRead(); 380 mCharacterCount++; 381 382 if (Character.isWhitespace(returnChar) 383 || Character.isDigit(returnChar)) 384 { 385 if ('\n' == returnChar) 386 { 387 mLineCount++; 388 mCharacterCount = 1; 389 } 390 391 continue; 392 } 393 else if (! Character.isLetter(returnChar) // Allow letters 394 && returnChar != '*' // Allow stop codons 395 && returnChar != '-' // Allow gaps 396 && returnChar != -1) // Allow EOF 397 { 398 String msg; 399 400 if ('>' == returnChar) 401 { 402 // This is severe enough that we don't want to continue processing 403 // the sequence as if it belongs to this record. 404 throw new SeqFormatException("The FASTA record start character " + StringUtil.singleQuote((char) returnChar) + " following this record must occur as the first character on the line!"); 405 } 406 else 407 { 408 SeqFormatException e = new SeqFormatException("Illegal sequence character " + StringUtil.singleQuote((char) returnChar) + " encountered on sequence line " + mLineCount + " position " + mCharacterCount + "!"); 409 410 if (mMaxExceptionsPerRecord > 0 411 && mSeq instanceof BioSequencePlus 412 && (!((BioSequencePlus) mSeq).hadParseExceptions() 413 || ((BioSequencePlus) mSeq).getParseExceptions() 414 .size() < mMaxExceptionsPerRecord)) 415 { 416 ((BioSequencePlus) mSeq).addParseException(e); 417 getLogger().warning(e.getMessage()); 418 } 419 else 420 { 421 throw e; 422 } 423 } 424 } 425 } 426 while (false); 427 428 return returnChar; 429 } 430 431 //--------------------------------------------------------------------------- 432 public int read(char[] inBuffer, int inOffset, int inMaxReadLength) 433 throws IOException 434 { 435 int theChar; 436 int numCharsRead = 0; 437 do 438 { 439 theChar = read(); 440 if (theChar > 0) 441 { 442 inBuffer[inOffset++] = (char) theChar; 443 numCharsRead++; 444 } 445 } 446 while (theChar >= 0 447 && numCharsRead < inMaxReadLength); 448 449 return (theChar < 0 && 0 == numCharsRead ? -1 : numCharsRead); 450 } 451 452 //--------------------------------------------------------------------------- 453 protected int innerRead() 454 throws IOException 455 { 456 if (mBufferIndex >= mBufferLimit) 457 { 458 fillBuffer(); 459 } 460 461 return (mEndOfStreamReached ? -1 : mBuffer[mBufferIndex++]); 462 } 463 464 //--------------------------------------------------------------------------- 465 private void fillBuffer() 466 throws IOException 467 { 468 mBufferLimit = super.in.read(mBuffer, 0, mBuffer.length); 469 470 if (-1 == mBufferLimit) 471 { 472 mEndOfStreamReached = true; 473 } 474 475 // Reset the index 476 mBufferIndex = 0; 477 } 478 } 479}