001package com.hfg.bio.seq.format;
002
003import java.io.BufferedReader;
004import java.io.BufferedWriter;
005import java.io.OutputStream;
006import java.io.OutputStreamWriter;
007import java.io.StringWriter;
008import java.io.Writer;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011
012import com.hfg.bio.seq.BioSequenceFactory;
013import com.hfg.bio.seq.NucleicAcid;
014import com.hfg.bio.seq.SeqQualityScoreScheme;
015import com.hfg.bio.seq.SeqQualityScores;
016import com.hfg.exception.InvalidValueException;
017import com.hfg.util.StringUtil;
018
019//------------------------------------------------------------------------------
020/**
021 FASTQ sequence format encompassing sequences and their per-base sequencing quality scores.
022 <div>
023 From <a href='https://en.wikipedia.org/wiki/FASTQ_format'>Wikipedia</a>:
024 <pre>
025 "A FASTQ file normally uses four lines per sequence.
026
027 Line 1 begins with a '@' character and is followed by a sequence identifier and an optional description (like a FASTA title line).
028 Line 2 is the raw sequence letters.
029 Line 3 begins with a '+' character and is optionally followed by the same sequence identifier (and any description) again.
030 Line 4 encodes the quality values for the sequence in Line 2, and must contain the same number of symbols as letters in the sequence."
031 </pre>
032 </div>
033
034 <div>
035 @author J. Alex Taylor, hairyfatguy.com
036 </div>
037 */
038//------------------------------------------------------------------------------
039// com.hfg Library
040//
041// This library is free software; you can redistribute it and/or
042// modify it under the terms of the GNU Lesser General Public
043// License as published by the Free Software Foundation; either
044// version 2.1 of the License, or (at your option) any later version.
045//
046// This library is distributed in the hope that it will be useful,
047// but WITHOUT ANY WARRANTY; without even the implied warranty of
048// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
049// Lesser General Public License for more details.
050//
051// You should have received a copy of the GNU Lesser General Public
052// License along with this library; if not, write to the Free Software
053// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
054//
055// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
056// jataylor@hairyfatguy.com
057//------------------------------------------------------------------------------
058
059public class FASTQ<T extends NucleicAcid> extends ReadableSeqFormatBase<T> implements WritableSeqFormat<T>
060{
061   private SeqQualityScoreScheme mScheme = SeqQualityScoreScheme.sanger;
062
063   private static final Pattern sHeaderLinePattern = Pattern.compile("@(\\S+)(?:\\s+(.*?))?");
064   private static final Pattern sSequenceLinePattern = Pattern.compile("[A-Za-z\\*\\-]+");
065
066
067   private enum TargetLine
068   {
069      header,
070      sequence,
071      quality_header,
072      quality
073   }
074
075   //###########################################################################
076   // CONSTRUCTORS
077   //###########################################################################
078
079   //---------------------------------------------------------------------------
080   public FASTQ()
081   {
082      super(null);
083   }
084
085   //---------------------------------------------------------------------------
086   public FASTQ(BioSequenceFactory<T> inSeqFactory)
087   {
088      super(inSeqFactory);
089   }
090
091   //###########################################################################
092   // PUBLIC METHODS
093   //###########################################################################
094
095
096   //---------------------------------------------------------------------------
097   public FASTQ<T> setScheme(SeqQualityScoreScheme inValue)
098   {
099      if (null == inValue)
100      {
101         throw new InvalidValueException("The sequence scoring scheme cannot be set to null!");
102      }
103
104      mScheme = inValue;
105      return this;
106   }
107
108   //---------------------------------------------------------------------------
109   public SeqQualityScoreScheme getScheme()
110   {
111      return mScheme;
112   }
113
114   //---------------------------------------------------------------------------
115   public T readRecord(BufferedReader inReader)
116         throws SeqIOException
117   {
118      if (null == getBioSequenceFactory())
119      {
120         throw new SeqIOException("No BioSequence factory has been specified!");
121      }
122
123      int lineCount = 0;
124      T seq;
125      try
126      {
127         seq = getBioSequenceFactory().createSeqObj();
128
129         TargetLine targetLine = TargetLine.header;
130         String line;
131         while ((line = inReader.readLine()) != null)
132         {
133            lineCount++;
134
135            // Skip comment lines or blank lines. Note that quality lines sometimes start w/ '#'
136            // so we can't use that to indicate a comment line.
137            if (line.startsWith("//")
138                || line.matches("\\s*"))
139            {
140               continue;
141            }
142
143            switch (targetLine)
144            {
145               case header:
146                  parseHeaderLine(line, seq);
147                  targetLine = TargetLine.sequence;
148                  break;
149               case sequence:
150                  parseSequenceLine(line, seq);
151                  targetLine = TargetLine.quality_header;
152                  break;
153               case quality_header:
154                  parseQualityHeaderLine(line, seq);
155                  targetLine = TargetLine.quality;
156                  break;
157               case quality:
158                  parseQualityLine(line, seq);
159                  targetLine = null;
160                  break;
161            }
162
163            if (null == targetLine)
164            {
165               break;
166            }
167         }
168      }
169      catch (SeqIOException e)
170      {
171         throw e;
172      }
173      catch (Exception e)
174      {
175         throw new SeqIOException(e);
176      }
177
178      return seq;
179   }
180
181   //---------------------------------------------------------------------------
182   public boolean isEndOfRecord(String inLine)
183   {
184      return inLine.startsWith("@");
185   }
186
187   //---------------------------------------------------------------------------
188   public boolean hasJanusDelimiter()
189   {
190      return true;
191   }
192
193   //---------------------------------------------------------------------------
194   public String write(T inSeq)
195         throws SeqIOException
196   {
197      StringWriter writer = new StringWriter();
198
199      write(inSeq, writer);
200
201      return writer.toString();
202   }
203
204   //---------------------------------------------------------------------------
205   public void write(T inSeq, OutputStream inStream)
206         throws SeqIOException
207   {
208      Writer writer = new OutputStreamWriter(inStream);
209      write(inSeq, writer);
210      try
211      {
212         writer.flush();
213      }
214      catch (Exception e)
215      {
216         throw new SeqIOException(e);
217      }
218   }
219
220   //---------------------------------------------------------------------------
221   public void write(T inSeq, Writer inWriter)
222         throws SeqIOException
223   {
224      BufferedWriter writer = null;
225      try
226      {
227         try
228         {
229            if (writer instanceof BufferedWriter)
230            {
231               writer = (BufferedWriter) inWriter;
232            } else
233            {
234               writer = new BufferedWriter(inWriter, 8196);
235            }
236
237            // Write the header line
238            writer.write("@");
239            writer.write(inSeq.getID());
240            if (StringUtil.isSet(inSeq.getDescription()))
241            {
242               writer.write(" " + inSeq.getDescription());
243            }
244
245            writer.write("\n");
246
247            // Write the sequence line
248            writer.write(inSeq.getSequence());
249            writer.write("\n");
250
251            // Write the quality header line
252            writer.write("+\n");
253
254            // Write the quality line
255            if (inSeq.getSeqQualityScores() != null)
256            {
257               String encodedQualityString = inSeq.getSeqQualityScores().getEncodedQualityString();
258               if (encodedQualityString != null)
259               {
260                  writer.write(encodedQualityString);
261               }
262            }
263            writer.write("\n");
264         }
265         finally
266         {
267            if (writer != null)
268            {
269               writer.flush();
270            }
271         }
272      }
273      catch (SeqIOException e)
274      {
275         throw e;
276      }
277      catch (Exception e)
278      {
279         throw new SeqIOException(e);
280      }
281   }
282
283   //###########################################################################
284   // PROTECTED METHODS
285   //###########################################################################
286
287   //---------------------------------------------------------------------------
288   protected void parseHeaderLine(String inLine, T inSeq)
289   {
290      if (! inLine.startsWith("@"))
291      {
292         throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!");
293      }
294
295      Matcher m = sHeaderLinePattern.matcher(inLine);
296      if (m.matches())
297      {
298         inSeq.setID(m.group(1));
299         inSeq.setDescription(m.group(2));
300      }
301      else
302      {
303         throw new SeqFormatException("The header line" + StringUtil.singleQuote(inLine) + " is not in proper FASTQ format!");
304      }
305   }
306
307   //###########################################################################
308   // PRIVATE METHODS
309   //###########################################################################
310
311   //---------------------------------------------------------------------------
312   private void parseSequenceLine(String inLine, T inSeq)
313   {
314      String seqString = StringUtil.replaceWhitespace(inLine, "");
315
316      if (! sSequenceLinePattern.matcher(seqString).matches())
317      {
318         throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!");
319      }
320
321      inSeq.setSequence(seqString);
322   }
323
324   //---------------------------------------------------------------------------
325   private void parseQualityHeaderLine(String inLine, T inSeq)
326   {
327      if (! inLine.startsWith("+"))
328      {
329         throw new SeqFormatException("Expected a FASTQ quality header line but found " + StringUtil.singleQuote(inLine) + "!");
330      }
331
332      // For now, skip parsing any id or description from here
333   }
334
335   //---------------------------------------------------------------------------
336   private void parseQualityLine(String inLine, T inSeq)
337   {
338      if (inLine.length() != inSeq.length())
339      {
340         throw new SeqFormatException("The FASTQ quality string is not the same length as the sequence!");
341      }
342
343      SeqQualityScores seqQualityScores = new SeqQualityScores(inLine, getScheme());
344
345      inSeq.setSeqQualityScores(seqQualityScores);
346   }
347
348}