001package com.hfg.bio.seq.format;
002
003import com.hfg.bio.seq.BioSequenceFactory;
004import com.hfg.bio.seq.NucleicAcid;
005import com.hfg.util.BooleanUtil;
006import com.hfg.util.StringUtil;
007
008//------------------------------------------------------------------------------
009/**
010 FASTQ sequence format from Illumina. Fields are parsed from the header line into
011 attributes on the sequence object.
012 <div>
013 See <a href='https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm'>Illumina's file format description</a>.
014 </div>
015
016 <div>
017 @author J. Alex Taylor, hairyfatguy.com
018 </div>
019 */
020//------------------------------------------------------------------------------
021// com.hfg Library
022//
023// This library is free software; you can redistribute it and/or
024// modify it under the terms of the GNU Lesser General Public
025// License as published by the Free Software Foundation; either
026// version 2.1 of the License, or (at your option) any later version.
027//
028// This library is distributed in the hope that it will be useful,
029// but WITHOUT ANY WARRANTY; without even the implied warranty of
030// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
031// Lesser General Public License for more details.
032//
033// You should have received a copy of the GNU Lesser General Public
034// License along with this library; if not, write to the Free Software
035// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
036//
037// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
038// jataylor@hairyfatguy.com
039//------------------------------------------------------------------------------
040
041public class IlluminaFASTQ<T extends NucleicAcid> extends FASTQ<T>
042{
043   //###########################################################################
044   // CONSTRUCTORS
045   //###########################################################################
046
047   //---------------------------------------------------------------------------
048   public IlluminaFASTQ()
049   {
050      super(null);
051   }
052
053   //---------------------------------------------------------------------------
054   public IlluminaFASTQ(BioSequenceFactory<T> inSeqFactory)
055   {
056      super(inSeqFactory);
057   }
058
059   //###########################################################################
060   // PROTECTED METHODS
061   //###########################################################################
062
063   //---------------------------------------------------------------------------
064   // See: https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
065   // Header line format:
066   // @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index>
067   //
068   // Ex: @HWI-M01141:63:A4NDL:1:1101:16668:1377 1:N:0:TATAGCGAGACACCGT
069   //     instrument = HWI-M01141
070   //     run number = 63
071   //     flowcell ID = A4NDL
072   //     lane = 1
073   //     tile = 1101
074   //     x-pos = 16668
075   //     y-pos = 1377
076   //     UMI (optional) =
077   //     read = 1
078   //     is filtered = N
079   //     control number = 0
080   //     index = TATAGCGAGACACCGT
081   protected void parseHeaderLine(String inLine, T inSeq)
082   {
083      // Let the super class break the header line into id and description
084      super.parseHeaderLine(inLine, inSeq);
085
086      // Extract Illumina fields from the id
087      String[] fields = inSeq.getID().split(":");
088      if (fields.length < 7
089          || fields.length > 8)
090      {
091         throw new SeqFormatException("Unexpected number of fields in the header id " + StringUtil.singleQuote(inSeq.getID()) + "!");
092      }
093
094      inSeq.setAttribute("instrument", fields[0]);
095      inSeq.setAttribute("run number", Integer.parseInt(fields[1]));
096      inSeq.setAttribute("flowcell ID", fields[2]);
097      inSeq.setAttribute("lane", Integer.parseInt(fields[3]));
098      inSeq.setAttribute("tile", Integer.parseInt(fields[4]));
099      inSeq.setAttribute("x-pos", Integer.parseInt(fields[5]));
100      inSeq.setAttribute("y-pos", Integer.parseInt(fields[6]));
101      if (8 == fields.length)
102      {
103         inSeq.setAttribute("UMI", fields[7]);
104      }
105
106      // Extract Illumina fields from the description
107      fields = inSeq.getDescription().split(":");
108      if (fields.length != 4)
109      {
110         throw new SeqFormatException("Unexpected number of fields in the header description " + StringUtil.singleQuote(inSeq.getDescription()) + "!");
111      }
112
113      inSeq.setAttribute("read", Integer.parseInt(fields[0]));
114      inSeq.setAttribute("is filtered", BooleanUtil.valueOf(fields[1]));
115      inSeq.setAttribute("control number", Integer.parseInt(fields[2]));
116      inSeq.setAttribute("index", fields[3]);
117
118   }
119
120}