001package com.hfg.bio.seq.format; 002 003import com.hfg.bio.seq.BioSequenceFactory; 004import com.hfg.bio.seq.NucleicAcid; 005import com.hfg.util.BooleanUtil; 006import com.hfg.util.StringUtil; 007 008//------------------------------------------------------------------------------ 009/** 010 FASTQ sequence format from Illumina. Fields are parsed from the header line into 011 attributes on the sequence object. 012 <div> 013 See <a href='https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm'>Illumina's file format description</a>. 014 </div> 015 016 <div> 017 @author J. Alex Taylor, hairyfatguy.com 018 </div> 019 */ 020//------------------------------------------------------------------------------ 021// com.hfg Library 022// 023// This library is free software; you can redistribute it and/or 024// modify it under the terms of the GNU Lesser General Public 025// License as published by the Free Software Foundation; either 026// version 2.1 of the License, or (at your option) any later version. 027// 028// This library is distributed in the hope that it will be useful, 029// but WITHOUT ANY WARRANTY; without even the implied warranty of 030// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 031// Lesser General Public License for more details. 032// 033// You should have received a copy of the GNU Lesser General Public 034// License along with this library; if not, write to the Free Software 035// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 036// 037// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 038// jataylor@hairyfatguy.com 039//------------------------------------------------------------------------------ 040 041public class IlluminaFASTQ<T extends NucleicAcid> extends FASTQ<T> 042{ 043 //########################################################################### 044 // CONSTRUCTORS 045 //########################################################################### 046 047 //--------------------------------------------------------------------------- 048 public IlluminaFASTQ() 049 { 050 super(null); 051 } 052 053 //--------------------------------------------------------------------------- 054 public IlluminaFASTQ(BioSequenceFactory<T> inSeqFactory) 055 { 056 super(inSeqFactory); 057 } 058 059 //########################################################################### 060 // PROTECTED METHODS 061 //########################################################################### 062 063 //--------------------------------------------------------------------------- 064 // See: https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm 065 // Header line format: 066 // @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos>:<UMI> <read>:<is filtered>:<control number>:<index> 067 // 068 // Ex: @HWI-M01141:63:A4NDL:1:1101:16668:1377 1:N:0:TATAGCGAGACACCGT 069 // instrument = HWI-M01141 070 // run number = 63 071 // flowcell ID = A4NDL 072 // lane = 1 073 // tile = 1101 074 // x-pos = 16668 075 // y-pos = 1377 076 // UMI (optional) = 077 // read = 1 078 // is filtered = N 079 // control number = 0 080 // index = TATAGCGAGACACCGT 081 protected void parseHeaderLine(String inLine, T inSeq) 082 { 083 // Let the super class break the header line into id and description 084 super.parseHeaderLine(inLine, inSeq); 085 086 // Extract Illumina fields from the id 087 String[] fields = inSeq.getID().split(":"); 088 if (fields.length < 7 089 || fields.length > 8) 090 { 091 throw new SeqFormatException("Unexpected number of fields in the header id " + StringUtil.singleQuote(inSeq.getID()) + "!"); 092 } 093 094 inSeq.setAttribute("instrument", fields[0]); 095 inSeq.setAttribute("run number", Integer.parseInt(fields[1])); 096 inSeq.setAttribute("flowcell ID", fields[2]); 097 inSeq.setAttribute("lane", Integer.parseInt(fields[3])); 098 inSeq.setAttribute("tile", Integer.parseInt(fields[4])); 099 inSeq.setAttribute("x-pos", Integer.parseInt(fields[5])); 100 inSeq.setAttribute("y-pos", Integer.parseInt(fields[6])); 101 if (8 == fields.length) 102 { 103 inSeq.setAttribute("UMI", fields[7]); 104 } 105 106 // Extract Illumina fields from the description 107 fields = inSeq.getDescription().split(":"); 108 if (fields.length != 4) 109 { 110 throw new SeqFormatException("Unexpected number of fields in the header description " + StringUtil.singleQuote(inSeq.getDescription()) + "!"); 111 } 112 113 inSeq.setAttribute("read", Integer.parseInt(fields[0])); 114 inSeq.setAttribute("is filtered", BooleanUtil.valueOf(fields[1])); 115 inSeq.setAttribute("control number", Integer.parseInt(fields[2])); 116 inSeq.setAttribute("index", fields[3]); 117 118 } 119 120}