001package com.hfg.bio.seq.format; 002 003import java.io.BufferedReader; 004import java.io.BufferedWriter; 005import java.io.OutputStream; 006import java.io.OutputStreamWriter; 007import java.io.StringWriter; 008import java.io.Writer; 009import java.util.regex.Matcher; 010import java.util.regex.Pattern; 011 012import com.hfg.bio.seq.BioSequenceFactory; 013import com.hfg.bio.seq.NucleicAcid; 014import com.hfg.bio.seq.SeqQualityScoreScheme; 015import com.hfg.bio.seq.SeqQualityScores; 016import com.hfg.exception.InvalidValueException; 017import com.hfg.util.StringUtil; 018 019//------------------------------------------------------------------------------ 020/** 021 FASTQ sequence format encompassing sequences and their per-base sequencing quality scores. 022 <div> 023 From <a href='https://en.wikipedia.org/wiki/FASTQ_format'>Wikipedia</a>: 024 <pre> 025 "A FASTQ file normally uses four lines per sequence. 026 027 Line 1 begins with a '@' character and is followed by a sequence identifier and an optional description (like a FASTA title line). 028 Line 2 is the raw sequence letters. 029 Line 3 begins with a '+' character and is optionally followed by the same sequence identifier (and any description) again. 030 Line 4 encodes the quality values for the sequence in Line 2, and must contain the same number of symbols as letters in the sequence." 031 </pre> 032 </div> 033 034 <div> 035 @author J. Alex Taylor, hairyfatguy.com 036 </div> 037 */ 038//------------------------------------------------------------------------------ 039// com.hfg Library 040// 041// This library is free software; you can redistribute it and/or 042// modify it under the terms of the GNU Lesser General Public 043// License as published by the Free Software Foundation; either 044// version 2.1 of the License, or (at your option) any later version. 045// 046// This library is distributed in the hope that it will be useful, 047// but WITHOUT ANY WARRANTY; without even the implied warranty of 048// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 049// Lesser General Public License for more details. 050// 051// You should have received a copy of the GNU Lesser General Public 052// License along with this library; if not, write to the Free Software 053// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 054// 055// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 056// jataylor@hairyfatguy.com 057//------------------------------------------------------------------------------ 058 059public class FASTQ<T extends NucleicAcid> extends ReadableSeqFormatBase<T> implements WritableSeqFormat<T> 060{ 061 private SeqQualityScoreScheme mScheme = SeqQualityScoreScheme.sanger; 062 063 private static final Pattern sHeaderLinePattern = Pattern.compile("@(\\S+)(?:\\s+(.*?))?"); 064 private static final Pattern sSequenceLinePattern = Pattern.compile("[A-Za-z\\*\\-]+"); 065 066 067 private enum TargetLine 068 { 069 header, 070 sequence, 071 quality_header, 072 quality 073 } 074 075 //########################################################################### 076 // CONSTRUCTORS 077 //########################################################################### 078 079 //--------------------------------------------------------------------------- 080 public FASTQ() 081 { 082 super(null); 083 } 084 085 //--------------------------------------------------------------------------- 086 public FASTQ(BioSequenceFactory<T> inSeqFactory) 087 { 088 super(inSeqFactory); 089 } 090 091 //########################################################################### 092 // PUBLIC METHODS 093 //########################################################################### 094 095 096 //--------------------------------------------------------------------------- 097 public FASTQ<T> setScheme(SeqQualityScoreScheme inValue) 098 { 099 if (null == inValue) 100 { 101 throw new InvalidValueException("The sequence scoring scheme cannot be set to null!"); 102 } 103 104 mScheme = inValue; 105 return this; 106 } 107 108 //--------------------------------------------------------------------------- 109 public SeqQualityScoreScheme getScheme() 110 { 111 return mScheme; 112 } 113 114 //--------------------------------------------------------------------------- 115 public T readRecord(BufferedReader inReader) 116 throws SeqIOException 117 { 118 if (null == getBioSequenceFactory()) 119 { 120 throw new SeqIOException("No BioSequence factory has been specified!"); 121 } 122 123 int lineCount = 0; 124 T seq; 125 try 126 { 127 seq = getBioSequenceFactory().createSeqObj(); 128 129 TargetLine targetLine = TargetLine.header; 130 String line; 131 while ((line = inReader.readLine()) != null) 132 { 133 lineCount++; 134 135 // Skip comment lines or blank lines. Note that quality lines sometimes start w/ '#' 136 // so we can't use that to indicate a comment line. 137 if (line.startsWith("//") 138 || line.matches("\\s*")) 139 { 140 continue; 141 } 142 143 switch (targetLine) 144 { 145 case header: 146 parseHeaderLine(line, seq); 147 targetLine = TargetLine.sequence; 148 break; 149 case sequence: 150 parseSequenceLine(line, seq); 151 targetLine = TargetLine.quality_header; 152 break; 153 case quality_header: 154 parseQualityHeaderLine(line, seq); 155 targetLine = TargetLine.quality; 156 break; 157 case quality: 158 parseQualityLine(line, seq); 159 targetLine = null; 160 break; 161 } 162 163 if (null == targetLine) 164 { 165 break; 166 } 167 } 168 } 169 catch (SeqIOException e) 170 { 171 throw e; 172 } 173 catch (Exception e) 174 { 175 throw new SeqIOException(e); 176 } 177 178 return seq; 179 } 180 181 //--------------------------------------------------------------------------- 182 public boolean isEndOfRecord(String inLine) 183 { 184 return inLine.startsWith("@"); 185 } 186 187 //--------------------------------------------------------------------------- 188 public boolean hasJanusDelimiter() 189 { 190 return true; 191 } 192 193 //--------------------------------------------------------------------------- 194 public String write(T inSeq) 195 throws SeqIOException 196 { 197 StringWriter writer = new StringWriter(); 198 199 write(inSeq, writer); 200 201 return writer.toString(); 202 } 203 204 //--------------------------------------------------------------------------- 205 public void write(T inSeq, OutputStream inStream) 206 throws SeqIOException 207 { 208 Writer writer = new OutputStreamWriter(inStream); 209 write(inSeq, writer); 210 try 211 { 212 writer.flush(); 213 } 214 catch (Exception e) 215 { 216 throw new SeqIOException(e); 217 } 218 } 219 220 //--------------------------------------------------------------------------- 221 public void write(T inSeq, Writer inWriter) 222 throws SeqIOException 223 { 224 BufferedWriter writer = null; 225 try 226 { 227 try 228 { 229 if (writer instanceof BufferedWriter) 230 { 231 writer = (BufferedWriter) inWriter; 232 } else 233 { 234 writer = new BufferedWriter(inWriter, 8196); 235 } 236 237 // Write the header line 238 writer.write("@"); 239 writer.write(inSeq.getID()); 240 if (StringUtil.isSet(inSeq.getDescription())) 241 { 242 writer.write(" " + inSeq.getDescription()); 243 } 244 245 writer.write("\n"); 246 247 // Write the sequence line 248 writer.write(inSeq.getSequence()); 249 writer.write("\n"); 250 251 // Write the quality header line 252 writer.write("+\n"); 253 254 // Write the quality line 255 if (inSeq.getSeqQualityScores() != null) 256 { 257 String encodedQualityString = inSeq.getSeqQualityScores().getEncodedQualityString(); 258 if (encodedQualityString != null) 259 { 260 writer.write(encodedQualityString); 261 } 262 } 263 writer.write("\n"); 264 } 265 finally 266 { 267 if (writer != null) 268 { 269 writer.flush(); 270 } 271 } 272 } 273 catch (SeqIOException e) 274 { 275 throw e; 276 } 277 catch (Exception e) 278 { 279 throw new SeqIOException(e); 280 } 281 } 282 283 //########################################################################### 284 // PROTECTED METHODS 285 //########################################################################### 286 287 //--------------------------------------------------------------------------- 288 protected void parseHeaderLine(String inLine, T inSeq) 289 { 290 if (! inLine.startsWith("@")) 291 { 292 throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!"); 293 } 294 295 Matcher m = sHeaderLinePattern.matcher(inLine); 296 if (m.matches()) 297 { 298 inSeq.setID(m.group(1)); 299 inSeq.setDescription(m.group(2)); 300 } 301 else 302 { 303 throw new SeqFormatException("The header line" + StringUtil.singleQuote(inLine) + " is not in proper FASTQ format!"); 304 } 305 } 306 307 //########################################################################### 308 // PRIVATE METHODS 309 //########################################################################### 310 311 //--------------------------------------------------------------------------- 312 private void parseSequenceLine(String inLine, T inSeq) 313 { 314 String seqString = StringUtil.replaceWhitespace(inLine, ""); 315 316 if (! sSequenceLinePattern.matcher(seqString).matches()) 317 { 318 throw new SeqFormatException("Expected a FASTQ header line but found " + StringUtil.singleQuote(inLine) + "!"); 319 } 320 321 inSeq.setSequence(seqString); 322 } 323 324 //--------------------------------------------------------------------------- 325 private void parseQualityHeaderLine(String inLine, T inSeq) 326 { 327 if (! inLine.startsWith("+")) 328 { 329 throw new SeqFormatException("Expected a FASTQ quality header line but found " + StringUtil.singleQuote(inLine) + "!"); 330 } 331 332 // For now, skip parsing any id or description from here 333 } 334 335 //--------------------------------------------------------------------------- 336 private void parseQualityLine(String inLine, T inSeq) 337 { 338 if (inLine.length() != inSeq.length()) 339 { 340 throw new SeqFormatException("The FASTQ quality string is not the same length as the sequence!"); 341 } 342 343 SeqQualityScores seqQualityScores = new SeqQualityScores(inLine, getScheme()); 344 345 inSeq.setSeqQualityScores(seqQualityScores); 346 } 347 348}