001package com.hfg.bio.seq.format; 002 003 004import java.io.*; 005import java.util.Collection; 006 007import com.hfg.bio.AminoAcid; 008import com.hfg.bio.AminoAcidSet; 009import com.hfg.bio.seq.BioSequence; 010import com.hfg.bio.seq.BioSequenceFactory; 011import com.hfg.bio.seq.BioSequencePlus; 012import com.hfg.bio.seq.BioSequenceType; 013import com.hfg.bio.taxonomy.ncbi.NCBITaxon; 014import com.hfg.util.StringBuilderPlus; 015import com.hfg.util.StringUtil; 016 017//------------------------------------------------------------------------------ 018/** 019 Sequence format based on WIPO Standard ST.25. 020 <div> 021 @author J. Alex Taylor, hairyfatguy.com 022 </div> 023 */ 024//------------------------------------------------------------------------------ 025// com.hfg Library 026// 027// This library is free software; you can redistribute it and/or 028// modify it under the terms of the GNU Lesser General Public 029// License as published by the Free Software Foundation; either 030// version 2.1 of the License, or (at your option) any later version. 031// 032// This library is distributed in the hope that it will be useful, 033// but WITHOUT ANY WARRANTY; without even the implied warranty of 034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 035// Lesser General Public License for more details. 036// 037// You should have received a copy of the GNU Lesser General Public 038// License along with this library; if not, write to the Free Software 039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 040// 041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 042// jataylor@hairyfatguy.com 043//------------------------------------------------------------------------------ 044 045public class WIPO_ST_25<T extends BioSequence> implements WritableSeqFormat<T> 046{ 047 private int mSeqIdNo; 048 private String mSpacer = " "; 049 050 private static int sMaxProtResiduesPerLine = 16; 051 private static int sMaxNucResiduesPerLine = 60; 052 private static int sMaxDescriptionCharsPerLine = 68; 053 054 //########################################################################### 055 // CONSTRUCTORS 056 //########################################################################### 057 058 //--------------------------------------------------------------------------- 059 public WIPO_ST_25() 060 { 061 } 062 063 //--------------------------------------------------------------------------- 064 public WIPO_ST_25(BioSequenceFactory<T> inSeqFactory) 065 { 066 } 067 068 //########################################################################### 069 // PUBLIC METHODS 070 //########################################################################### 071 072 //--------------------------------------------------------------------------- 073 public String write(Collection<T> inSeqs) 074 throws SeqIOException 075 { 076 // Reset the SEQ ID NO. 077 resetSeqIdNo(); 078 079 StringWriter writer = new StringWriter(); 080 081 for (T seq : inSeqs) 082 { 083 innerWrite(seq, writer); 084 } 085 086 return writer.toString(); 087 } 088 089 //--------------------------------------------------------------------------- 090 public void write(Collection<T> inSeqs, Writer inWriter) 091 throws SeqIOException 092 { 093 // Reset the SEQ ID NO. 094 resetSeqIdNo(); 095 096 for (T seq : inSeqs) 097 { 098 innerWrite(seq, inWriter); 099 } 100 } 101 102 103 //--------------------------------------------------------------------------- 104 public String write(T inSeq) 105 throws SeqIOException 106 { 107 // Reset the SEQ ID NO. 108 resetSeqIdNo(); 109 110 StringWriter writer = new StringWriter(); 111 112 innerWrite(inSeq, writer); 113 114 return writer.toString(); 115 } 116 117 //--------------------------------------------------------------------------- 118 public void write(T inSeq, OutputStream inStream) 119 throws SeqIOException 120 { 121 // Reset the SEQ ID NO. 122 resetSeqIdNo(); 123 124 Writer writer = new OutputStreamWriter(inStream); 125 126 innerWrite(inSeq, writer); 127 try 128 { 129 writer.flush(); 130 } 131 catch (Exception e) 132 { 133 throw new SeqIOException(e); 134 } 135 } 136 137 //--------------------------------------------------------------------------- 138 public void write(T inSeq, Writer inWriter) 139 throws SeqIOException 140 { 141 // Reset the SEQ ID NO. 142 resetSeqIdNo(); 143 144 innerWrite(inSeq, inWriter); 145 } 146 147 148 //--------------------------------------------------------------------------- 149 public void writeInitialIdentifiers(OutputStream inStream, int inNumSeqs) 150 throws SeqIOException 151 { 152 Writer writer = new OutputStreamWriter(inStream); 153 154 writeInitialIdentifiers(writer, inNumSeqs); 155 156 try 157 { 158 writer.flush(); 159 } 160 catch (Exception e) 161 { 162 throw new SeqIOException(e); 163 } 164 } 165 166 //--------------------------------------------------------------------------- 167 public void writeInitialIdentifiers(Writer inWriter, int inNumSeqs) 168 throws SeqIOException 169 { 170 try 171 { 172 inWriter.write(String.format("<110>%s%s\n", mSpacer, "Insert applicant name(s) here")); 173 // Blank line between sections 174 inWriter.write("\n"); 175 176 inWriter.write(String.format("<120>%s%s\n", mSpacer, "Insert title of invention here")); 177 // Blank line between sections 178 inWriter.write("\n"); 179 180 inWriter.write(String.format("<130>%s%s\n", mSpacer, "Insert file reference here")); 181 // Blank line between sections 182 inWriter.write("\n"); 183 184 inWriter.write(String.format("<160>%s%d\n", mSpacer, inNumSeqs)); 185 // Blank line between sections 186 inWriter.write("\n"); 187 188 inWriter.write(String.format("<170>%s%s\n", mSpacer, "Insert name of software here")); 189 // Blank line between sections 190 inWriter.write("\n"); 191 } 192 catch (IOException e) 193 { 194 throw new SeqIOException("Problem adding initial identifiers!", e); 195 } 196 } 197 198 199 200 //########################################################################### 201 // PRIVATE METHODS 202 //########################################################################### 203 204 205 //--------------------------------------------------------------------------- 206 private void innerWrite(T inSeq, Writer inWriter) 207 throws SeqIOException 208 { 209 Reader seqReader = null; 210 BufferedWriter writer = null; 211 try 212 { 213 try 214 { 215 if (writer instanceof BufferedWriter) 216 { 217 writer = (BufferedWriter) inWriter; 218 } else 219 { 220 writer = new BufferedWriter(inWriter, 8196); 221 } 222 223 // Write the SEQ ID NO line 224 writer.write(String.format("<210>%s%d\n", mSpacer, mSeqIdNo)); 225 // Write the sequence length line 226 writer.write(String.format("<211>%s%d\n", mSpacer, inSeq.length())); 227 // Write the sequence length line 228 writer.write(String.format("<212>%s%s\n", mSpacer, getSeqTypeString(inSeq.getType()))); 229 // Write the organism line 230 writer.write(String.format("<213>%s%s\n", mSpacer, getOrganismString(inSeq))); 231 232 // Blank line between sections 233 writer.write("\n"); 234 235 if (StringUtil.isSet(inSeq.getDescription())) 236 { 237 writer.write("<220>\n"); 238 writeOtherInformation(writer, inSeq.getDescription()); 239 240 // Blank line between sections 241 writer.write("\n"); 242 } 243 244 245 // Write the sequence line 246 writer.write(String.format("<400>%s%d\n", mSpacer, mSeqIdNo)); 247 248 switch (inSeq.getType()) 249 { 250 case PROTEIN: 251 writeProtSequence(writer, inSeq); 252 break; 253 case NUCLEIC_ACID: 254 writeNucSequence(writer, inSeq); 255 break; 256 default: 257 throw new SeqIOException("BioSequenceType " + inSeq.getType() + " is not currently supported!"); 258 } 259 260 261 mSeqIdNo++; 262 } 263 finally 264 { 265 if (seqReader != null) 266 { 267 seqReader.close(); 268 } 269 270 if (writer != null) 271 { 272 writer.flush(); 273 } 274 } 275 } 276 catch (SeqIOException e) 277 { 278 throw e; 279 } 280 catch (Exception e) 281 { 282 throw new SeqIOException(e); 283 } 284 } 285 286 //--------------------------------------------------------------------------- 287 private String getSeqTypeString(BioSequenceType inType) 288 { 289 String value; 290 switch (inType) 291 { 292 case PROTEIN: 293 value = "PRT"; 294 break; 295 case NUCLEIC_ACID: 296 value = "DNA"; 297 break; 298 default: 299 throw new SeqIOException("BioSequenceType " + inType + " is not currently supported!"); 300 } 301 302 return value; 303 } 304 305 //--------------------------------------------------------------------------- 306 private String getOrganismString(BioSequence inSeq) 307 { 308 String value = "Unknown"; 309 if (inSeq instanceof BioSequencePlus) 310 { 311 NCBITaxon taxon = ((BioSequencePlus) inSeq).getNCBITaxon(); 312 if (taxon != null) 313 { 314 if (taxon.equals(NCBITaxon.SYNTHETIC_CONSTRUCT)) 315 { 316 value = "Artificial Sequence"; 317 } 318 else if (taxon != NCBITaxon.UNKNOWN) 319 { 320 value = taxon.getScientificName(); 321 } 322 } 323 } 324 325 return value; 326 } 327 328 //--------------------------------------------------------------------------- 329 private void writeOtherInformation(Writer inWriter, String inDescription) 330 throws IOException 331 { 332 String wrappedDescription = StringUtil.wrap(inDescription, sMaxDescriptionCharsPerLine); 333 wrappedDescription = StringUtil.replaceAll(wrappedDescription, "\n", "\n " + mSpacer); 334 335 inWriter.write(String.format("<223>%s%s\n", mSpacer, wrappedDescription)); 336 } 337 338 //--------------------------------------------------------------------------- 339 private void writeProtSequence(Writer inWriter, BioSequence inSeq) 340 throws IOException 341 { 342 Reader seqReader = null; 343 344 try 345 { 346 StringBuilderPlus seqLineBuffer = new StringBuilderPlus().setDelimiter(" "); 347 StringBuilderPlus numLineBuffer = new StringBuilderPlus().setDelimiter(" "); 348 349 seqReader = inSeq.getSequenceReader(); 350 char[] buffer = new char[sMaxProtResiduesPerLine]; 351 int numBytesRead; 352 int residueNum = 1; 353 while ((numBytesRead = seqReader.read(buffer)) != -1) 354 { 355 numLineBuffer.append(StringUtil.polyChar(' ', numBytesRead * 3 * 2 - 3)); 356 357 for (int i = 0; i < numBytesRead; i++) 358 { 359 AminoAcid aa = AminoAcidSet.STANDARD.getAA(buffer[i]); 360 seqLineBuffer.delimitedAppend(aa.getThreeLetterCode()); 361 362 if (1 == residueNum 363 || residueNum%5 == 0) 364 { 365 int numLength = (residueNum + "").length(); 366 367 int start = (i * 6) + 2 - numLength + 1; 368 if (start < 0) 369 { 370 start = 0; 371 } 372 373 numLineBuffer.replace(start, start + numLength - 1, residueNum + ""); 374 } 375 376 residueNum++; 377 } 378 seqLineBuffer.append("\n"); 379 numLineBuffer.append("\n"); 380 381 inWriter.write(seqLineBuffer.toString()); 382 inWriter.write(numLineBuffer.toString()); 383 inWriter.write("\n"); 384 385 seqLineBuffer.setLength(0); 386 numLineBuffer.setLength(0); 387 } 388 389 inWriter.flush(); 390 } 391 finally 392 { 393 if (seqReader != null) 394 { 395 seqReader.close(); 396 } 397 } 398 } 399 400 //--------------------------------------------------------------------------- 401 private void writeNucSequence(Writer inWriter, BioSequence inSeq) 402 throws IOException 403 { 404 Reader seqReader = null; 405 406 try 407 { 408 StringBuilderPlus seqLineBuffer = new StringBuilderPlus().setDelimiter(" "); 409 410 seqReader = inSeq.getSequenceReader(); 411 char[] buffer = new char[sMaxNucResiduesPerLine]; 412 int numBytesRead; 413 int residueNum = 1; 414 while ((numBytesRead = seqReader.read(buffer)) != -1) 415 { 416 for (int i = 0; i < numBytesRead; i++) 417 { 418 seqLineBuffer.append(buffer[i]); 419 420 if (residueNum%10 == 0 421 && i < numBytesRead - 1) 422 { 423 seqLineBuffer.append(" "); 424 } 425 426 residueNum++; 427 } 428 429 seqLineBuffer.append(String.format(" %11d\n", residueNum - 1)); 430 431 inWriter.write(seqLineBuffer.toString()); 432 inWriter.write("\n"); 433 434 seqLineBuffer.setLength(0); 435 } 436 437 inWriter.flush(); 438 } 439 finally 440 { 441 if (seqReader != null) 442 { 443 seqReader.close(); 444 } 445 } 446 } 447 448 //--------------------------------------------------------------------------- 449 private void resetSeqIdNo() 450 { 451 // Reset the SEQ ID NO. 452 mSeqIdNo = 1; 453 } 454}