001package com.hfg.bio.seq.format;
002
003
004import java.io.*;
005import java.util.Collection;
006
007import com.hfg.bio.AminoAcid;
008import com.hfg.bio.AminoAcidSet;
009import com.hfg.bio.seq.BioSequence;
010import com.hfg.bio.seq.BioSequenceFactory;
011import com.hfg.bio.seq.BioSequencePlus;
012import com.hfg.bio.seq.BioSequenceType;
013import com.hfg.bio.taxonomy.ncbi.NCBITaxon;
014import com.hfg.util.StringBuilderPlus;
015import com.hfg.util.StringUtil;
016
017//------------------------------------------------------------------------------
018/**
019 Sequence format based on WIPO Standard ST.25.
020 <div>
021 @author J. Alex Taylor, hairyfatguy.com
022 </div>
023 */
024//------------------------------------------------------------------------------
025// com.hfg Library
026//
027// This library is free software; you can redistribute it and/or
028// modify it under the terms of the GNU Lesser General Public
029// License as published by the Free Software Foundation; either
030// version 2.1 of the License, or (at your option) any later version.
031//
032// This library is distributed in the hope that it will be useful,
033// but WITHOUT ANY WARRANTY; without even the implied warranty of
034// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
035// Lesser General Public License for more details.
036//
037// You should have received a copy of the GNU Lesser General Public
038// License along with this library; if not, write to the Free Software
039// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
040//
041// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
042// jataylor@hairyfatguy.com
043//------------------------------------------------------------------------------
044
045public class WIPO_ST_25<T extends BioSequence> implements WritableSeqFormat<T>
046{
047   private int mSeqIdNo;
048   private String mSpacer = "         ";
049
050   private static int sMaxProtResiduesPerLine = 16;
051   private static int sMaxNucResiduesPerLine = 60;
052   private static int sMaxDescriptionCharsPerLine = 68;
053
054   //###########################################################################
055   // CONSTRUCTORS
056   //###########################################################################
057
058   //---------------------------------------------------------------------------
059   public WIPO_ST_25()
060   {
061   }
062
063   //---------------------------------------------------------------------------
064   public WIPO_ST_25(BioSequenceFactory<T> inSeqFactory)
065   {
066   }
067
068   //###########################################################################
069   // PUBLIC METHODS
070   //###########################################################################
071
072   //---------------------------------------------------------------------------
073   public String write(Collection<T> inSeqs)
074         throws SeqIOException
075   {
076      // Reset the SEQ ID NO.
077      resetSeqIdNo();
078
079      StringWriter writer = new StringWriter();
080
081      for (T seq : inSeqs)
082      {
083         innerWrite(seq, writer);
084      }
085
086      return writer.toString();
087   }
088
089   //---------------------------------------------------------------------------
090   public void write(Collection<T> inSeqs, Writer inWriter)
091         throws SeqIOException
092   {
093      // Reset the SEQ ID NO.
094      resetSeqIdNo();
095
096      for (T seq : inSeqs)
097      {
098         innerWrite(seq, inWriter);
099      }
100   }
101
102
103   //---------------------------------------------------------------------------
104   public String write(T inSeq)
105         throws SeqIOException
106   {
107      // Reset the SEQ ID NO.
108      resetSeqIdNo();
109
110      StringWriter writer = new StringWriter();
111
112      innerWrite(inSeq, writer);
113
114      return writer.toString();
115   }
116
117   //---------------------------------------------------------------------------
118   public void write(T inSeq, OutputStream inStream)
119         throws SeqIOException
120   {
121      // Reset the SEQ ID NO.
122      resetSeqIdNo();
123
124      Writer writer = new OutputStreamWriter(inStream);
125
126      innerWrite(inSeq, writer);
127      try
128      {
129         writer.flush();
130      }
131      catch (Exception e)
132      {
133         throw new SeqIOException(e);
134      }
135   }
136
137   //---------------------------------------------------------------------------
138   public void write(T inSeq, Writer inWriter)
139         throws SeqIOException
140   {
141      // Reset the SEQ ID NO.
142      resetSeqIdNo();
143
144      innerWrite(inSeq, inWriter);
145   }
146
147
148   //---------------------------------------------------------------------------
149   public void writeInitialIdentifiers(OutputStream inStream, int inNumSeqs)
150         throws SeqIOException
151   {
152      Writer writer = new OutputStreamWriter(inStream);
153
154      writeInitialIdentifiers(writer, inNumSeqs);
155
156      try
157      {
158         writer.flush();
159      }
160      catch (Exception e)
161      {
162         throw new SeqIOException(e);
163      }
164   }
165
166   //---------------------------------------------------------------------------
167   public void writeInitialIdentifiers(Writer inWriter, int inNumSeqs)
168         throws SeqIOException
169   {
170      try
171      {
172         inWriter.write(String.format("<110>%s%s\n", mSpacer, "Insert applicant name(s) here"));
173         // Blank line between sections
174         inWriter.write("\n");
175
176         inWriter.write(String.format("<120>%s%s\n", mSpacer, "Insert title of invention here"));
177         // Blank line between sections
178         inWriter.write("\n");
179
180         inWriter.write(String.format("<130>%s%s\n", mSpacer, "Insert file reference here"));
181         // Blank line between sections
182         inWriter.write("\n");
183
184         inWriter.write(String.format("<160>%s%d\n", mSpacer, inNumSeqs));
185         // Blank line between sections
186         inWriter.write("\n");
187
188         inWriter.write(String.format("<170>%s%s\n", mSpacer, "Insert name of software here"));
189         // Blank line between sections
190         inWriter.write("\n");
191      }
192      catch (IOException e)
193      {
194         throw new SeqIOException("Problem adding initial identifiers!", e);
195      }
196   }
197
198
199
200   //###########################################################################
201   // PRIVATE METHODS
202   //###########################################################################
203
204
205   //---------------------------------------------------------------------------
206   private void innerWrite(T inSeq, Writer inWriter)
207         throws SeqIOException
208   {
209      Reader seqReader = null;
210      BufferedWriter writer = null;
211      try
212      {
213         try
214         {
215            if (writer instanceof BufferedWriter)
216            {
217               writer = (BufferedWriter) inWriter;
218            } else
219            {
220               writer = new BufferedWriter(inWriter, 8196);
221            }
222
223            // Write the SEQ ID NO line
224            writer.write(String.format("<210>%s%d\n", mSpacer, mSeqIdNo));
225            // Write the sequence length line
226            writer.write(String.format("<211>%s%d\n", mSpacer, inSeq.length()));
227            // Write the sequence length line
228            writer.write(String.format("<212>%s%s\n", mSpacer, getSeqTypeString(inSeq.getType())));
229            // Write the organism line
230            writer.write(String.format("<213>%s%s\n", mSpacer, getOrganismString(inSeq)));
231
232            // Blank line between sections
233            writer.write("\n");
234
235            if (StringUtil.isSet(inSeq.getDescription()))
236            {
237               writer.write("<220>\n");
238               writeOtherInformation(writer, inSeq.getDescription());
239
240               // Blank line between sections
241               writer.write("\n");
242            }
243
244
245            // Write the sequence line
246            writer.write(String.format("<400>%s%d\n", mSpacer, mSeqIdNo));
247
248            switch (inSeq.getType())
249            {
250               case PROTEIN:
251                  writeProtSequence(writer, inSeq);
252                  break;
253               case NUCLEIC_ACID:
254                  writeNucSequence(writer, inSeq);
255                  break;
256               default:
257                  throw new SeqIOException("BioSequenceType " + inSeq.getType() + " is not currently supported!");
258            }
259
260
261            mSeqIdNo++;
262         }
263         finally
264         {
265            if (seqReader != null)
266            {
267               seqReader.close();
268            }
269
270            if (writer != null)
271            {
272               writer.flush();
273            }
274         }
275      }
276      catch (SeqIOException e)
277      {
278         throw e;
279      }
280      catch (Exception e)
281      {
282         throw new SeqIOException(e);
283      }
284   }
285
286   //---------------------------------------------------------------------------
287   private String getSeqTypeString(BioSequenceType inType)
288   {
289      String value;
290      switch (inType)
291      {
292         case PROTEIN:
293            value = "PRT";
294            break;
295         case NUCLEIC_ACID:
296            value = "DNA";
297            break;
298         default:
299            throw new SeqIOException("BioSequenceType " + inType + " is not currently supported!");
300      }
301
302      return value;
303   }
304
305   //---------------------------------------------------------------------------
306   private String getOrganismString(BioSequence inSeq)
307   {
308      String value = "Unknown";
309      if (inSeq instanceof BioSequencePlus)
310      {
311         NCBITaxon taxon = ((BioSequencePlus) inSeq).getNCBITaxon();
312         if (taxon != null)
313         {
314            if (taxon.equals(NCBITaxon.SYNTHETIC_CONSTRUCT))
315            {
316               value = "Artificial Sequence";
317            }
318            else if (taxon != NCBITaxon.UNKNOWN)
319            {
320               value = taxon.getScientificName();
321            }
322         }
323      }
324
325      return value;
326   }
327
328   //---------------------------------------------------------------------------
329   private void writeOtherInformation(Writer inWriter, String inDescription)
330         throws IOException
331   {
332      String wrappedDescription = StringUtil.wrap(inDescription, sMaxDescriptionCharsPerLine);
333      wrappedDescription = StringUtil.replaceAll(wrappedDescription, "\n", "\n     " + mSpacer);
334
335      inWriter.write(String.format("<223>%s%s\n", mSpacer, wrappedDescription));
336   }
337
338   //---------------------------------------------------------------------------
339   private void writeProtSequence(Writer inWriter, BioSequence inSeq)
340      throws IOException
341   {
342      Reader seqReader = null;
343
344      try
345      {
346         StringBuilderPlus seqLineBuffer = new StringBuilderPlus().setDelimiter("   ");
347         StringBuilderPlus numLineBuffer = new StringBuilderPlus().setDelimiter("   ");
348
349         seqReader = inSeq.getSequenceReader();
350         char[] buffer = new char[sMaxProtResiduesPerLine];
351         int numBytesRead;
352         int residueNum = 1;
353         while ((numBytesRead = seqReader.read(buffer)) != -1)
354         {
355            numLineBuffer.append(StringUtil.polyChar(' ', numBytesRead * 3 * 2 - 3));
356
357            for (int i = 0; i < numBytesRead; i++)
358            {
359               AminoAcid aa = AminoAcidSet.STANDARD.getAA(buffer[i]);
360               seqLineBuffer.delimitedAppend(aa.getThreeLetterCode());
361
362               if (1 == residueNum
363                   || residueNum%5 == 0)
364               {
365                  int numLength = (residueNum + "").length();
366
367                  int start = (i * 6) + 2 - numLength + 1;
368                  if (start < 0)
369                  {
370                     start = 0;
371                  }
372
373                  numLineBuffer.replace(start, start + numLength - 1, residueNum + "");
374               }
375
376               residueNum++;
377            }
378            seqLineBuffer.append("\n");
379            numLineBuffer.append("\n");
380
381            inWriter.write(seqLineBuffer.toString());
382            inWriter.write(numLineBuffer.toString());
383            inWriter.write("\n");
384
385            seqLineBuffer.setLength(0);
386            numLineBuffer.setLength(0);
387         }
388
389         inWriter.flush();
390      }
391      finally
392      {
393         if (seqReader != null)
394         {
395            seqReader.close();
396         }
397      }
398   }
399
400   //---------------------------------------------------------------------------
401   private void writeNucSequence(Writer inWriter, BioSequence inSeq)
402      throws IOException
403   {
404      Reader seqReader = null;
405
406      try
407      {
408         StringBuilderPlus seqLineBuffer = new StringBuilderPlus().setDelimiter("   ");
409
410         seqReader = inSeq.getSequenceReader();
411         char[] buffer = new char[sMaxNucResiduesPerLine];
412         int numBytesRead;
413         int residueNum = 1;
414         while ((numBytesRead = seqReader.read(buffer)) != -1)
415         {
416            for (int i = 0; i < numBytesRead; i++)
417            {
418               seqLineBuffer.append(buffer[i]);
419
420               if (residueNum%10 == 0
421                     && i < numBytesRead - 1)
422               {
423                  seqLineBuffer.append("   ");
424               }
425
426               residueNum++;
427            }
428
429            seqLineBuffer.append(String.format(" %11d\n", residueNum - 1));
430
431            inWriter.write(seqLineBuffer.toString());
432            inWriter.write("\n");
433
434            seqLineBuffer.setLength(0);
435         }
436
437         inWriter.flush();
438      }
439      finally
440      {
441         if (seqReader != null)
442         {
443            seqReader.close();
444         }
445      }
446   }
447
448   //---------------------------------------------------------------------------
449   private void resetSeqIdNo()
450   {
451      // Reset the SEQ ID NO.
452      mSeqIdNo = 1;
453   }
454}