001package com.hfg.chem.format;
002
003
004import java.io.BufferedReader;
005import java.util.ArrayList;
006import java.util.List;
007import java.util.regex.Matcher;
008import java.util.regex.Pattern;
009
010import com.hfg.chem.Atom;
011import com.hfg.chem.CovalentBond;
012import com.hfg.chem.Element;
013import com.hfg.chem.Molecule;
014import com.hfg.bio.seq.format.SeqIOException;
015import com.hfg.chem.ValenceModel;
016import com.hfg.util.StringBuilderPlus;
017import com.hfg.util.StringUtil;
018
019//------------------------------------------------------------------------------
020/**
021 Basic implementation of the MDL SDF format.
022 <div>
023  @author J. Alex Taylor, hairyfatguy.com
024 </div>
025 */
026//------------------------------------------------------------------------------
027// com.hfg Library
028//
029// This library is free software; you can redistribute it and/or
030// modify it under the terms of the GNU Lesser General Public
031// License as published by the Free Software Foundation; either
032// version 2.1 of the License, or (at your option) any later version.
033//
034// This library is distributed in the hope that it will be useful,
035// but WITHOUT ANY WARRANTY; without even the implied warranty of
036// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
037// Lesser General Public License for more details.
038//
039// You should have received a copy of the GNU Lesser General Public
040// License along with this library; if not, write to the Free Software
041// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
042//
043// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
044// jataylor@hairyfatguy.com
045//------------------------------------------------------------------------------
046
047public class MDL_SDF<T extends Molecule> extends ReadableChemFormatBase<T>
048{
049   private static final Pattern ATTRIBUTE_HEADER_PATTERN = Pattern.compile(">\\s+<(\\S+)>");
050
051   // TODO: Add a way to specify the valence model or whether the MDL file uses the pre or post 2017 valence models
052   private ValenceModel mValenceModel = ValenceModel.MDL_2017;
053
054   private List<String> mMolLines = new ArrayList<>(50);
055   private Integer mAtomCount;
056   private Integer mBondCount;
057
058
059   //###########################################################################
060   // CONSTRUCTORS
061   //###########################################################################
062
063   //---------------------------------------------------------------------------
064   public MDL_SDF()
065   {
066      super(null);
067   }
068
069   //---------------------------------------------------------------------------
070   public MDL_SDF(MoleculeFactory<T> inMoleculeFactory)
071   {
072      super(inMoleculeFactory);
073   }
074
075   //###########################################################################
076   // PUBLIC METHODS
077   //###########################################################################
078
079   //---------------------------------------------------------------------------
080   @Override
081   public boolean hasJanusDelimiter()
082   {
083      return false;
084   }
085
086   //---------------------------------------------------------------------------
087   @Override
088   public boolean isEndOfRecord(String inLine)
089   {
090      return inLine.trim().equals("$$$$");
091   }
092
093   //---------------------------------------------------------------------------
094   @Override
095   public synchronized T readRecord(BufferedReader inReader) throws ChemIOException
096   {
097      if (null == getMoleculeFactory())
098      {
099         throw new SeqIOException("No BioSequence factory has been specified!");
100      }
101
102      T mol;
103      try
104      {
105         mol = getMoleculeFactory().createMoleculeObj();
106
107         mMolLines.clear();
108         mAtomCount = null;
109         mBondCount = null;
110
111         StringBuilderPlus currentAttributeValue = new StringBuilderPlus().setDelimiter("\n");
112         String currentAttributeName = null;
113         String line;
114         boolean structureComplete = false;
115
116         int lineCount = 1;
117         while ((line = inReader.readLine()) != null)
118         {
119            if (! structureComplete)
120            {
121               // The might not be an molfile section
122               Matcher m = ATTRIBUTE_HEADER_PATTERN.matcher(line);
123               if (m.matches())
124               {
125                  structureComplete = true;
126
127                  if (mMolLines.size() > 0)
128                  {
129                     setStructure(mol, mMolLines);
130                  }
131               }
132               else
133               {
134                  mMolLines.add(line);
135
136                  if (line.trim().matches("M\\s+END"))
137                  {
138                     structureComplete = true;
139                     setStructure(mol, mMolLines);
140                     continue;
141                  }
142               }
143            }
144
145            if (structureComplete)
146            {
147               line = line.trim();
148
149               // The molecule record can optionally be followed by attributes
150
151               if (!StringUtil.isSet(line)) // A blank line is used to separate attributes
152               {
153                  if (StringUtil.isSet(currentAttributeName))
154                  {
155                     mol.setAttribute(currentAttributeName, (currentAttributeValue.length() > 0 ? currentAttributeValue.toString() : null));
156                     currentAttributeName = null;
157                  }
158               }
159               else
160               {
161                  Matcher m = ATTRIBUTE_HEADER_PATTERN.matcher(line);
162                  if (m.matches())
163                  {
164                     if (currentAttributeName != null)
165                     {
166                        // The blank line between attributes may have been missing
167                        mol.setAttribute(currentAttributeName, (currentAttributeValue.length() > 0 ? currentAttributeValue.toString() : null));
168                     }
169
170                     currentAttributeName = m.group(1);
171                     currentAttributeValue.setLength(0);
172                  }
173                  else if (currentAttributeName != null)
174                  {
175                     currentAttributeValue.delimitedAppend(line);
176                  }
177               }
178            }
179         }
180      }
181      catch (Exception e)
182      {
183         if (e instanceof ChemIOException)
184         {
185            throw (ChemIOException) e;
186         }
187         else
188         {
189            throw new ChemIOException(e);
190         }
191      }
192
193      return mol;
194   }
195
196   //---------------------------------------------------------------------------
197   private void setStructure(T inMolecule, List<String> inMolLines)
198   {
199//      inMolecule.setAttribute("molfile", inMolLines);
200
201      parseMolHeader(inMolecule, inMolLines);
202
203      // Examine the Counts line
204      // Ex: '  9  8  0     0  0  0  0  0  0999 V2000'
205      parseCountsLine(inMolecule, inMolLines);
206
207      parseAtomsBlock(inMolecule, inMolLines);
208
209      parseBondsBlock(inMolecule, inMolLines);
210
211      parseProperties(inMolecule, inMolLines);
212      
213      if (inMolecule.getAtoms() != null)
214      {
215         // Set implicit hydrogen counts
216         for (Atom molAtom : new ArrayList<>(inMolecule.getAtoms()))
217         {
218            int implicitHCount = mValenceModel.calculateImplicitHCount(molAtom);
219            if (implicitHCount > 0)
220            {
221               for (int i = 0; i < implicitHCount; i++)
222               {
223                  Atom hAtom = new Atom(Element.HYDROGEN);
224                  inMolecule.addAtom(hAtom);
225                  CovalentBond bond = new CovalentBond(molAtom, hAtom);
226                  molAtom.addBond(bond);
227                  hAtom.addBond(bond);
228               }
229            }
230         }
231      }
232   }
233
234   //---------------------------------------------------------------------------
235   private void parseMolHeader(T inMolecule, List<String> inMolLines)
236   {
237      if (inMolLines.size() >= 3)
238      {
239         // The first 3 line constitute the record header
240
241         // The first line may contain the name of the molecule
242         String structureStringName = inMolLines.get(0).trim();
243         if (StringUtil.isSet(structureStringName)
244             && ! structureStringName.equals("NO STRUCTURE"))
245         {
246            inMolecule.setName(structureStringName);
247         }
248
249         // Line 2 optionally contains the details of the software used to generate the record
250
251         // Line 3 contains an optional comment
252      }
253   }
254
255   //---------------------------------------------------------------------------
256   private void parseCountsLine(T inMolecule, List<String> inMolLines)
257   {
258      // The 4th line may contain the Counts line.
259      // The Counts line is composed of 12 fixed-length fields - the first eleven
260      //  are 3 characters long, and the last 6 characters long.
261      //  The first two fields are the number of atoms and bonds respectively.
262      // Ex: '  9  8  0     0  0  0  0  0  0999 V2000'
263      if (inMolLines.size() >= 4)
264      {
265         String countsLine = inMolLines.get(3);
266         if (StringUtil.isSet(countsLine))
267         {
268            if (countsLine.length() != 39)
269            {
270               throw new ChemIOException("Unexpected Counts line length for " + StringUtil.singleQuote(countsLine) + "!");
271            }
272
273            // The first field is the number of atoms
274            mAtomCount = Integer.parseInt(countsLine.substring(0, 3).trim());
275            // The first field is the number of bonds
276            mBondCount = Integer.parseInt(countsLine.substring(3, 6).trim());
277
278            // TODO: Chirality should be the 5th field (but 4th could be empty?)
279         }
280      }
281   }
282
283   //---------------------------------------------------------------------------
284   private void parseAtomsBlock(T inMolecule, List<String> inMolLines)
285   {
286      // The 5th line may be the first atom line
287      // Ex: '    1.9050   -0.7932    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0'
288      if (mAtomCount != null
289          && inMolLines.size() >= 4 + mAtomCount)
290      {
291         for (int i = 0; i < mAtomCount; i++)
292         {
293            String atomLine = inMolLines.get(4 + i);
294            String[] fields = atomLine.trim().split("\\s+");
295
296            Element element = Element.valueOf(fields[3]);
297            Atom atom = new Atom(element)
298                  .setXCoordinate(Float.valueOf(fields[0]))
299                  .setYCoordinate(Float.valueOf(fields[1]))
300                  .setZCoordinate(Float.valueOf(fields[2]));
301
302            int chargeValue = Integer.parseInt(fields[5]);
303            if (chargeValue != 0)
304            {
305               int charge = 0;
306               switch (chargeValue)
307               {
308                  case 7:
309                     charge = -3;
310                     break;
311                  case 6:
312                     charge = -2;
313                     break;
314                  case 5:
315                     charge = -1;
316                     break;
317                  case 3:
318                     charge = 1;
319                     break;
320                  case 2:
321                     charge = 2;
322                     break;
323                  case 1:
324                     charge = 3;
325                     break;
326                  // TODO: 4 ==> Doublet radical
327               }
328
329               atom.setCharge(charge);
330            }
331
332            inMolecule.addAtom(atom);
333         }
334      }
335   }
336
337   //---------------------------------------------------------------------------
338   private void parseBondsBlock(T inMolecule, List<String> inMolLines)
339   {
340      // Bond lines may follow the atom lines
341      // Ex: '  2  1  1  0  0  0  0'
342      if (mBondCount != null
343          && inMolLines.size() >= 4 + mAtomCount + mBondCount)
344      {
345         List<Atom> atoms = inMolecule.getAtoms();
346
347         for (int i = 0; i < mBondCount; i++)
348         {
349            String bondLine = inMolLines.get(4 + mAtomCount + i);
350            String[] fields = bondLine.trim().split("\\s+");
351
352            int atom1Num = Integer.parseInt(fields[0]);
353            int atom2Num = Integer.parseInt(fields[1]);
354
355            Atom atom1 = atoms.get(atom1Num - 1);
356            Atom atom2 = atoms.get(atom2Num - 1);
357
358            CovalentBond bond = new CovalentBond(atom1, atom2);
359
360            int bondOrder = Integer.parseInt(fields[2]);
361            if (bondOrder <= 3)
362            {
363               bond.setBondOrder(bondOrder);
364            }
365            else if (bondOrder == 4)
366            {
367               bond.setBondOrder(1);
368               atom1.setIsAromatic(true);
369               atom2.setIsAromatic(true);
370               bond.setIsAromatic();
371            }
372
373            atom1.addBond(bond);
374            atom2.addBond(bond);
375         }
376      }
377   }
378
379   //---------------------------------------------------------------------------
380   private void parseProperties(T inMolecule, List<String> inMolLines)
381   {
382      // Property lines may follow the Bonds block and will start with an 'M'
383      // Charge Ex: 'M  CHG  1   1   2'
384      // Isotope Ex: 'M  ISO  1   1   2'
385      if (mAtomCount != null
386          && mBondCount != null
387          && inMolLines.size() >= 4 + mAtomCount + mBondCount)
388      {
389         for (int i = 4 + mAtomCount + mBondCount; i < inMolLines.size(); i++)
390         {
391            String propertyLine = inMolLines.get(i);
392
393            if (propertyLine.startsWith("M  CHG"))
394            {
395               // Charge
396               // The 1st field specifies the number of defined charges (up to 8).
397               // Ea. defined charge consists of the atom # (1-based) and a charge
398               String[] fields = propertyLine.split("\\s+");
399               for (int index = 3; index < fields.length - 1; index+=2)
400               {
401                  int atomNum = Integer.parseInt(fields[index]);
402                  int charge = Integer.parseInt(fields[index + 1]);
403
404                  Atom atom = inMolecule.getAtoms().get(atomNum - 1);
405                  atom.setCharge(charge);
406               }
407            }
408            else if (propertyLine.startsWith("M  ISO"))
409            {
410               // Isotope
411               // TODO
412            }
413         }
414      }
415   }
416}