001package com.hfg.chem.format;
002
003import java.util.ArrayList;
004import java.util.HashMap;
005import java.util.List;
006import java.util.Map;
007
008import com.hfg.chem.Atom;
009import com.hfg.chem.CovalentBond;
010import com.hfg.chem.Element;
011import com.hfg.chem.Isotope;
012import com.hfg.chem.MolecularStructureException;
013import com.hfg.chem.Molecule;
014import com.hfg.chem.ValenceModel;
015import com.hfg.util.StringUtil;
016import com.hfg.util.collection.CollectionUtil;
017
018
019//------------------------------------------------------------------------------
020/**
021 Class for parsing a Simplified Molecular-Input Line-Entry System (SMILES)
022 string into a Molecule.
023
024 @see <a href='http://opensmiles.org/'>opensmiles.org</a>
025 <div>
026  @author J. Alex Taylor, hairyfatguy.com
027 </div>
028 */
029//------------------------------------------------------------------------------
030// com.hfg Library
031//
032// This library is free software; you can redistribute it and/or
033// modify it under the terms of the GNU Lesser General Public
034// License as published by the Free Software Foundation; either
035// version 2.1 of the License, or (at your option) any later version.
036//
037// This library is distributed in the hope that it will be useful,
038// but WITHOUT ANY WARRANTY; without even the implied warranty of
039// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
040// Lesser General Public License for more details.
041//
042// You should have received a copy of the GNU Lesser General Public
043// License along with this library; if not, write to the Free Software
044// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
045//
046// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
047// jataylor@hairyfatguy.com
048//------------------------------------------------------------------------------
049// SOME PARSING RULES:
050// - A SMILES string is terminated by a whitespace terminator character (space, tab,
051//   newline, carriage-return), or by the end of the string.
052// - A SMILES parser should accept at least four digits for the atom class, and the values 0 to 9999.
053// - Ring-closure numbers can be reused.
054
055public class SMILES_Parser
056{
057   private String mSMILES;
058   private int mSMILES_Length;
059   private int mCurrentIndex;
060   private boolean mEND;
061   private Map<Integer, CovalentBond> mRingClosureMap;
062   private ValenceModel mValenceModel = ValenceModel.MDL_2017;  // TODO: Switch to SMILES-specific valence model
063
064   //###########################################################################
065   // PUBLIC METHODS
066   //###########################################################################
067
068   //---------------------------------------------------------------------------
069   public synchronized Molecule parse(String inValue)
070   {
071      Molecule molecule = null;
072
073      if (StringUtil.isSet(inValue))
074      {
075         init();
076
077         mSMILES = inValue.trim();
078         mSMILES_Length = mSMILES.length();
079
080         List<Atom> atoms = new ArrayList<>(10);
081
082         try
083         {
084            Atom atom = parseAtom();
085
086            atoms.add(atom);
087
088            Integer hCount = atom.getHCount();
089            if (hCount != null)
090            {
091               for (int i = 0; i < hCount; i++)
092               {
093                  Atom hAtom = new Atom(Element.HYDROGEN);
094                  atoms.add(hAtom);
095                  CovalentBond hBond = new CovalentBond(atom, hAtom);
096                  atom.addBond(hBond);
097                  hAtom.addBond(hBond);
098               }
099            }
100
101
102            List<Atom> atomList = parseSection(atom, (char) -1);
103            if (CollectionUtil.hasValues(atomList))
104            {
105               atoms.addAll(atomList);
106            }
107
108            // Check to see if any rings were left open
109            if (CollectionUtil.hasValues(mRingClosureMap))
110            {
111               throw new SMILES_ParseException("Ring " + mRingClosureMap.keySet().iterator().next() + " left unclosed in " + mSMILES + "!");
112            }
113
114            // We've finished parsing the string. Now build the molecule.
115            molecule = new Molecule().addAtoms(atoms);
116
117            // Set implicit hydrogen counts
118            for (Atom molAtom : new ArrayList<>(molecule.getAtoms()))
119            {
120               int implicitHCount = mValenceModel.calculateImplicitHCount(molAtom);
121               if (implicitHCount > 0)
122               {
123                  for (int i = 0; i < implicitHCount; i++)
124                  {
125                     Atom hAtom = new Atom(Element.HYDROGEN);
126                     molecule.addAtom(hAtom);
127                     CovalentBond bond = new CovalentBond(molAtom, hAtom);
128                     molAtom.addBond(bond);
129                     hAtom.addBond(bond);
130                  }
131               }
132            }
133         }
134         catch (MolecularStructureException e)
135         {
136            throw new SMILES_ParseException("Problem parsing " + StringUtil.singleQuote(mSMILES) + "!", e);
137         }
138      }
139      
140      return molecule;
141   }
142
143   //###########################################################################
144   // PRIVATE METHODS
145   //###########################################################################
146
147   //---------------------------------------------------------------------------
148   private void init()
149   {
150      mCurrentIndex = 0;
151      mEND = false;
152      mRingClosureMap = new HashMap<>(4);
153   }
154
155   //---------------------------------------------------------------------------
156   private char peek()
157   {
158      char theChar = (char) -1;
159      if (mCurrentIndex < mSMILES_Length)
160      {
161         theChar = mSMILES.charAt(mCurrentIndex);
162      }
163      else
164      {
165         mEND = true;
166      }
167
168      return theChar;
169   }
170
171   //---------------------------------------------------------------------------
172   private char nextChar()
173   {
174      char theChar;
175      if (mCurrentIndex >= mSMILES_Length)
176      {
177         mEND = true;
178         theChar = (char) -1;
179      }
180      else
181      {
182         theChar = mSMILES.charAt(mCurrentIndex++);
183         if (mCurrentIndex >= mSMILES_Length)
184         {
185            mEND = true;
186         }
187      }
188
189      return theChar;
190   }
191
192   //---------------------------------------------------------------------------
193   private List<Atom> parseSection(Atom inPrevAtom, char inEndingChar)
194   {
195      List<Atom> atoms = new ArrayList<>(5);
196
197      CovalentBond bond = null;
198      Atom atom;
199      Atom prevAtom = inPrevAtom;
200      Integer hCount;
201      boolean ionicBond = false;
202
203      char theChar = peek();
204
205      while (! mEND
206             && theChar != inEndingChar)
207      {
208         if ('[' == theChar
209             || Character.isLetter(theChar))
210         {
211            // Letter indicates an atom (and a default single bond between them)
212            atom = parseAtom();
213            atoms.add(atom);
214
215            hCount = atom.getHCount();
216            if (hCount != null)
217            {
218               for (int i = 0; i < hCount; i++)
219               {
220                  Atom hAtom = new Atom(Element.HYDROGEN);
221                  atoms.add(hAtom);
222                  CovalentBond hBond = new CovalentBond(atom, hAtom);
223                  atom.addBond(hBond);
224                  hAtom.addBond(hBond);
225               }
226            }
227
228            if (ionicBond)
229            {
230               ionicBond = false;
231            }
232            else
233            {
234               if (null == bond)
235               {
236                  bond = new CovalentBond(prevAtom, atom); // A bond order of 1 is assumed
237                  prevAtom.addBond(bond);
238               }
239               else
240               {
241                  bond.setSecondAtom(atom);
242               }
243
244               atom.addBond(bond);
245            }
246
247            prevAtom = atom;
248            bond = null;
249         }
250         else if ('(' == theChar)
251         {
252            atoms.addAll(parseBranch(prevAtom));
253         }
254         else if (Character.isDigit(theChar)
255                  || '%' == theChar) // '%' preceeds the use of a 2-digit ring-closure number
256         {
257            String ringClosureNumString = "";
258
259            if ('%' == theChar)
260            {
261               nextChar(); // Consume the '%'
262               theChar = peek();
263
264               ringClosureNumString += theChar;
265               nextChar(); // Consume the firstDigit
266               theChar = peek();
267            }
268
269            ringClosureNumString += theChar;
270
271
272            if (! StringUtil.isNumber(ringClosureNumString))
273            {
274               throw new SMILES_ParseException("The '%' at position " + (mCurrentIndex)
275                                               + " of " + StringUtil.singleQuote(mSMILES)
276                                               + " should precede a 2-digit ring-closure number!");
277            }
278
279            // Ring-closure number
280            int ringClosureNum = Integer.parseInt(ringClosureNumString);
281            CovalentBond ringClosureBond = mRingClosureMap.remove(ringClosureNum);
282            if (ringClosureBond != null)
283            {
284               // Don't leave a half-processed bond (we didn't know yet that it was a ring bond)
285               Integer specifiedBondOrder = null;
286               if (bond != null)
287               {
288                  specifiedBondOrder = bond.getSpecifiedBondOrder();
289                  prevAtom.removeBond(bond);
290
291                  if (specifiedBondOrder != null
292                      && ringClosureBond.getSpecifiedBondOrder() != null
293                      && ! ringClosureBond.getSpecifiedBondOrder().equals(specifiedBondOrder))
294                  {
295                     throw new SMILES_ParseException("Ring-closure bond mismatch at position " + (mCurrentIndex + 1) + " of " + mSMILES + "!");
296                  }
297
298                  ringClosureBond.setBondOrder(specifiedBondOrder);
299               }
300
301               // Close the ring
302               ringClosureBond.setSecondAtom(prevAtom);
303               prevAtom.addBond(ringClosureBond);
304            }
305            else
306            {
307               // Start the ring
308               if (null == bond)
309               {
310                  bond = new CovalentBond(prevAtom, null);
311                  prevAtom.addBond(bond);
312                  mRingClosureMap.put(ringClosureNum, bond);
313               }
314
315               mRingClosureMap.put(ringClosureNum, bond);
316               bond = null;
317            }
318
319            nextChar();   // Consume the peeked char
320
321         }
322         else
323         {
324            switch (theChar)
325            {
326               case '-': // Single bond
327                  bond = new CovalentBond(prevAtom, 1);
328                  prevAtom.addBond(bond);
329                  break;
330
331               case '=': // Double bond
332                  bond = new CovalentBond(prevAtom, 2);
333                  prevAtom.addBond(bond);
334                  break;
335
336               case '#': // Triple bond
337                  bond = new CovalentBond(prevAtom, 3);
338                  prevAtom.addBond(bond);
339                  break;
340
341               case '$': // Quadruple bond
342                  bond = new CovalentBond(prevAtom, 4);
343                  prevAtom.addBond(bond);
344                  break;
345
346               case ':': // aromatic bond
347                  bond = new CovalentBond(prevAtom, 1).setIsAromatic();
348                  prevAtom.addBond(bond);
349                  break;
350
351               case '.': // Ionic bond (non-covalent)
352                  bond = null;
353                  ionicBond = true;
354                  break;
355
356               case '/': // Cis/Trans "up" bond
357                  bond = new CovalentBond(prevAtom, 1).setIsUp();
358                  prevAtom.addBond(bond);
359                  break;
360
361               case '\\': // Cis/Trans "down" bond
362                  bond = new CovalentBond(prevAtom, 1).setIsDown();
363                  prevAtom.addBond(bond);
364                  break;
365
366               case ' ':
367               case '\t':
368               case '\r':
369               case '\n':
370               case (char) -1:
371                  mEND = true;
372                  break;
373
374               default:
375                  throw new SMILES_ParseException("Unexpected character '" + theChar + "' at position " + (mCurrentIndex + 1) + " in " + StringUtil.singleQuote(mSMILES) + "!");
376            }
377
378            nextChar();  // Consume the peeked char
379         }
380
381         theChar = peek();
382      }
383
384      if (theChar == inEndingChar)
385      {
386         nextChar(); // Consume it
387      }
388
389      return atoms;
390   }
391
392   //---------------------------------------------------------------------------
393   private Atom parseAtom()
394   {
395      char theChar = nextChar();
396      boolean inBrackets = (theChar == '[');
397      if (inBrackets)
398      {
399         theChar = nextChar();
400      }
401
402      String isotopeString = "";
403
404      while (! mEND
405             && Character.isDigit(theChar))
406      {
407         isotopeString += theChar;
408         theChar = nextChar();
409      }
410
411      String symbol = "";
412      boolean aromatic = false;
413
414      if (Character.isLowerCase(theChar))
415      {
416         // Lowercase symbols indicate aromatic atoms
417
418         if ("bcnospat".indexOf(theChar) < 0)  // s for S or Se; a for As, t for Te
419         {
420            throw new SMILES_ParseException("Unexpected aromatic symbol '" + theChar + "' at position " + (mCurrentIndex + 1) + " of " + StringUtil.singleQuote(mSMILES) + "!");
421         }
422
423         symbol += Character.toUpperCase(theChar);
424         aromatic = true;
425      }
426      else
427      {
428         symbol += theChar;
429      }
430
431      theChar = peek();
432      if (inBrackets || ! aromatic)
433      {
434         // Two char element symbols are possible
435
436         if (Character.isLetter(theChar))
437         {
438            // It could be a second letter of an element symbol or it could be the next element
439            if (Element.valueOf(symbol + theChar) != null)
440            {
441               symbol += theChar;
442               nextChar();   // Consume the second symbol letter
443               theChar = peek();
444            }
445         }
446      }
447
448      Element element = Element.valueOf(symbol);
449
450      if (isotopeString.length() > 0)
451      {
452         element = Isotope.valueOf(element, Integer.parseInt(isotopeString));
453      }
454
455      Atom atom = new Atom(element);
456      if (aromatic)
457      {
458         atom.setIsAromatic(true);
459      }
460
461      // Chirality (optional)
462      if ('@' == theChar)
463      {
464         Atom.ChiralityOrder chiralityOrder = Atom.ChiralityOrder.anticlockwise;
465         nextChar();   // Consume the character
466
467         theChar = peek();
468         if ('@' == theChar)
469         {
470            chiralityOrder = Atom.ChiralityOrder.clockwise;
471            nextChar();   // Consume the character
472            theChar = peek();
473         }
474
475         atom.setChiralityOrder(chiralityOrder);
476      }
477
478      // Explicit hydrogen count (optional)
479      if ('H' == theChar)
480      {
481         int hCount = 1;
482         nextChar();   // Consume the character
483         theChar = peek();
484         if (Character.isDigit(theChar))
485         {
486            hCount = Integer.parseInt(theChar + "");
487            nextChar();   // Consume the character
488            theChar = peek();
489         }
490
491         atom.setHCount(hCount);
492      }
493      else if (inBrackets)
494      {
495         atom.setHCount(0);
496      }
497
498      // Charge (optional)
499      if (inBrackets)
500      {
501         if ('+' == theChar)
502         {
503            int charge = 1;
504            nextChar();   // Consume the character
505            theChar = peek();
506
507
508            if ('+' == theChar) // ++
509            {
510               charge = 2;
511               nextChar();   // Consume the character
512               theChar = peek();
513            }
514            else
515            {
516               String chargeString = "";
517               while (Character.isDigit(theChar))
518               {
519                  chargeString += theChar;
520                  nextChar();   // Consume the character
521                  theChar = peek();
522               }
523
524               if (chargeString.length() > 0)
525               {
526                  charge = Integer.parseInt(chargeString);
527               }
528            }
529
530            atom.setCharge(charge);
531         }
532         else if ('-' == theChar)
533         {
534            int charge = -1;
535            nextChar();   // Consume the character
536            theChar = peek();
537
538
539            if ('-' == theChar) // ++
540            {
541               charge = -2;
542               nextChar();   // Consume the character
543               theChar = peek();
544            }
545            else
546            {
547               String chargeString = "";
548               while (Character.isDigit(theChar))
549               {
550                  chargeString += theChar;
551                  nextChar();   // Consume the character
552                  theChar = peek();
553               }
554
555               if (chargeString.length() > 0)
556               {
557                  charge = -1 * Integer.parseInt(chargeString);
558               }
559            }
560
561            atom.setCharge(charge);
562         }
563
564         // Atom class
565         if (':' == theChar)
566         {
567            nextChar();   // Consume the character
568            theChar = peek();
569
570            String classString = "";
571
572            while (Character.isDigit(theChar))
573            {
574               classString += theChar;
575               nextChar();   // Consume the character
576               theChar = peek();
577            }
578
579            atom.setAtomClass(Integer.parseInt(classString));
580         }
581      }
582
583      if (']' == theChar)
584      {
585         if (inBrackets)
586         {
587            // We were expecting this ending bracket
588            ++mCurrentIndex;
589         }
590         else
591         {
592            throw new SMILES_ParseException("Unexpected ending atom bracket at position " + (mCurrentIndex + 1) + "!");
593         }
594      }
595      else if (inBrackets)
596      {
597         throw new SMILES_ParseException("Missing ending atom bracket at position " + (mCurrentIndex + 1) + "!");
598      }
599
600      return atom;
601   }
602
603   //---------------------------------------------------------------------------
604   private List<Atom> parseBranch(Atom inPrevAtom)
605   {
606      char theChar = nextChar();
607      if ('(' != theChar)
608      {
609         throw new SMILES_ParseException("SMILES branch missing starting parenthesis at position " + (mCurrentIndex + 1) + "!");
610      }
611
612      return parseSection(inPrevAtom, ')');
613   }
614}