package com.hfg.chem.format;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.hfg.chem.Atom;
import com.hfg.chem.CovalentBond;
import com.hfg.chem.Element;
import com.hfg.chem.Isotope;
import com.hfg.chem.MolecularStructureException;
import com.hfg.chem.Molecule;
import com.hfg.chem.ValenceModel;
import com.hfg.util.StringUtil;
import com.hfg.util.collection.CollectionUtil;


//------------------------------------------------------------------------------
/**
 Class for parsing a Simplified Molecular-Input Line-Entry System (SMILES)
 string into a Molecule.
 <p>
 The parser keeps its cursor and ring-closure bookkeeping in instance fields;
 {@link #parse(String)} is synchronized and re-initializes that state on every call.
 </p>

 @see <a href='http://opensmiles.org/'>opensmiles.org</a>
 <div>
 @author J. Alex Taylor, hairyfatguy.com
 </div>
 */
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// jataylor@hairyfatguy.com
//------------------------------------------------------------------------------
// SOME PARSING RULES:
// - A SMILES string is terminated by a whitespace terminator character (space, tab,
//   newline, carriage-return), or by the end of the string.
// - A SMILES parser should accept at least four digits for the atom class, and the values 0 to 9999.
// - Ring-closure numbers can be reused.

public class SMILES_Parser
{
   // Sentinel returned by peek() / nextChar() once the input is exhausted.
   private static final char END_OF_INPUT = (char) -1;

   // Lowercase letters that are complete aromatic atoms on their own when written
   // outside of brackets. Outside of brackets they must never be absorbed as the
   // second letter of the preceding symbol ('Sc1ccccc1' is S + aromatic ring, not scandium).
   private static final String UNBRACKETED_AROMATIC_SYMBOLS = "bcnops";

   private String  mSMILES;
   private int     mSMILES_Length;
   private int     mCurrentIndex;
   private boolean mEND;
   private Map<Integer, CovalentBond> mRingClosureMap;
   private ValenceModel mValenceModel = ValenceModel.MDL_2017; // TODO: Switch to SMILES-specific valence model

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Parses the specified SMILES string into a Molecule.
    <p>
    Explicit hydrogen counts from bracket atoms are expanded into hydrogen atoms while
    parsing; afterwards the valence model is asked for each atom's implicit hydrogen
    count and those hydrogens are added as atoms as well.
    </p>

    @param inValue the SMILES string. It is trimmed before parsing.
    @return the parsed Molecule, or null if StringUtil.isSet() reports the value as not set
    @throws SMILES_ParseException if the string is malformed (unexpected character,
            unclosed ring or branch, bad bracket atom) or if a MolecularStructureException
            is raised while building the structure
    */
   public synchronized Molecule parse(String inValue)
   {
      Molecule molecule = null;

      if (StringUtil.isSet(inValue))
      {
         init();

         mSMILES        = inValue.trim();
         mSMILES_Length = mSMILES.length();

         List<Atom> atoms = new ArrayList<>(10);

         try
         {
            Atom atom = parseAtom();
            atoms.add(atom);
            addExplicitHydrogens(atom, atoms);

            List<Atom> atomList = parseSection(atom, END_OF_INPUT);
            if (CollectionUtil.hasValues(atomList))
            {
               atoms.addAll(atomList);
            }

            // Check to see if any rings were left open
            if (CollectionUtil.hasValues(mRingClosureMap))
            {
               throw new SMILES_ParseException("Ring " + mRingClosureMap.keySet().iterator().next() + " left unclosed in " + mSMILES + "!");
            }

            // We've finished parsing the string. Now build the molecule.
            molecule = new Molecule().addAtoms(atoms);

            // Set implicit hydrogen counts.
            // Iterate over a copy because hydrogens are added to the molecule inside the loop.
            for (Atom molAtom : new ArrayList<>(molecule.getAtoms()))
            {
               int implicitHCount = mValenceModel.calculateImplicitHCount(molAtom);
               for (int i = 0; i < implicitHCount; i++)
               {
                  Atom hAtom = new Atom(Element.HYDROGEN);
                  molecule.addAtom(hAtom);
                  CovalentBond bond = new CovalentBond(molAtom, hAtom);
                  molAtom.addBond(bond);
                  hAtom.addBond(bond);
               }
            }
         }
         catch (MolecularStructureException e)
         {
            throw new SMILES_ParseException("Problem parsing " + StringUtil.singleQuote(mSMILES) + "!", e);
         }
      }

      return molecule;
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Resets the cursor, the end-of-input flag, and the open ring-closure map
    so that the instance can be reused for another parse.
    */
   private void init()
   {
      mCurrentIndex   = 0;
      mEND            = false;
      mRingClosureMap = new HashMap<>(4);
   }

   //---------------------------------------------------------------------------
   /**
    Returns the character at the cursor without consuming it.
    If the cursor is past the end of the string, END_OF_INPUT is returned
    and the end-of-input flag is set.
    */
   private char peek()
   {
      char theChar = END_OF_INPUT;
      if (mCurrentIndex < mSMILES_Length)
      {
         theChar = mSMILES.charAt(mCurrentIndex);
      }
      else
      {
         mEND = true;
      }

      return theChar;
   }

   //---------------------------------------------------------------------------
   /**
    Returns the character at the cursor and advances the cursor.
    The end-of-input flag is set as soon as the last character has been consumed;
    END_OF_INPUT is returned if there was nothing left to consume.
    */
   private char nextChar()
   {
      char theChar;
      if (mCurrentIndex >= mSMILES_Length)
      {
         mEND = true;
         theChar = END_OF_INPUT;
      }
      else
      {
         theChar = mSMILES.charAt(mCurrentIndex++);
         if (mCurrentIndex >= mSMILES_Length)
         {
            mEND = true;
         }
      }

      return theChar;
   }

   //---------------------------------------------------------------------------
   /**
    Expands an atom's explicit hydrogen count (if one was set) into hydrogen atoms,
    each joined to the atom by a new bond, and appends those hydrogens to the list.

    @param inAtom  the atom whose explicit hydrogen count should be expanded
    @param inAtoms the list receiving the newly created hydrogen atoms
    */
   private void addExplicitHydrogens(Atom inAtom, List<Atom> inAtoms)
   {
      Integer hCount = inAtom.getHCount();
      if (hCount != null)
      {
         for (int i = 0; i < hCount; i++)
         {
            Atom hAtom = new Atom(Element.HYDROGEN);
            inAtoms.add(hAtom);
            CovalentBond hBond = new CovalentBond(inAtom, hAtom);
            inAtom.addBond(hBond);
            hAtom.addBond(hBond);
         }
      }
   }

   //---------------------------------------------------------------------------
   /**
    Parses a run of atoms, bonds, ring-closures, and nested branches, starting at
    the cursor and continuing until the ending character, a whitespace terminator,
    or the end of the string is reached. The ending character, if found, is consumed.

    @param inPrevAtom   the atom that the first parsed atom / bond / ring-closure attaches to
    @param inEndingChar ')' when parsing a branch; END_OF_INPUT for the top-level section
    @return the atoms created in this section (including expanded explicit hydrogens
            and the atoms of nested branches)
    @throws SMILES_ParseException on an unexpected character, a malformed ring-closure
            number, a ring-closure bond order mismatch, or an unclosed branch
    */
   private List<Atom> parseSection(Atom inPrevAtom, char inEndingChar)
   {
      List<Atom> atoms = new ArrayList<>(5);

      CovalentBond bond = null;   // A bond whose symbol has been read but whose second atom is still unknown
      Atom atom;
      Atom prevAtom = inPrevAtom;
      boolean ionicBond = false;  // Set by '.', meaning the next atom is not bonded to the previous one

      char theChar = peek();

      while (! mEND
             && theChar != inEndingChar)
      {
         if ('[' == theChar
             || Character.isLetter(theChar))
         {
            // Letter indicates an atom (and a default single bond between them)
            atom = parseAtom();
            atoms.add(atom);
            addExplicitHydrogens(atom, atoms);

            if (ionicBond)
            {
               ionicBond = false;
            }
            else
            {
               if (null == bond)
               {
                  bond = new CovalentBond(prevAtom, atom); // A bond order of 1 is assumed
                  prevAtom.addBond(bond);
               }
               else
               {
                  bond.setSecondAtom(atom);
               }

               atom.addBond(bond);
            }

            prevAtom = atom;
            bond = null;
         }
         else if ('(' == theChar)
         {
            atoms.addAll(parseBranch(prevAtom));
         }
         else if (Character.isDigit(theChar)
                  || '%' == theChar) // '%' precedes the use of a 2-digit ring-closure number
         {
            int ringClosureNum;

            if ('%' == theChar)
            {
               nextChar(); // Consume the '%'
               char tensDigit = peek();

               nextChar(); // Consume the first digit
               char onesDigit = peek();

               // Validate each character individually so that nothing other than
               // exactly two digits can ever reach the numeric conversion.
               if (! Character.isDigit(tensDigit)
                   || ! Character.isDigit(onesDigit))
               {
                  throw new SMILES_ParseException("The '%' at position " + (mCurrentIndex)
                                                  + " of " + StringUtil.singleQuote(mSMILES)
                                                  + " should precede a 2-digit ring-closure number!");
               }

               ringClosureNum = 10 * Character.digit(tensDigit, 10) + Character.digit(onesDigit, 10);
            }
            else
            {
               ringClosureNum = Character.digit(theChar, 10);
            }

            // Removing on lookup is what allows ring-closure numbers to be reused.
            CovalentBond ringClosureBond = mRingClosureMap.remove(ringClosureNum);
            if (ringClosureBond != null)
            {
               // Don't leave a half-processed bond (we didn't know yet that it was a ring bond)
               if (bond != null)
               {
                  Integer specifiedBondOrder = bond.getSpecifiedBondOrder();
                  prevAtom.removeBond(bond);

                  if (specifiedBondOrder != null
                      && ringClosureBond.getSpecifiedBondOrder() != null
                      && ! ringClosureBond.getSpecifiedBondOrder().equals(specifiedBondOrder))
                  {
                     throw new SMILES_ParseException("Ring-closure bond mismatch at position " + (mCurrentIndex + 1) + " of " + mSMILES + "!");
                  }

                  ringClosureBond.setBondOrder(specifiedBondOrder);

                  // The pending bond has been folded into the ring bond. Forget it so that
                  // the next atom gets its own bond instead of reusing this detached one.
                  bond = null;
               }

               // Close the ring
               ringClosureBond.setSecondAtom(prevAtom);
               prevAtom.addBond(ringClosureBond);
            }
            else
            {
               // Start the ring
               if (null == bond)
               {
                  bond = new CovalentBond(prevAtom, null);
                  prevAtom.addBond(bond);
               }

               mRingClosureMap.put(ringClosureNum, bond);
               bond = null;
            }

            nextChar(); // Consume the peeked char (the last digit of the ring-closure number)
         }
         else
         {
            switch (theChar)
            {
               case '-': // Single bond
                  bond = new CovalentBond(prevAtom, 1);
                  prevAtom.addBond(bond);
                  break;

               case '=': // Double bond
                  bond = new CovalentBond(prevAtom, 2);
                  prevAtom.addBond(bond);
                  break;

               case '#': // Triple bond
                  bond = new CovalentBond(prevAtom, 3);
                  prevAtom.addBond(bond);
                  break;

               case '$': // Quadruple bond
                  bond = new CovalentBond(prevAtom, 4);
                  prevAtom.addBond(bond);
                  break;

               case ':': // aromatic bond
                  bond = new CovalentBond(prevAtom, 1).setIsAromatic();
                  prevAtom.addBond(bond);
                  break;

               case '.': // Ionic bond (non-covalent)
                  bond = null;
                  ionicBond = true;
                  break;

               case '/': // Cis/Trans "up" bond
                  bond = new CovalentBond(prevAtom, 1).setIsUp();
                  prevAtom.addBond(bond);
                  break;

               case '\\': // Cis/Trans "down" bond
                  bond = new CovalentBond(prevAtom, 1).setIsDown();
                  prevAtom.addBond(bond);
                  break;

               case ' ':
               case '\t':
               case '\r':
               case '\n':
               case END_OF_INPUT:
                  // Whitespace terminates the SMILES string
                  mEND = true;
                  break;

               default:
                  throw new SMILES_ParseException("Unexpected character '" + theChar + "' at position " + (mCurrentIndex + 1) + " in " + StringUtil.singleQuote(mSMILES) + "!");
            }

            nextChar(); // Consume the peeked char
         }

         theChar = peek();
      }

      if (theChar == inEndingChar)
      {
         nextChar(); // Consume it
      }
      else if (')' == inEndingChar)
      {
         // The input ended while a branch was still open.
         // Unclosed rings are rejected, so unclosed branches are rejected as well.
         throw new SMILES_ParseException("SMILES branch missing closing parenthesis in " + StringUtil.singleQuote(mSMILES) + "!");
      }

      return atoms;
   }

   //---------------------------------------------------------------------------
   /**
    Parses a single atom starting at the cursor. The atom is either a bare symbol
    or a bracket atom of the form
    '[' isotope? symbol chirality? hcount? charge? (':' class)? ']'.

    @return the parsed atom. For a bracket atom without an 'H' the hydrogen count is set to 0.
    @throws SMILES_ParseException for an unsupported aromatic symbol, a missing atom
            class number, or a missing / unexpected ending bracket
    */
   private Atom parseAtom()
   {
      char theChar = nextChar();
      boolean inBrackets = ('[' == theChar);
      if (inBrackets)
      {
         theChar = nextChar();
      }

      // Isotope (optional leading digits)
      StringBuilder isotopeDigits = new StringBuilder();
      while (! mEND
             && Character.isDigit(theChar))
      {
         isotopeDigits.append(theChar);
         theChar = nextChar();
      }

      String symbol = "";
      boolean aromatic = false;

      if (Character.isLowerCase(theChar))
      {
         // Lowercase symbols indicate aromatic atoms

         if ("bcnospat".indexOf(theChar) < 0) // s for S or Se; a for As, t for Te
         {
            throw new SMILES_ParseException("Unexpected aromatic symbol '" + theChar + "' at position " + (mCurrentIndex + 1) + " of " + StringUtil.singleQuote(mSMILES) + "!");
         }

         symbol += Character.toUpperCase(theChar);
         aromatic = true;
      }
      else
      {
         symbol += theChar;
      }

      theChar = peek();
      if (inBrackets || ! aromatic)
      {
         // Two char element symbols are possible.
         // The next letter could be the second letter of an element symbol or it could
         // start the next atom. Outside of brackets, a letter that is itself an aromatic
         // atom always starts the next atom; otherwise 'Sc1ccccc1' or 'Cn1cccc1' would be
         // read as scandium / copernicium.
         // Relies on Element.valueOf() returning null for an unrecognized symbol.
         if (Character.isLetter(theChar)
             && (inBrackets || UNBRACKETED_AROMATIC_SYMBOLS.indexOf(theChar) < 0)
             && Element.valueOf(symbol + theChar) != null)
         {
            symbol += theChar;
            nextChar(); // Consume the second symbol letter
            theChar = peek();
         }
      }

      Element element = Element.valueOf(symbol);

      if (isotopeDigits.length() > 0)
      {
         element = Isotope.valueOf(element, Integer.parseInt(isotopeDigits.toString()));
      }

      Atom atom = new Atom(element);
      if (aromatic)
      {
         atom.setIsAromatic(true);
      }

      // Chirality (optional): '@' is anticlockwise, '@@' is clockwise
      if ('@' == theChar)
      {
         Atom.ChiralityOrder chiralityOrder = Atom.ChiralityOrder.anticlockwise;
         nextChar(); // Consume the character

         theChar = peek();
         if ('@' == theChar)
         {
            chiralityOrder = Atom.ChiralityOrder.clockwise;
            nextChar(); // Consume the character
            theChar = peek();
         }

         atom.setChiralityOrder(chiralityOrder);
      }

      // Explicit hydrogen count (optional): 'H' alone means 1; a single digit may follow
      if ('H' == theChar)
      {
         int hCount = 1;
         nextChar(); // Consume the character
         theChar = peek();
         if (Character.isDigit(theChar))
         {
            hCount = Integer.parseInt(theChar + "");
            nextChar(); // Consume the character
            theChar = peek();
         }

         atom.setHCount(hCount);
      }
      else if (inBrackets)
      {
         atom.setHCount(0);
      }

      if (inBrackets)
      {
         // Charge (optional)
         if ('+' == theChar
             || '-' == theChar)
         {
            atom.setCharge(parseCharge(theChar));
            theChar = peek();
         }

         // Atom class (optional)
         if (':' == theChar)
         {
            nextChar(); // Consume the character
            theChar = peek();

            StringBuilder classDigits = new StringBuilder();
            while (Character.isDigit(theChar))
            {
               classDigits.append(theChar);
               nextChar(); // Consume the character
               theChar = peek();
            }

            if (0 == classDigits.length())
            {
               // Report a parse problem instead of letting parseInt("") throw a NumberFormatException
               throw new SMILES_ParseException("Missing atom class number after ':' at position " + (mCurrentIndex + 1) + " of " + StringUtil.singleQuote(mSMILES) + "!");
            }

            atom.setAtomClass(Integer.parseInt(classDigits.toString()));
         }
      }

      if (']' == theChar)
      {
         if (inBrackets)
         {
            // We were expecting this ending bracket
            nextChar(); // Consume it
         }
         else
         {
            throw new SMILES_ParseException("Unexpected ending atom bracket at position " + (mCurrentIndex + 1) + "!");
         }
      }
      else if (inBrackets)
      {
         throw new SMILES_ParseException("Missing ending atom bracket at position " + (mCurrentIndex + 1) + "!");
      }

      return atom;
   }

   //---------------------------------------------------------------------------
   /**
    Parses a bracket-atom charge whose sign character is at the cursor.
    Accepted forms are a bare sign (+ / -), a doubled sign (++ / --),
    and a sign followed by digits (+2 / -3). On return the cursor is
    positioned just past the charge.

    @param inSign the sign character at the cursor: '+' or '-'
    @return the signed charge value
    */
   private int parseCharge(char inSign)
   {
      int unit = ('+' == inSign ? 1 : -1);
      int charge = unit;

      nextChar(); // Consume the sign
      char theChar = peek();

      if (inSign == theChar) // ++ or --
      {
         charge = 2 * unit;
         nextChar(); // Consume the second sign
      }
      else
      {
         StringBuilder chargeDigits = new StringBuilder();
         while (Character.isDigit(theChar))
         {
            chargeDigits.append(theChar);
            nextChar(); // Consume the character
            theChar = peek();
         }

         if (chargeDigits.length() > 0)
         {
            charge = unit * Integer.parseInt(chargeDigits.toString());
         }
      }

      return charge;
   }

   //---------------------------------------------------------------------------
   /**
    Parses a parenthesized branch whose opening '(' is at the cursor.

    @param inPrevAtom the atom that the branch hangs off of
    @return the atoms created inside the branch
    @throws SMILES_ParseException if the cursor is not at a '(' or the branch is never closed
    */
   private List<Atom> parseBranch(Atom inPrevAtom)
   {
      char theChar = nextChar();
      if ('(' != theChar)
      {
         throw new SMILES_ParseException("SMILES branch missing starting parenthesis at position " + (mCurrentIndex + 1) + "!");
      }

      return parseSection(inPrevAtom, ')');
   }
}