package com.hfg.chem.format;


import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.hfg.chem.Atom;
import com.hfg.chem.CovalentBond;
import com.hfg.chem.Element;
import com.hfg.chem.Molecule;
import com.hfg.bio.seq.format.SeqIOException;
import com.hfg.chem.ValenceModel;
import com.hfg.util.StringBuilderPlus;
import com.hfg.util.StringUtil;

//------------------------------------------------------------------------------
/**
 Basic implementation of the MDL SDF format.
 <p>
 A record consists of an optional molfile section (header lines, counts line,
 atom block, bond block and property lines terminated by an <code>M END</code> line)
 optionally followed by named attributes. Only the parts of the molfile that
 this class parses are interpreted: the molecule name, the atom and bond counts,
 atom coordinates / elements / charges, bond orders and <code>CHG</code> property lines.
 </p>
 <p>
 Parsing state is held in instance fields, which is why {@link #readRecord(BufferedReader)}
 is synchronized.
 </p>
 <div>
 @author J. Alex Taylor, hairyfatguy.com
 </div>
 */
//------------------------------------------------------------------------------
// com.hfg Library
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
// jataylor@hairyfatguy.com
//------------------------------------------------------------------------------

public class MDL_SDF<T extends Molecule> extends ReadableChemFormatBase<T>
{
   private static final Pattern ATTRIBUTE_HEADER_PATTERN = Pattern.compile(">\\s+<(\\S+)>");

   // Compiled once instead of on every line that is read.
   private static final Pattern MOL_END_PATTERN         = Pattern.compile("M\\s+END");
   private static final Pattern CHARGE_PROPERTY_PATTERN = Pattern.compile("M\\s+CHG");
   private static final Pattern WHITESPACE_PATTERN      = Pattern.compile("\\s+");

   // Number of header + counts lines that precede the atom block.
   private static final int NUM_LINES_BEFORE_ATOM_BLOCK = 4;

   // Expected length of the counts line: eleven 3-character fields plus one 6-character field.
   private static final int COUNTS_LINE_LENGTH = 39;

   // Width of each of the leading fixed-width integer fields on the counts line and bond lines.
   private static final int INT_FIELD_WIDTH = 3;

   // TODO: Add a way to specify the valence model or whether the MDL file uses the pre or post 2017 valence models
   private ValenceModel mValenceModel = ValenceModel.MDL_2017;

   // Per-record parsing state. Reset at the start of every readRecord() call.
   private final List<String> mMolLines = new ArrayList<>(50);
   private Integer mAtomCount;
   private Integer mBondCount;


   //###########################################################################
   // CONSTRUCTORS
   //###########################################################################

   //---------------------------------------------------------------------------
   /**
    Creates a format object without a molecule factory.
    {@link #readRecord(BufferedReader)} will fail until a factory is available.
    */
   public MDL_SDF()
   {
      super(null);
   }

   //---------------------------------------------------------------------------
   /**
    Creates a format object that builds its molecules with the specified factory.
    @param inMoleculeFactory the factory used to instantiate each molecule that is read
    */
   public MDL_SDF(MoleculeFactory<T> inMoleculeFactory)
   {
      super(inMoleculeFactory);
   }

   //###########################################################################
   // PUBLIC METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   @Override
   public boolean hasJanusDelimiter()
   {
      return false;
   }

   //---------------------------------------------------------------------------
   /**
    Reports whether the specified line is the SDF record delimiter.
    @param inLine the line to test
    @return true if the trimmed line is exactly <code>$$$$</code>
    */
   @Override
   public boolean isEndOfRecord(String inLine)
   {
      return inLine.trim().equals("$$$$");
   }

   //---------------------------------------------------------------------------
   /**
    Reads a single record from the specified reader.
    <p>
    Lines are collected as molfile lines until either an <code>M END</code> line or
    the first attribute header is encountered, at which point the structure is parsed.
    Subsequent lines are interpreted as attributes: a header line names the attribute,
    the following non-blank lines (joined with newlines) form its value, and a blank
    line, the next header, the record delimiter, or the end of the input terminates it.
    An attribute with no value lines is stored with a null value.
    </p>
    @param inReader the source of the record's lines
    @return the molecule created by the molecule factory and populated from the record
    @throws ChemIOException if no molecule factory has been specified or if anything
            goes wrong while reading or parsing (other exceptions are wrapped)
    */
   @Override
   public synchronized T readRecord(BufferedReader inReader) throws ChemIOException
   {
      if (null == getMoleculeFactory())
      {
         throw new ChemIOException("No Molecule factory has been specified!");
      }

      T mol;
      try
      {
         mol = getMoleculeFactory().createMoleculeObj();

         mMolLines.clear();
         mAtomCount = null;
         mBondCount = null;

         StringBuilderPlus currentAttributeValue = new StringBuilderPlus().setDelimiter("\n");
         String currentAttributeName = null;
         boolean structureComplete = false;

         String line;
         while ((line = inReader.readLine()) != null)
         {
            if (isEndOfRecord(line))
            {
               // Never read past the record delimiter into a following record.
               break;
            }

            if (! structureComplete)
            {
               // There might not be a molfile section
               if (ATTRIBUTE_HEADER_PATTERN.matcher(line).matches())
               {
                  structureComplete = true;

                  if (mMolLines.size() > 0)
                  {
                     setStructure(mol, mMolLines);
                  }
                  // Fall through so that this header line is handled as an attribute below.
               }
               else
               {
                  mMolLines.add(line);

                  if (MOL_END_PATTERN.matcher(line.trim()).matches())
                  {
                     structureComplete = true;
                     setStructure(mol, mMolLines);
                  }

                  continue;
               }
            }

            // The molecule record can optionally be followed by attributes
            line = line.trim();

            if (! StringUtil.isSet(line)) // A blank line is used to separate attributes
            {
               if (StringUtil.isSet(currentAttributeName))
               {
                  storeAttribute(mol, currentAttributeName, currentAttributeValue);
                  currentAttributeName = null;
               }
            }
            else
            {
               Matcher m = ATTRIBUTE_HEADER_PATTERN.matcher(line);
               if (m.matches())
               {
                  if (currentAttributeName != null)
                  {
                     // The blank line between attributes may have been missing
                     storeAttribute(mol, currentAttributeName, currentAttributeValue);
                  }

                  currentAttributeName = m.group(1);
                  currentAttributeValue.setLength(0);
               }
               else if (currentAttributeName != null)
               {
                  currentAttributeValue.delimitedAppend(line);
               }
            }
         }

         if (StringUtil.isSet(currentAttributeName))
         {
            // The record ended without a blank line after the last attribute.
            storeAttribute(mol, currentAttributeName, currentAttributeValue);
         }
      }
      catch (ChemIOException e)
      {
         throw e;
      }
      catch (Exception e)
      {
         throw new ChemIOException(e);
      }

      return mol;
   }

   //###########################################################################
   // PRIVATE METHODS
   //###########################################################################

   //---------------------------------------------------------------------------
   // Stores the accumulated attribute value on the molecule (null if no value lines were seen).
   private void storeAttribute(T inMolecule, String inName, StringBuilderPlus inValue)
   {
      inMolecule.setAttribute(inName, (inValue.length() > 0 ? inValue.toString() : null));
   }

   //---------------------------------------------------------------------------
   // Parses the collected molfile lines into the molecule and then adds explicit
   // hydrogen atoms for the implicit hydrogens reported by the valence model.
   private void setStructure(T inMolecule, List<String> inMolLines)
   {
      parseMolHeader(inMolecule, inMolLines);

      parseCountsLine(inMolecule, inMolLines);

      parseAtomsBlock(inMolecule, inMolLines);

      parseBondsBlock(inMolecule, inMolLines);

      parseProperties(inMolecule, inMolLines);

      if (inMolecule.getAtoms() != null)
      {
         // Iterate over a copy since hydrogens are added to the molecule inside the loop.
         for (Atom molAtom : new ArrayList<>(inMolecule.getAtoms()))
         {
            int implicitHCount = mValenceModel.calculateImplicitHCount(molAtom);
            for (int i = 0; i < implicitHCount; i++)
            {
               Atom hAtom = new Atom(Element.HYDROGEN);
               inMolecule.addAtom(hAtom);

               CovalentBond bond = new CovalentBond(molAtom, hAtom);
               molAtom.addBond(bond);
               hAtom.addBond(bond);
            }
         }
      }
   }

   //---------------------------------------------------------------------------
   // Reads the molecule name from the first of the three header lines.
   private void parseMolHeader(T inMolecule, List<String> inMolLines)
   {
      // The first 3 lines constitute the record header
      if (inMolLines.size() < 3)
      {
         return;
      }

      // The first line may contain the name of the molecule
      String structureStringName = inMolLines.get(0).trim();
      if (StringUtil.isSet(structureStringName)
          && ! structureStringName.equals("NO STRUCTURE"))
      {
         inMolecule.setName(structureStringName);
      }

      // Line 2 optionally contains the details of the software used to generate the record

      // Line 3 contains an optional comment
   }

   //---------------------------------------------------------------------------
   // Reads the atom and bond counts from the counts line (the 4th molfile line).
   // Throws a ChemIOException if a non-empty counts line does not have the expected length.
   private void parseCountsLine(T inMolecule, List<String> inMolLines)
   {
      // The Counts line is composed of 12 fixed-length fields - the first eleven
      // are 3 characters long, and the last is 6 characters long.
      // The first two fields are the number of atoms and bonds respectively.
      // Ex: '  9  8  0  0  0  0  0  0  0  0999 V2000'
      if (inMolLines.size() < NUM_LINES_BEFORE_ATOM_BLOCK)
      {
         return;
      }

      String countsLine = inMolLines.get(NUM_LINES_BEFORE_ATOM_BLOCK - 1);
      if (! StringUtil.isSet(countsLine))
      {
         return;
      }

      if (countsLine.length() != COUNTS_LINE_LENGTH)
      {
         throw new ChemIOException("Unexpected Counts line length for " + StringUtil.singleQuote(countsLine) + "!");
      }

      // The first field is the number of atoms
      mAtomCount = Integer.parseInt(countsLine.substring(0, INT_FIELD_WIDTH).trim());
      // The second field is the number of bonds
      mBondCount = Integer.parseInt(countsLine.substring(INT_FIELD_WIDTH, 2 * INT_FIELD_WIDTH).trim());

      // TODO: Chirality should be the 5th field (but 4th could be empty?)
   }

   //---------------------------------------------------------------------------
   // Creates one atom per atom line: coordinates, element and (if present) the charge code.
   private void parseAtomsBlock(T inMolecule, List<String> inMolLines)
   {
      // The 5th line may be the first atom line
      // Ex: '    1.9050   -0.7932    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0'
      if (null == mAtomCount
          || inMolLines.size() < NUM_LINES_BEFORE_ATOM_BLOCK + mAtomCount)
      {
         return;
      }

      for (int i = 0; i < mAtomCount; i++)
      {
         String atomLine = inMolLines.get(NUM_LINES_BEFORE_ATOM_BLOCK + i);
         String[] fields = WHITESPACE_PATTERN.split(atomLine.trim());
         if (fields.length < 4)
         {
            throw new ChemIOException("Malformed atom line " + StringUtil.singleQuote(atomLine) + "!");
         }

         Element element = Element.valueOf(fields[3]);
         Atom atom = new Atom(element)
               .setXCoordinate(Float.valueOf(fields[0]))
               .setYCoordinate(Float.valueOf(fields[1]))
               .setZCoordinate(Float.valueOf(fields[2]));

         // Field 5 is the mass difference and field 6 the charge code.
         // Tolerate atom lines where the trailing columns were omitted.
         if (fields.length > 5)
         {
            int chargeValue = Integer.parseInt(fields[5]);
            if (chargeValue != 0)
            {
               atom.setCharge(decodeAtomBlockCharge(chargeValue));
            }
         }

         inMolecule.addAtom(atom);
      }
   }

   //---------------------------------------------------------------------------
   // Translates the atom block's charge code into a formal charge.
   // Codes without a mapping here (including 4) yield a charge of 0.
   private static int decodeAtomBlockCharge(int inChargeCode)
   {
      int charge = 0;
      switch (inChargeCode)
      {
         case 1:
            charge = 3;
            break;
         case 2:
            charge = 2;
            break;
         case 3:
            charge = 1;
            break;
         // TODO: 4 ==> Doublet radical
         case 5:
            charge = -1;
            break;
         case 6:
            charge = -2;
            break;
         case 7:
            charge = -3;
            break;
      }

      return charge;
   }

   //---------------------------------------------------------------------------
   // Creates one covalent bond per bond line and attaches it to both of its atoms.
   // A bond type of 4 is recorded as an aromatic bond of order 1; types above 4
   // leave the bond order unset.
   private void parseBondsBlock(T inMolecule, List<String> inMolLines)
   {
      // Bond lines may follow the atom lines
      // Ex: '  2  1  1  0  0  0  0'
      if (null == mAtomCount
          || null == mBondCount
          || inMolLines.size() < NUM_LINES_BEFORE_ATOM_BLOCK + mAtomCount + mBondCount)
      {
         return;
      }

      List<Atom> atoms = inMolecule.getAtoms();
      int bondBlockStart = NUM_LINES_BEFORE_ATOM_BLOCK + mAtomCount;

      for (int i = 0; i < mBondCount; i++)
      {
         String bondLine = inMolLines.get(bondBlockStart + i);
         int[] bondFields = parseBondFields(bondLine);

         Atom atom1 = getAtomByNumber(atoms, bondFields[0], bondLine);
         Atom atom2 = getAtomByNumber(atoms, bondFields[1], bondLine);

         CovalentBond bond = new CovalentBond(atom1, atom2);

         int bondOrder = bondFields[2];
         if (bondOrder <= 3)
         {
            bond.setBondOrder(bondOrder);
         }
         else if (bondOrder == 4)
         {
            bond.setBondOrder(1);
            atom1.setIsAromatic(true);
            atom2.setIsAromatic(true);
            bond.setIsAromatic();
         }

         atom1.addBond(bond);
         atom2.addBond(bond);
      }
   }

   //---------------------------------------------------------------------------
   // Extracts the first atom number, second atom number and bond type from a bond line.
   // The three values occupy 3-character columns, so adjacent values touch once an atom
   // number reaches 100 (ex: ' 99100  1') and the line can't be split on whitespace.
   // The columns are therefore read by position first; whitespace splitting is only
   // used as a fallback for lines that don't follow the column layout.
   private static int[] parseBondFields(String inBondLine)
   {
      if (inBondLine.length() >= 3 * INT_FIELD_WIDTH)
      {
         String field1 = inBondLine.substring(0, INT_FIELD_WIDTH).trim();
         String field2 = inBondLine.substring(INT_FIELD_WIDTH, 2 * INT_FIELD_WIDTH).trim();
         String field3 = inBondLine.substring(2 * INT_FIELD_WIDTH, 3 * INT_FIELD_WIDTH).trim();

         if (isUnsignedInteger(field1)
             && isUnsignedInteger(field2)
             && isUnsignedInteger(field3))
         {
            return new int[] { Integer.parseInt(field1), Integer.parseInt(field2), Integer.parseInt(field3) };
         }
      }

      String[] fields = WHITESPACE_PATTERN.split(inBondLine.trim());
      if (fields.length < 3)
      {
         throw new ChemIOException("Malformed bond line " + StringUtil.singleQuote(inBondLine) + "!");
      }

      return new int[] { Integer.parseInt(fields[0]), Integer.parseInt(fields[1]), Integer.parseInt(fields[2]) };
   }

   //---------------------------------------------------------------------------
   // Returns true if the string is non-empty and composed solely of the characters 0-9.
   private static boolean isUnsignedInteger(String inValue)
   {
      if (inValue.isEmpty())
      {
         return false;
      }

      for (int i = 0; i < inValue.length(); i++)
      {
         char c = inValue.charAt(i);
         if (c < '0' || c > '9')
         {
            return false;
         }
      }

      return true;
   }

   //---------------------------------------------------------------------------
   // Looks up an atom by its 1-based number, failing with a descriptive message
   // (rather than a bare index / null pointer error) if the number is out of range.
   private static Atom getAtomByNumber(List<Atom> inAtoms, int inAtomNum, String inSourceLine)
   {
      if (null == inAtoms
          || inAtomNum < 1
          || inAtomNum > inAtoms.size())
      {
         throw new ChemIOException("Atom number " + inAtomNum + " is out of range in line " + StringUtil.singleQuote(inSourceLine) + "!");
      }

      return inAtoms.get(inAtomNum - 1);
   }

   //---------------------------------------------------------------------------
   // Applies the property lines that follow the bond block. Only charge (CHG)
   // lines are currently interpreted.
   private void parseProperties(T inMolecule, List<String> inMolLines)
   {
      // Property lines may follow the Bonds block and will start with an 'M'
      // Charge  Ex: 'M  CHG  1   1   2'
      // Isotope Ex: 'M  ISO  1   1   2'
      if (null == mAtomCount
          || null == mBondCount
          || inMolLines.size() < NUM_LINES_BEFORE_ATOM_BLOCK + mAtomCount + mBondCount)
      {
         return;
      }

      for (int i = NUM_LINES_BEFORE_ATOM_BLOCK + mAtomCount + mBondCount; i < inMolLines.size(); i++)
      {
         String propertyLine = inMolLines.get(i);

         // Matched with flexible whitespace (like the 'M END' line) rather than an exact space count.
         if (CHARGE_PROPERTY_PATTERN.matcher(propertyLine).lookingAt())
         {
            // Charge
            // The 1st field specifies the number of defined charges (up to 8).
            // Ea. defined charge consists of the atom # (1-based) and a charge
            String[] fields = WHITESPACE_PATTERN.split(propertyLine);
            for (int index = 3; index < fields.length - 1; index += 2)
            {
               int atomNum = Integer.parseInt(fields[index]);
               int charge  = Integer.parseInt(fields[index + 1]);

               getAtomByNumber(inMolecule.getAtoms(), atomNum, propertyLine).setCharge(charge);
            }
         }
         // TODO: Isotope ('M  ISO') property lines are not yet interpreted.
      }
   }
}