001package com.hfg.bio.phylogeny; 002 003import java.io.*; 004import java.util.*; 005import java.util.regex.Pattern; 006import java.util.regex.Matcher; 007 008import com.hfg.util.StringUtil; 009import com.hfg.network.Edge; 010import com.hfg.util.collection.CollectionUtil; 011import com.hfg.util.collection.MatrixCell; 012import com.hfg.util.collection.SymmetricNumberMatrix; 013 014//------------------------------------------------------------------------------ 015/** 016 Simple distance matrix container. Can read Phylip-format matrices or general 017 matrices where the identifiers do not contain whitespace. 018 019 <div class='example'>Most commonly constructed via a multiple sequence alignment: 020 <pre> 021 DistanceMatrix matrix = msa.getDistanceMatrix(new UncorrectedModel()); 022 </pre> 023 </div> 024 025 <div class='example'>Phylip-formatted matrix example: 026 <pre> 027 String testMatrix = 028 " 14\n" + 029 "Mouse \n" + 030 "Bovine 1.7043\n" + 031 "Lemur 2.0235 1.1901\n" + 032 "Tarsier 2.1378 1.3287 1.2905\n" + 033 "Squir Monk 1.5232 1.2423 1.3199 1.7878\n" + 034 "Jpn Macaq 1.8261 1.2508 1.3887 1.3137 1.0642\n" + 035 "Rhesus Mac 1.9182 1.2536 1.4658 1.3788 1.1124 0.1022\n" + 036 "Crab-E.Mac 2.0039 1.3066 1.4826 1.3826 0.9832 0.2061 0.2681\n"; 037 038 DistanceMatrix matrix = new DistanceMatrix(testMatrix); 039 </pre> 040 </div> 041 042 <div class='example'>General matrix example: 043 <pre> 044 String testMatrix = 045 "# Subunit distance matrix\n" + 046 "Alpha 0.000 1.000 2.000 3.000 3.000\n" + 047 "Beta 1.000 0.000 2.000 3.000 3.000\n" + 048 "Gamma 2.000 2.000 0.000 3.000 3.000\n" + 049 "Delta 3.000 3.000 0.000 0.000 1.000\n" + 050 "Epsilon 3.000 3.000 3.000 1.000 0.000\n\n"; 051 052 DistanceMatrix matrix = new DistanceMatrix(testMatrix); 053 </pre> 054 </div> 055 056 <div> 057 See the Phylip 058 <a href='http://evolution.genetics.washington.edu/phylip/doc/distance.html'>Distance matrix programs page</a> 059 for the description of the Phylip distance matrix format. 060 </div> 061 <div> 062 @author J. Alex Taylor, hairyfatguy.com 063 </div> 064 */ 065//------------------------------------------------------------------------------ 066// com.hfg Library 067// 068// This library is free software; you can redistribute it and/or 069// modify it under the terms of the GNU Lesser General Public 070// License as published by the Free Software Foundation; either 071// version 2.1 of the License, or (at your option) any later version. 072// 073// This library is distributed in the hope that it will be useful, 074// but WITHOUT ANY WARRANTY; without even the implied warranty of 075// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 076// Lesser General Public License for more details. 077// 078// You should have received a copy of the GNU Lesser General Public 079// License along with this library; if not, write to the Free Software 080// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 081// 082// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 083// jataylor@hairyfatguy.com 084//------------------------------------------------------------------------------ 085 086public class DistanceMatrix implements Cloneable 087{ 088 private SymmetricNumberMatrix<String, Float> mMatrix; 089 090 private boolean mIsConsumable = true; 091 private boolean mIsConsumed; 092 093 private static final Pattern sPhylipPattern = Pattern.compile("(.{1,10})(?:\\s*(.+))?"); 094 private static final Pattern sPhylipFirstLinePattern = Pattern.compile("^\\s*\\d+\\s*[\\n\\r]"); 095 096 //########################################################################## 097 // CONSTRUCTORS 098 //########################################################################## 099 100 101 //-------------------------------------------------------------------------- 102 public DistanceMatrix() 103 { 104 this(100); 105 } 106 107 //-------------------------------------------------------------------------- 108 public DistanceMatrix(int inInitialSize) 109 { 110 mMatrix = new SymmetricNumberMatrix<>(inInitialSize); 111 } 112 113 //-------------------------------------------------------------------------- 114 public DistanceMatrix(String inMatrix) 115 { 116 this(); 117 if (StringUtil.isSet(inMatrix)) 118 { 119 try 120 { 121 BufferedReader reader = null; 122 try 123 { 124 reader = new BufferedReader(new StringReader(inMatrix)); 125 parseMatrix(reader); 126 } 127 finally 128 { 129 if (reader != null) 130 { 131 reader.close(); 132 } 133 } 134 } 135 catch (IOException e) 136 { 137 throw new RuntimeException("Problem parsing distance matrix!", e); 138 } 139 } 140 } 141 142 //-------------------------------------------------------------------------- 143 public DistanceMatrix(File inMatrix) 144 throws IOException 145 { 146 this(); 147 if (null == inMatrix) 148 { 149 throw new IOException("No file was specified!"); 150 } 151 else if (!inMatrix.exists()) 152 { 153 throw new IOException("The file " + StringUtil.singleQuote(inMatrix.getPath()) + " does not exist!"); 154 } 155 156 BufferedReader reader = null; 157 try 158 { 159 reader = new BufferedReader(new FileReader(inMatrix)); 160 parseMatrix(reader); 161 } 162 finally 163 { 164 if (reader != null) reader.close(); 165 } 166 } 167 168 //########################################################################## 169 // PUBLIC METHODS 170 //########################################################################## 171 172 //-------------------------------------------------------------------------- 173 public DistanceMatrix setIsConsumable(boolean inValue) 174 { 175 mIsConsumable = inValue; 176 return this; 177 } 178 179 //-------------------------------------------------------------------------- 180 public boolean isConsumable() 181 { 182 return mIsConsumable; 183 } 184 185 //-------------------------------------------------------------------------- 186 public DistanceMatrix setIsConsumed() 187 { 188 mIsConsumed = true; 189 return this; 190 } 191 192 //-------------------------------------------------------------------------- 193 public boolean isConsumed() 194 { 195 return mIsConsumed; 196 } 197 198 //-------------------------------------------------------------------------- 199 public void addKey(String inKey) 200 { 201 mMatrix.addKey(inKey); 202 } 203 204 //-------------------------------------------------------------------------- 205 public void removeKey(String inKey) 206 { 207 mMatrix.removeKey(inKey); 208 } 209 210 //-------------------------------------------------------------------------- 211 public void removeKeys(Set<String> inKeys) 212 { 213 mMatrix.removeKeys(inKeys); 214 } 215 216 //-------------------------------------------------------------------------- 217 /** 218 Changes the key name inOldKey to inNewKey. Can be useful in re-expanding names 219 after replacing them in order to comply with the 10-character Phylip format limitation. 220 @param inOldKey the old matrix key 221 @param inNewKey the new matrix key 222 */ 223 public void changeKey(String inOldKey, String inNewKey) 224 { 225 mMatrix.changeKey(inOldKey, inNewKey); 226 } 227 228 //-------------------------------------------------------------------------- 229 public void setDistance(String inKey1, String inKey2, Float inDistance) 230 { 231 if (! inKey1.equals(inKey2)) 232 { 233 mMatrix.put(inKey1, inKey2, inDistance); 234 } 235 } 236 237 //-------------------------------------------------------------------------- 238 public Float getDistance(String inKey1, String inKey2) 239 { 240 Float distance; 241 242 if (inKey1.equals(inKey2)) 243 { 244 distance = 0.0f; 245 } 246 else 247 { 248 distance = mMatrix.get(inKey1, inKey2); 249 } 250 251 return distance; 252 } 253 254 //-------------------------------------------------------------------------- 255 public int size() 256 { 257 return mMatrix.size(); 258 } 259 260 //-------------------------------------------------------------------------- 261 public int numKeys() 262 { 263 return mMatrix.numKeys(); 264 } 265 266 //-------------------------------------------------------------------------- 267 public Collection<String> keySet() 268 { 269 return Collections.unmodifiableCollection(mMatrix.keySet()); 270 } 271 272 //-------------------------------------------------------------------------- 273 public float getNetDivergence(String inKey) 274 { 275 return mMatrix.getSumForKey(inKey); 276 } 277 278 //-------------------------------------------------------------------------- 279 public String getNearestNeighbor(String inKey) 280 { 281 Float minValue = Float.MAX_VALUE; 282 String minKey = null; 283 for (String key2 : mMatrix.keySet()) 284 { 285 if (key2.equals(inKey)) 286 { 287 continue; 288 } 289 290 Float value = mMatrix.get(inKey, key2); 291 if (value != null 292 && value < minValue) 293 { 294 minValue = value; 295 minKey = key2; 296 } 297 } 298 299 return minKey; 300 } 301 302 //-------------------------------------------------------------------------- 303 /** 304 Returns the Edge with the shortest distance. If multiple edges are found with 305 the same distance, no guarantee is made as to which one will be returned. 306 @return the Edge with the shortest distance 307 */ 308 public Edge<String> getShortestEdge() 309 { 310 Edge<String> shortestEdge = null; 311 312 MatrixCell<String, String, Float> smallestValueCell = mMatrix.getNonIdentityCellWithSmallestValue(); 313 if (smallestValueCell != null) 314 { 315 shortestEdge = new Edge<>(smallestValueCell.getColKey(), smallestValueCell.getRowKey(), smallestValueCell.getValue()); 316 } 317 318 return shortestEdge; 319 } 320 321 322 //-------------------------------------------------------------------------- 323 /** 324 Returns the Edges for the specified sequence sorted shortest to longest. 325 @param inSeqId the id (key) of the sequence for which Edges should be retrieved 326 @return the Edges for the specified sequence sorted shortest to longest 327 */ 328 public List<Edge<String>> getSortedEdges(String inSeqId) 329 { 330 Collection<String> keySet = keySet(); 331 332 Map<String, Float> distanceMap = new HashMap<>(keySet.size()); 333 for (String key : keySet) 334 { 335 distanceMap.put(key, getDistance(inSeqId, key)); 336 } 337 338 Map<String, Float> sortedRowMap = CollectionUtil.sortMapByValue(distanceMap); 339 340 List<Edge<String>> edges = new ArrayList<>(sortedRowMap.size()); 341 for (String key : sortedRowMap.keySet()) 342 { 343 edges.add(new Edge<>(inSeqId, key, sortedRowMap.get(key))); 344 } 345 346 return edges; 347 } 348 349 //-------------------------------------------------------------------------- 350 @Override 351 public DistanceMatrix clone() 352 { 353 DistanceMatrix clone; 354 try 355 { 356 clone = (DistanceMatrix) super.clone(); 357 } 358 catch (CloneNotSupportedException e) 359 { 360 throw new RuntimeException(e); 361 } 362 363 clone.mMatrix = mMatrix.clone(); 364 365 return clone; 366 } 367 368 //************************************************************************** 369 // PRIVATE METHODS 370 //************************************************************************** 371 372 //-------------------------------------------------------------------------- 373 private void parseMatrix(BufferedReader inMatrixReader) 374 throws IOException 375 { 376 char[] preview = new char[256]; 377 378 inMatrixReader.mark(preview.length); 379 inMatrixReader.read(preview, 0, preview.length); 380 inMatrixReader.reset(); 381 382 // In Phylip format, the first line should be the number of entries. 383 if (sPhylipFirstLinePattern.matcher(new String(preview)).find()) 384 { 385 parsePhylipFormatMatrix(inMatrixReader); 386 } 387 else 388 { 389 parseGeneralFormatMatrix(inMatrixReader); 390 } 391 } 392 393 //-------------------------------------------------------------------------- 394 private void parsePhylipFormatMatrix(BufferedReader inMatrixReader) 395 throws IOException 396 { 397 List<String> keys = new ArrayList<>(); 398 String newKey = null; 399 int distancesParsed = 0; 400 401 int numEntries = 0; 402 int lineCount = 1; 403 boolean needMoreValues = false; 404 405 String line; 406 while ((line = inMatrixReader.readLine()) != null) 407 { 408 // The first line should be the number of entries. 409 if (numEntries == 0 410 && line.matches("\\s*\\d+\\s*")) 411 { 412 numEntries = Integer.parseInt(line.trim()); 413 } 414 else if (needMoreValues) 415 { 416 String[] distances = line.trim().split("\\s+"); 417 for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++) 418 { 419 setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i])); 420 } 421 422 if (distancesParsed + distances.length < keys.size() - 1) 423 { 424 distancesParsed += distances.length; 425 needMoreValues = true; 426 } 427 else 428 { 429 needMoreValues = false; 430 } 431 } 432 else if (line.matches("\\s*((?:\\s*[\\-\\d\\.]+)*)")) 433 { 434 // Continued full-table values we don't need. 435 } 436 else 437 { 438 Matcher m = sPhylipPattern.matcher(line); 439 if (! m.matches()) 440 { 441 throw new RuntimeException("Unexpected format of matrix file line " + lineCount + ": '" + line + "'"); 442 } 443 444 newKey = m.group(1).trim(); 445 keys.add(newKey); 446 mMatrix.put(newKey, newKey, null); 447 448 if (m.group(2) != null) 449 { 450 String[] distances = m.group(2).split("\\s+"); 451 int i; 452 for (i = 0; i < distances.length && i < keys.size(); i++) 453 { 454 setDistance(newKey, keys.get(i), Float.parseFloat(distances[i])); 455 } 456 457 if (i < keys.size() - 1) 458 { 459 // Line must be wrapped 460 distancesParsed = distances.length; 461 needMoreValues = true; 462 } 463 } 464 } 465 466 lineCount++; 467 } 468 } 469 470 //-------------------------------------------------------------------------- 471 private void parseGeneralFormatMatrix(BufferedReader inMatrixReader) 472 throws IOException 473 { 474 List<String> keys = new ArrayList<>(); 475 String newKey = null; 476 int distancesParsed = 0; 477 478 int lineCount = 1; 479 boolean needMoreValues = false; 480 481 String line; 482 while ((line = inMatrixReader.readLine()) != null) 483 { 484 if (! StringUtil.isSet(line) 485 || line.startsWith("//") 486 || line.startsWith("#")) 487 { 488 continue; // Skip blank lines or comment lines. 489 } 490 491 if (needMoreValues) 492 { 493 String[] distances = line.trim().split("\\s+"); 494 for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++) 495 { 496 setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i])); 497 } 498 499 if (distancesParsed + distances.length < keys.size() - 1) 500 { 501 distancesParsed += distances.length; 502 needMoreValues = true; 503 } 504 else 505 { 506 needMoreValues = false; 507 } 508 } 509 else if (line.matches("^\\s+.*")) 510 { 511 // Continued full-table values we don't need. 512 } 513 else 514 { 515 String[] pieces = line.split("\\s+"); 516 newKey = pieces[0]; 517 keys.add(newKey); 518 mMatrix.put(newKey, newKey, 0.0f); 519 520 int i; 521 for (i = 1; i < pieces.length && i < keys.size(); i++) 522 { 523 setDistance(newKey, keys.get(i - 1), Float.parseFloat(pieces[i])); 524 } 525 526 if (i < keys.size()) 527 { 528 // Line must be wrapped 529 distancesParsed = pieces.length - 1; 530 needMoreValues = true; 531 } 532 } 533 534 lineCount++; 535 } 536 } 537 538 //-------------------------------------------------------------------------- 539 private int getMaxKeyLength() 540 { 541 int maxLength = 0; 542 for (String key : keySet()) 543 { 544 if (key.length() > maxLength) 545 { 546 maxLength = key.length(); 547 } 548 } 549 550 return maxLength; 551 } 552}