001package com.hfg.bio.phylogeny;
002
003import java.io.*;
004import java.util.*;
005import java.util.regex.Pattern;
006import java.util.regex.Matcher;
007
008import com.hfg.util.StringUtil;
009import com.hfg.network.Edge;
010import com.hfg.util.collection.CollectionUtil;
011import com.hfg.util.collection.MatrixCell;
012import com.hfg.util.collection.SymmetricNumberMatrix;
013
014//------------------------------------------------------------------------------
015/**
016 Simple distance matrix container. Can read Phylip-format matrices or general
017 matrices where the identifiers do not contain whitespace.
018
019 <div class='example'>Most commonly constructed via a multiple sequence alignment:
020   <pre>
021    DistanceMatrix matrix = msa.getDistanceMatrix(new UncorrectedModel());
022   </pre>
023 </div>
024
025 <div class='example'>Phylip-formatted matrix example:
026   <pre>
027    String testMatrix =
028       "   14\n" +
029       "Mouse     \n" +
030       "Bovine      1.7043\n" +
031       "Lemur       2.0235  1.1901\n" +
032       "Tarsier     2.1378  1.3287  1.2905\n" +
033       "Squir Monk  1.5232  1.2423  1.3199  1.7878\n" +
034       "Jpn Macaq   1.8261  1.2508  1.3887  1.3137  1.0642\n" +
035       "Rhesus Mac  1.9182  1.2536  1.4658  1.3788  1.1124  0.1022\n" +
036       "Crab-E.Mac  2.0039  1.3066  1.4826  1.3826  0.9832  0.2061  0.2681\n";
037
038    DistanceMatrix matrix = new DistanceMatrix(testMatrix);
039   </pre>
040 </div>
041
042 <div class='example'>General matrix example:
043   <pre>
044    String testMatrix =
045             "# Subunit distance matrix\n" +
046             "Alpha      0.000 1.000 2.000 3.000 3.000\n" +
047             "Beta       1.000 0.000 2.000 3.000 3.000\n" +
048             "Gamma      2.000 2.000 0.000 3.000 3.000\n" +
049             "Delta      3.000 3.000 0.000 0.000 1.000\n" +
050             "Epsilon    3.000 3.000 3.000 1.000 0.000\n\n";
051
052    DistanceMatrix matrix = new DistanceMatrix(testMatrix);
053   </pre>
054 </div>
055
056 <div>
057 See the Phylip
058 <a href='http://evolution.genetics.washington.edu/phylip/doc/distance.html'>Distance matrix programs page</a>
059 for the description of the Phylip distance matrix format.
060 </div>
061 <div>
062  @author J. Alex Taylor, hairyfatguy.com
063 </div>
064 */
065//------------------------------------------------------------------------------
066// com.hfg Library
067//
068// This library is free software; you can redistribute it and/or
069// modify it under the terms of the GNU Lesser General Public
070// License as published by the Free Software Foundation; either
071// version 2.1 of the License, or (at your option) any later version.
072//
073// This library is distributed in the hope that it will be useful,
074// but WITHOUT ANY WARRANTY; without even the implied warranty of
075// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
076// Lesser General Public License for more details.
077//
078// You should have received a copy of the GNU Lesser General Public
079// License along with this library; if not, write to the Free Software
080// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
081//
082// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com
083// jataylor@hairyfatguy.com
084//------------------------------------------------------------------------------
085
086public class DistanceMatrix implements Cloneable
087{
088   private SymmetricNumberMatrix<String, Float> mMatrix;
089
090   private boolean mIsConsumable = true;
091   private boolean mIsConsumed;
092
093   private static final Pattern sPhylipPattern  = Pattern.compile("(.{1,10})(?:\\s*(.+))?");
094   private static final Pattern sPhylipFirstLinePattern  = Pattern.compile("^\\s*\\d+\\s*[\\n\\r]");
095
096   //##########################################################################
097   // CONSTRUCTORS
098   //##########################################################################
099
100
101   //--------------------------------------------------------------------------
102   public DistanceMatrix()
103   {
104      this(100);
105   }
106
107   //--------------------------------------------------------------------------
108   public DistanceMatrix(int inInitialSize)
109   {
110      mMatrix = new SymmetricNumberMatrix<>(inInitialSize);
111   }
112
113   //--------------------------------------------------------------------------
114   public DistanceMatrix(String inMatrix)
115   {
116      this();
117      if (StringUtil.isSet(inMatrix))
118      {
119         try
120         {
121            BufferedReader reader = null;
122            try
123            {
124               reader = new BufferedReader(new StringReader(inMatrix));
125               parseMatrix(reader);
126            }
127            finally
128            {
129               if (reader != null)
130               {
131                  reader.close();
132               }
133            }
134         }
135         catch (IOException e)
136         {
137            throw new RuntimeException("Problem parsing distance matrix!", e);
138         }
139      }
140   }
141
142   //--------------------------------------------------------------------------
143   public DistanceMatrix(File inMatrix)
144      throws IOException
145   {
146      this();
147      if (null == inMatrix)
148      {
149         throw new IOException("No file was specified!");
150      }
151      else if (!inMatrix.exists())
152      {
153         throw new IOException("The file " + StringUtil.singleQuote(inMatrix.getPath()) + " does not exist!");
154      }
155
156      BufferedReader reader = null;
157      try
158      {
159         reader = new BufferedReader(new FileReader(inMatrix));
160         parseMatrix(reader);
161      }
162      finally
163      {
164         if (reader != null) reader.close();
165      }
166   }
167
168   //##########################################################################
169   // PUBLIC METHODS
170   //##########################################################################
171
172   //--------------------------------------------------------------------------
173   public DistanceMatrix setIsConsumable(boolean inValue)
174   {
175      mIsConsumable = inValue;
176      return this;
177   }
178
179   //--------------------------------------------------------------------------
180   public boolean isConsumable()
181   {
182      return mIsConsumable;
183   }
184
185   //--------------------------------------------------------------------------
186   public DistanceMatrix setIsConsumed()
187   {
188      mIsConsumed = true;
189      return this;
190   }
191
192   //--------------------------------------------------------------------------
193   public boolean isConsumed()
194   {
195      return mIsConsumed;
196   }
197
198   //--------------------------------------------------------------------------
199   public void addKey(String inKey)
200   {
201      mMatrix.addKey(inKey);
202   }
203
204   //--------------------------------------------------------------------------
205   public void removeKey(String inKey)
206   {
207      mMatrix.removeKey(inKey);
208   }
209
210   //--------------------------------------------------------------------------
211   public void removeKeys(Set<String> inKeys)
212   {
213      mMatrix.removeKeys(inKeys);
214   }
215
216   //--------------------------------------------------------------------------
217   /**
218    Changes the key name inOldKey to inNewKey. Can be useful in re-expanding names
219    after replacing them in order to comply with the 10-character Phylip format limitation.
220    @param inOldKey the old matrix key
221    @param inNewKey the new matrix key
222    */
223   public void changeKey(String inOldKey, String inNewKey)
224   {
225      mMatrix.changeKey(inOldKey, inNewKey);
226   }
227
228   //--------------------------------------------------------------------------
229   public void setDistance(String inKey1, String inKey2, Float inDistance)
230   {
231      if (! inKey1.equals(inKey2))
232      {
233         mMatrix.put(inKey1, inKey2, inDistance);
234      }
235   }
236
237   //--------------------------------------------------------------------------
238   public Float getDistance(String inKey1, String inKey2)
239   {
240      Float distance;
241
242      if (inKey1.equals(inKey2))
243      {
244         distance = 0.0f;
245      }
246      else
247      {
248         distance = mMatrix.get(inKey1, inKey2);
249      }
250
251      return distance;
252   }
253
254   //--------------------------------------------------------------------------
255   public int size()
256   {
257      return mMatrix.size();
258   }
259
260   //--------------------------------------------------------------------------
261   public int numKeys()
262   {
263      return mMatrix.numKeys();
264   }
265
266   //--------------------------------------------------------------------------
267   public Collection<String> keySet()
268   {
269      return Collections.unmodifiableCollection(mMatrix.keySet());
270   }
271
272   //--------------------------------------------------------------------------
273   public float getNetDivergence(String inKey)
274   {
275      return mMatrix.getSumForKey(inKey);
276   }
277
278   //--------------------------------------------------------------------------
279   public String getNearestNeighbor(String inKey)
280   {
281      Float minValue = Float.MAX_VALUE;
282      String minKey = null;
283      for (String key2 : mMatrix.keySet())
284      {
285         if (key2.equals(inKey))
286         {
287            continue;
288         }
289
290         Float value = mMatrix.get(inKey, key2);
291         if (value != null
292             && value < minValue)
293         {
294            minValue = value;
295            minKey = key2;
296         }
297      }
298
299      return minKey;
300   }
301
302   //--------------------------------------------------------------------------
303   /**
304    Returns the Edge with the shortest distance. If multiple edges are found with
305    the same distance, no guarantee is made as to which one will be returned.
306    @return the Edge with the shortest distance
307    */
308   public Edge<String> getShortestEdge()
309   {
310      Edge<String> shortestEdge = null;
311
312      MatrixCell<String, String, Float> smallestValueCell = mMatrix.getNonIdentityCellWithSmallestValue();
313      if (smallestValueCell != null)
314      {
315         shortestEdge = new Edge<>(smallestValueCell.getColKey(), smallestValueCell.getRowKey(), smallestValueCell.getValue());
316      }
317
318      return shortestEdge;
319   }
320
321
322   //--------------------------------------------------------------------------
323   /**
324    Returns the Edges for the specified sequence sorted shortest to longest.
325    @param inSeqId the id (key) of the sequence for which Edges should be retrieved
326    @return the Edges for the specified sequence sorted shortest to longest
327    */
328   public List<Edge<String>> getSortedEdges(String inSeqId)
329   {
330      Collection<String> keySet = keySet();
331
332      Map<String, Float> distanceMap = new HashMap<>(keySet.size());
333      for (String key : keySet)
334      {
335         distanceMap.put(key, getDistance(inSeqId, key));
336      }
337
338      Map<String, Float> sortedRowMap = CollectionUtil.sortMapByValue(distanceMap);
339
340      List<Edge<String>> edges = new ArrayList<>(sortedRowMap.size());
341      for (String key : sortedRowMap.keySet())
342      {
343         edges.add(new Edge<>(inSeqId, key, sortedRowMap.get(key)));
344      }
345
346      return edges;
347   }
348
349   //--------------------------------------------------------------------------
350   @Override
351   public DistanceMatrix clone()
352   {
353      DistanceMatrix clone;
354      try
355      {
356         clone = (DistanceMatrix) super.clone();
357      }
358      catch (CloneNotSupportedException e)
359      {
360         throw new RuntimeException(e);
361      }
362
363      clone.mMatrix = mMatrix.clone();
364
365      return clone;
366   }
367
368   //**************************************************************************
369   // PRIVATE METHODS
370   //**************************************************************************
371
372   //--------------------------------------------------------------------------
373   private void parseMatrix(BufferedReader inMatrixReader)
374      throws IOException
375   {
376      char[] preview = new char[256];
377
378      inMatrixReader.mark(preview.length);
379      inMatrixReader.read(preview, 0, preview.length);
380      inMatrixReader.reset();
381
382      // In Phylip format, the first line should be the number of entries.
383      if (sPhylipFirstLinePattern.matcher(new String(preview)).find())
384      {
385         parsePhylipFormatMatrix(inMatrixReader);
386      }
387      else
388      {
389         parseGeneralFormatMatrix(inMatrixReader);
390      }
391   }
392
393   //--------------------------------------------------------------------------
394   private void parsePhylipFormatMatrix(BufferedReader inMatrixReader)
395         throws IOException
396   {
397      List<String> keys = new ArrayList<>();
398      String newKey = null;
399      int distancesParsed = 0;
400
401      int numEntries = 0;
402      int lineCount = 1;
403      boolean needMoreValues = false;
404
405      String line;
406      while ((line = inMatrixReader.readLine()) != null)
407      {
408         // The first line should be the number of entries.
409         if (numEntries == 0
410             && line.matches("\\s*\\d+\\s*"))
411         {
412            numEntries = Integer.parseInt(line.trim());
413         }
414         else if (needMoreValues)
415         {
416            String[] distances = line.trim().split("\\s+");
417            for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++)
418            {
419               setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i]));
420            }
421
422            if (distancesParsed + distances.length < keys.size() - 1)
423            {
424               distancesParsed += distances.length;
425               needMoreValues = true;
426            }
427            else
428            {
429               needMoreValues = false;
430            }
431         }
432         else if (line.matches("\\s*((?:\\s*[\\-\\d\\.]+)*)"))
433         {
434            // Continued full-table values we don't need.
435         }
436         else
437         {
438            Matcher m = sPhylipPattern.matcher(line);
439            if (! m.matches())
440            {
441               throw new RuntimeException("Unexpected format of matrix file line " + lineCount + ": '" + line + "'");
442            }
443
444            newKey = m.group(1).trim();
445            keys.add(newKey);
446            mMatrix.put(newKey, newKey, null);
447
448            if (m.group(2) != null)
449            {
450               String[] distances = m.group(2).split("\\s+");
451               int i;
452               for (i = 0; i < distances.length && i < keys.size(); i++)
453               {
454                  setDistance(newKey, keys.get(i), Float.parseFloat(distances[i]));
455               }
456
457               if (i < keys.size() - 1)
458               {
459                  // Line must be wrapped
460                  distancesParsed = distances.length;
461                  needMoreValues = true;
462               }
463            }
464         }
465
466         lineCount++;
467      }
468   }
469
470   //--------------------------------------------------------------------------
471   private void parseGeneralFormatMatrix(BufferedReader inMatrixReader)
472      throws IOException
473   {
474      List<String> keys = new ArrayList<>();
475      String newKey = null;
476      int distancesParsed = 0;
477
478      int lineCount = 1;
479      boolean needMoreValues = false;
480
481      String line;
482      while ((line = inMatrixReader.readLine()) != null)
483      {
484         if (! StringUtil.isSet(line)
485             || line.startsWith("//")
486             || line.startsWith("#"))
487         {
488            continue;  // Skip blank lines or comment lines.
489         }
490
491         if (needMoreValues)
492         {
493            String[] distances = line.trim().split("\\s+");
494            for (int i = 0; i < distances.length && distancesParsed + i < keys.size(); i++)
495            {
496               setDistance(newKey, keys.get(distancesParsed + i), Float.parseFloat(distances[i]));
497            }
498
499            if (distancesParsed + distances.length < keys.size() - 1)
500            {
501               distancesParsed += distances.length;
502               needMoreValues = true;
503            }
504            else
505            {
506               needMoreValues = false;
507            }
508         }
509         else if (line.matches("^\\s+.*"))
510         {
511            // Continued full-table values we don't need.
512         }
513         else
514         {
515            String[] pieces = line.split("\\s+");
516            newKey = pieces[0];
517            keys.add(newKey);
518            mMatrix.put(newKey, newKey, 0.0f);
519
520            int i;
521            for (i = 1; i < pieces.length && i < keys.size(); i++)
522            {
523               setDistance(newKey, keys.get(i - 1), Float.parseFloat(pieces[i]));
524            }
525
526            if (i < keys.size())
527            {
528               // Line must be wrapped
529               distancesParsed = pieces.length - 1;
530               needMoreValues = true;
531            }
532         }
533
534         lineCount++;
535      }
536   }
537
538   //--------------------------------------------------------------------------
539   private int getMaxKeyLength()
540   {
541      int maxLength = 0;
542      for (String key : keySet())
543      {
544         if (key.length() > maxLength)
545         {
546            maxLength = key.length();
547         }
548      }
549
550      return maxLength;
551   }
552}