001package com.hfg.util; 002 003import java.util.ArrayList; 004import java.util.List; 005 006//------------------------------------------------------------------------------ 007/** 008 Baeza-Yates, Perleberg string matcher. 009 <div> 010 @author J. Alex Taylor, hairyfatguy.com 011 </div> 012 */ 013//------------------------------------------------------------------------------ 014// com.hfg XML/HTML Coding Library 015// 016// This library is free software; you can redistribute it and/or 017// modify it under the terms of the GNU Lesser General Public 018// License as published by the Free Software Foundation; either 019// version 2.1 of the License, or (at your option) any later version. 020// 021// This library is distributed in the hope that it will be useful, 022// but WITHOUT ANY WARRANTY; without even the implied warranty of 023// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 024// Lesser General Public License for more details. 025// 026// You should have received a copy of the GNU Lesser General Public 027// License along with this library; if not, write to the Free Software 028// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 029// 030// J. Alex Taylor, President, Founder, CEO, COO, CFO, OOPS hairyfatguy.com 031// jataylor@hairyfatguy.com 032//------------------------------------------------------------------------------ 033 034public class BYPStringMatcher 035{ 036 public BYPStringPattern mPattern; 037 public String mTargetString; 038 039 private List<BYPStringMatch> mMatches; 040 041 private int[] mMismatchCount = new int[ALPHABET_SIZE]; 042 private List<IndexNode> mOffsetList = new ArrayList<>(ALPHABET_SIZE); 043 044 private static final int ALPHABET_SIZE = 256; 045 private static final int MOD256 = 0xff; 046 047 private static final boolean STOP_AFTER_FIRST_MATCH = true; 048 private static final boolean FIND_ALL_MATCHES = false; 049 050 //########################################################################### 051 // CONSTRUCTORS 052 //########################################################################### 053 054 //--------------------------------------------------------------------------- 055 private BYPStringMatcher() 056 { 057 058 } 059 060 //--------------------------------------------------------------------------- 061 BYPStringMatcher(BYPStringPattern inPattern, String inTargetString) 062 { 063 mPattern = inPattern; 064 mTargetString = inTargetString; 065 066 if (null == mTargetString 067 || 0 == mTargetString.length()) 068 { 069 throw new RuntimeException("No string was specified for searching!"); 070 } 071 else if (mTargetString.length() < mPattern.length()) 072 { 073 throw new RuntimeException("No target string cannot be shorter than the pattern!"); 074 } 075 076 setup(); 077 } 078 079 //########################################################################### 080 // PUBLIC METHODS 081 //########################################################################### 082 083 //--------------------------------------------------------------------------- 084 public boolean matches() 085 { 086 innerFind(0, STOP_AFTER_FIRST_MATCH, mPattern.length()); 087 return mMatches != null && start() == 0 && end() == mTargetString.length(); 088 } 089 090 //--------------------------------------------------------------------------- 091 public boolean find() 092 { 093 return find(0); 094 } 095 096 //--------------------------------------------------------------------------- 097 public boolean find(int inStartIndex) 098 { 099 innerFind(inStartIndex, STOP_AFTER_FIRST_MATCH); 100 return mMatches != null; 101 } 102 103 //--------------------------------------------------------------------------- 104 public List<BYPStringMatch> findAll() 105 { 106 innerFind(0, FIND_ALL_MATCHES); 107 return mMatches; 108 } 109 110 //--------------------------------------------------------------------------- 111 public String group() 112 { 113 return mMatches.get(0).getString(); 114 } 115 116 //--------------------------------------------------------------------------- 117 public int start() 118 { 119 return mMatches.get(0).start(); 120 } 121 122 //--------------------------------------------------------------------------- 123 public int end() 124 { 125 return mMatches.get(0).end(); 126 } 127 128 //--------------------------------------------------------------------------- 129 public int mismatches() 130 { 131 return mMatches.get(0).getNumMismatches(); 132 } 133 134 //########################################################################### 135 // PRIVATE METHODS 136 //########################################################################### 137 138 //--------------------------------------------------------------------------- 139 private void setup() 140 { 141 if (mPattern.isCaseInsensitive()) 142 { 143 mTargetString = mTargetString.toUpperCase(); 144 } 145 146 int patternLength = mPattern.length(); 147 148 String patternString = mPattern.getPatternString(); 149 if (mPattern.isCaseInsensitive()) 150 { 151 patternString = patternString.toUpperCase(); 152 } 153 154 for (int i = 0; i < ALPHABET_SIZE; i++) 155 { 156 mOffsetList.add(new BYPStringMatcher(). new IndexNode()); 157 mMismatchCount[i] = patternLength; 158 } 159 160 for (int i = 0, j = 128; i < patternLength; i++) 161 { 162 mMismatchCount[i] = ALPHABET_SIZE; 163 164 char patternChar = patternString.charAt(i); 165 IndexNode indexNode = mOffsetList.get((int)patternChar); 166 if (indexNode.getOffset() == -1) 167 { 168 indexNode.setOffset(patternLength - i - 1); 169 } 170 else 171 { 172 short nextIndex = indexNode.nextIndex(); 173 indexNode.setNextIndex((short)j++); 174 indexNode = mOffsetList.get(indexNode.nextIndex()); 175 indexNode.setOffset(patternLength - i - 1); 176 indexNode.setNextIndex(nextIndex); 177 } 178 } 179 180 mMismatchCount[patternLength - 1] = patternLength; 181 } 182 183 //--------------------------------------------------------------------------- 184 private void innerFind(int inStartIndex, boolean inStopAfterFirstMatch) 185 { 186 innerFind(inStartIndex, inStopAfterFirstMatch, mTargetString.length()); 187 } 188 189 //--------------------------------------------------------------------------- 190 private void innerFind(int inStartIndex, boolean inStopAfterFirstMatch, int inSearchLength) 191 { 192 mMatches = null; 193 194 int patternLength = mPattern.length(); 195 int maxMismatches = mPattern.getMaxMismatches(); 196 197 for (int i = inStartIndex; i < inSearchLength; i++) 198 { 199 IndexNode indexNode = mOffsetList.get((int)mTargetString.charAt(i)); 200 int offset; 201 if ((offset = indexNode.getOffset()) >= 0) 202 { 203 mMismatchCount[(i + offset)&MOD256]--; 204 if (indexNode.nextIndex() >= 0) 205 { 206 for (indexNode = mOffsetList.get(indexNode.nextIndex()); indexNode != null; indexNode = mOffsetList.get(indexNode.nextIndex())) 207 { 208 mMismatchCount[(i + indexNode.getOffset()) & MOD256]--; 209 if (indexNode.nextIndex() < 0) 210 { 211 break; 212 } 213 } 214 } 215 } 216 217 if (mMismatchCount[i&MOD256] <= maxMismatches) 218 { 219 int start = i - patternLength + 1; 220 if (start >= inStartIndex) 221 { 222// System.out.println(String.format("Match in position %d with %d mismatches", i-patternLength+1, mMismatchCount[i&MOD256])); 223 if (null == mMatches) 224 { 225 mMatches = new ArrayList<>(1); 226 } 227 228 mMatches.add(new BYPStringMatch(mTargetString.substring(start, i + 1), start, i + 1).setNumMismatches(mMismatchCount[i & MOD256])); 229 230 if (inStopAfterFirstMatch) 231 { 232 break; 233 } 234 } 235 } 236 237 mMismatchCount[i&MOD256] = patternLength; 238 } 239 } 240 241 //########################################################################### 242 // INNER CLASS 243 //########################################################################### 244 245 private class IndexNode 246 { 247 int mOffset = -1; 248 short mNext = -1; 249 250 251 public int getOffset() 252 { 253 return mOffset; 254 } 255 public void setOffset(int inValue) 256 { 257 mOffset = inValue; 258 } 259 260 public short nextIndex() 261 { 262 return mNext; 263 } 264 265 public void setNextIndex(short inValue) 266 { 267 mNext = inValue; 268 } 269 270 public String toString() 271 { 272 return "Offset: " + mOffset + "; Next: " + mNext; 273 } 274 } 275}