WildcharMatcher xref

View Javadoc

1   /*
2    * Copyright (c) 2003-2008 by Cosylab d. d.
3    *
4    * This file is part of Java-Common.
5    *
6    * Java-Common is free software: you can redistribute it and/or modify
7    * it under the terms of the GNU General Public License as published by
8    * the Free Software Foundation, either version 3 of the License, or
9    * (at your option) any later version.
10   *
11   * Java-Common is distributed in the hope that it will be useful,
12   * but WITHOUT ANY WARRANTY; without even the implied warranty of
13   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14   * GNU General Public License for more details.
15   *
16   * You should have received a copy of the GNU General Public License
17   * along with Java-Common.  If not, see <http://www.gnu.org/licenses/>.
18   */
19  
20  package com.cosylab.util;
21  
22  /**
23   * A Unix-like wildchar matcher. Supported wild-characters: '', '?'; sets:
24   * [a-z], '!' negation Examples: '[a-g]li?n' matches 'florian' '[!abc]e'
25   * matches 'smile' '[-z] matches 'a' Rules for sets: RegEx definition of the
26   * valid set is: [!]?(-.)?((.-.)|(.))(.-)? a-z : match any letter between 'a'
27   * and 'z' inclusively [-a : match everything up to and including 'a' (only
28   * valid at beginning) a-] : match everything from 'a' (only valid at the end)
29   * a   : match exactly 'a' !a  : not operator, match everything except 'a'
30   * (only allowed at beginning) \a  : treat a literally (useful for specifying
31   * '!]-' in sets. Note that     \t\b\n... are not processed.  Wildchar rules:
32   * : match any number (0..inf) number of occurences of any character ?     :
33   * match exactly and only one occurence of any character ab    : match exactly
34   * 'ab' [..]: same as , but character must match the set.
35   *
36   * @author <a href="mailto:ales.pucelj@cosylab.com">Ales Pucelj</a>
37   * @version $id$
38   */
39  public class WildcharMatcher
40  {
41  	private static final boolean DEBUG = false;
42  
43  	/** Value of initial state */
44  	private static final int INITIAL = 0;
45  
46  	/** Value of final state */
47  	private static final int FINAL = 2;
48  
49  	/** Value of error state */
50  	private static final int ERROR = 99;
51  
52  	/** Any character (except control, unless escaped) */
53  	private static final int TOKEN_CHAR = 0;
54  
55  	/** Token for end of set: ] */
56  	private static final int TOKEN_END = 1;
57  
58  	/** Token for negation: */
59  	private static final int TOKEN_NOT = 2;
60  
61  	/** Token for range specification: - */
62  	private static final int TOKEN_MINUS = 3;
63  
64  	/**
65  	 * Transition table holds the nextState used in set parsing. Rows define
66  	 * states, columns define tokens. transitions[1][3] = 5 means: if in state
67  	 * 1 next token is 3, goto state 5
68  	 */
69  	private static final int[][] TRANSITIONS = {
70  		{ 1, FINAL, 3, 4 },
71  		{ 1, FINAL, ERROR, 5 },
72  		{ ERROR, ERROR, ERROR, ERROR },
73  		{ 1, FINAL, ERROR, 4 },
74  		{ 6, ERROR, ERROR, ERROR },
75  		{ 6, FINAL, ERROR, ERROR },
76  		{ 1, FINAL, ERROR, ERROR }
77  	};
78  
79  	private static int getToken(final char ch)
80  	{
81  		switch (ch) {
82  		case ']':
83  			return TOKEN_END;
84  
85  		case '!':
86  			return TOKEN_NOT;
87  
88  		case '-':
89  			return TOKEN_MINUS;
90  
91  		default:
92  			return TOKEN_CHAR;
93  		}
94  	}
95  
96  	/**
97  	 * DFA for parsing set strings. DFA was obtained from JFlex using the rule
98  	 * : macro: CHAR = [^-\]\!] (everything except ], ! and - rule :
99  	 * [!]?(-{CHAR})?(({CHAR}-{CHAR})|({CHAR}))({CHAR}-)?\] Result of
100 	 * optimized NDFA is Character classes: class 0: [0-'
101 	 * ']['"'-',']['.'-'\']['^'-65535]  class 1: [']']  class 2: ['!']  class
102 	 * 3: ['-']  Transition graph (for class goto state) State 0: 0 -> 1, 1 ->
103 	 * 2, 2 -> 3, 3 -> 4 State 1: 0 -> 1, 1 -> 2, 3 -> 5 State [FINAL] State
104 	 * 3: 0 -> 1, 1 -> 2, 3 -> 4 State 4: 0 -> 6 State 5: 0 -> 6, 1 -> 2 State
105 	 * 6: 0 -> 1, 1 -> 2
106 	 *
107 	 * @param pattern DOCUMENT ME!
108 	 * @param offset DOCUMENT ME!
109 	 * @param ch DOCUMENT ME!
110 	 *
111 	 * @return DOCUMENT ME!
112 	 */
113 	public static boolean testSet(final String pattern, int offset,
114 	    final char ch)
115 	{
116 		final int n = pattern.length();
117 
118 		int state = INITIAL;
119 		int nextToken = ' ';
120 		char nextChar = ' ';
121 		char ch1 = ' ';
122 
123 		boolean found = false;
124 
125 		boolean negate = false;
126 
127 		while (!found) {
128 			// Check for offset in case of final state, which is over the limit,
129 			// if ] is at the end of the string.
130 			if (offset < n) {
131 				nextChar = pattern.charAt(offset);
132 
133 				if (nextChar == '\\') {
134 					// Any escaped sequence is two characters, otherwise error will
135 					// be throws, since this is an invalid sequence anyway
136 					nextChar = pattern.charAt(offset + 1);
137 					nextToken = TOKEN_CHAR;
138 					offset++;
139 				} else {
140 					nextToken = getToken(nextChar);
141 				}
142 			}
143 
144 			switch (state) {
145 			case INITIAL:
146 
147 				if (nextToken == TOKEN_NOT) {
148 					negate = true;
149 
150 					break;
151 				}
152 
153 			// No break, states 0, 1, 3, 6 have same next condition.
154 			case 1:
155 
156 				if (nextToken == TOKEN_END) {
157 					return true;
158 				}
159 
160 			case 3:
161 			case 6:
162 
163 				if (nextToken == TOKEN_CHAR) {
164 					found = (ch == nextChar);
165 					ch1 = nextChar;
166 				}
167 
168 				break;
169 
170 			case 4:
171 
172 				// condition [-a...
173 				found = (ch <= nextChar);
174 
175 				break;
176 
177 			case 5:
178 
179 				if (nextToken == TOKEN_CHAR) {
180 					// condition ...a-z...
181 					found = ((ch >= ch1) && (ch <= nextChar));
182 				}
183 
184 				if (nextToken == TOKEN_END) {
185 					// condition ...a-]
186 					found = (ch >= ch1);
187 				}
188 
189 				break;
190 
191 			default:}
192 
193 			if (DEBUG) {
194 				System.out.println("( " + state + " -> "
195 				    + TRANSITIONS[state][nextToken] + " ) token = " + nextToken
196 				    + " char = " + nextChar + ", found = " + found
197 				    + ", negate = " + negate);
198 			}
199 
200 			// Lookup next state in transition table and check for valid pattern			
201 			state = TRANSITIONS[state][nextToken];
202 
203 			if (state == ERROR) {
204 				return false;
205 
206 				// don't bother, this is a no match anyway
207 				// throw new RuntimeException("Invalid pattern");
208 			}
209 
210 			if (state == FINAL) {
211 				return found ^ negate;
212 			}
213 
214 			offset++;
215 		}
216 
217 		return found ^ negate;
218 	}
219 
220 	/**
221 	 * Recursive method for parsing the string. To avoid copying the strings,
222 	 * the method accepts offset indices into both parameters.
223 	 *
224 	 * @param pattern Pattern used in parsing
225 	 * @param ofp Offset into pattern string (ofp > 0)
226 	 * @param str String to test
227 	 * @param ofs Offset into test string (ofs > 0);
228 	 *
229 	 * @return boolean Do the strings match
230 	 */
231 	public static boolean parse(final String pattern, final int ofp,
232 	    final String str, final int ofs)
233 	{
234 		final int lp = pattern.length();
235 		final int ls = str.length();
236 
237 		// index into pattern string
238 		int ip = ofp;
239 
240 		// index into test string;
241 		int is = ofs;
242 
243 		char chp;
244 		char chs;
245 
246 		if (DEBUG) {
247 			if ((ip > -1) && (is > -1) && (ip < lp) && (is < ls)) {
248 				System.out.println("parse: " + pattern.substring(ip) + " "
249 				    + str.substring(is));
250 			}
251 		}
252 
253 		// Match happens only, if we parse both strings exactly to the end
254 		while ((ip < lp)) {
255 			chp = pattern.charAt(ip);
256 
257 			if (DEBUG) {
258 				if ((ip > -1) && (is > -1) && (ip < lp) && (is < ls)) {
259 					System.out.println(pattern.substring(ip) + " "
260 					    + str.substring(is));
261 				}
262 			}
263 
264 			switch (chp) {
265 			case '[':
266 
267 				// System.out.println("[ "+chp+", "+chs);
268 				// Each set must be close with a ], otherwise it is invalid.
269 				int end = pattern.indexOf("]", ip);
270 
271 				if (end == -1) {
272 					return false;
273 				}
274 
275 				// Is this set followed by a *	
276 				boolean isWildchar = ((end + 1) < lp)
277 					&& (pattern.charAt(end + 1) == '*');
278 
279 				if (is < ls) {
280 					chs = str.charAt(is);
281 				} else {
282 					return parse(pattern, end + 2, str, is);
283 				}
284 
285 				// Does this character match
286 				boolean thisChar = testSet(pattern, ip + 1, chs);
287 
288 				// Check for single character match only if there is no
289 				// * at the end.
290 				if (!thisChar && !isWildchar) {
291 					// Return only if this character does not match
292 					return false;
293 				}
294 
295 				if (isWildchar) {
296 					// If this character does not match, maybe this set
297 					// can be skipped entirely
298 					if (!thisChar) {
299 						ip = end + 2;
300 
301 						break;
302 					}
303 
304 					// Special case when this character matches, although
305 					// it should not: a[a-z]*z == az
306 					if (parse(pattern, end + 2, str, is)) {
307 						return true;
308 					}
309 
310 					// Try to match next character
311 					if (parse(pattern, ip, str, is + 1)) {
312 						return true;
313 					}
314 				}
315 
316 				// Single character matched, set was processed, since
317 				// no * was at the end.
318 				ip = end + 1;
319 				is++;
320 
321 				break;
322 
323 			case '?':
324 
325 				// Obvious
326 				ip++;
327 				is++;
328 
329 				break;
330 
331 			case '*':
332 
333 				// Trailing asterisk means that string matches till the end.
334 				// Also, checks if this is last char in the string
335 				if (ip + 1 == lp) {
336 					return true;
337 				}
338 
339 				// Skip the *
340 				do {
341 					ip++;
342 					chp = pattern.charAt(ip);
343 				} while ((ip + 1 < lp) && (chp == '*'));
344 
345 				// But perform a special check and solve it by recursing
346 				// from new position
347 				if (chp == '?') {
348 					if (parse(pattern, ip, str, is)) {
349 						return true;
350 					}
351 				}
352 
353 				// Iterate through all possible matches in the test string
354 				int i = is;
355 
356 				while (i < ls) {
357 					/*
358 					 * Would be nice to skip unmatchable characters,
359 					 * but it's too much fuss
360 					while ((i < ls) && (str.charAt(i) != chp)) {
361 					    i++;
362 					    if (i == ls) {
363 					        return false;
364 					    }
365 					}
366 					*/
367 
368 					// Stupid brute force, but isn't as bad as it seems.
369 					// Try all possible matches in the test string.
370 					if (parse(pattern, ip, str, i)) {
371 						return true;
372 					}
373 
374 					i++;
375 				}
376 
377 				break;
378 
379 			default:
380 
381 				// Literal match
382 				if (is == ls || pattern.charAt(ip) != str.charAt(is)) {
383 					return false;
384 				}
385 
386 				ip++;
387 				is++;
388 			}
389 		}
390 
391 		// There could be several * at the end of the pattern, although the
392 		// test string is at the end.
393 		while ((ip < lp) && ((pattern.charAt(ip)) == '*')) {
394 			ip++;
395 		}
396 
397 		// Same condition as with while loop
398 		return (is == ls) && (ip == lp);
399 	}
400 
401 	/**
402 	 * DOCUMENT ME!
403 	 *
404 	 * @param pattern DOCUMENT ME!
405 	 * @param str DOCUMENT ME!
406 	 *
407 	 * @return DOCUMENT ME!
408 	 */
409 	public static boolean match(final String pattern, final String str)
410 	{
411 		return parse(pattern, 0, str, 0);
412 	}
413 
414 	/**
415 	 * Run test applet.
416 	 *
417 	 * @param args command line parameters
418 	 */
419 	public static void main(String[] args)
420 	{
421 		System.out.println("[-az-]* == 01 abAZ : true = "
422 		    + WildcharMatcher.match("[-aa-]*", "01 abAZ"));
423 		System.out.println("[\\!a\\-bc]* == !!!b-bb- : true = "
424 		    + WildcharMatcher.match("[\\!a\\-bc]*", "!!!b-bb-"));
425 
426 		System.out.println("*zz == zz : true = "
427 		    + WildcharMatcher.match("*zz", "zz"));
428 		System.out.println("[abc]*zz == zz : true = "
429 		    + WildcharMatcher.match("[abc]*zz", "zz"));
430 
431 		System.out.println("[!abc]*a[def] == xyzbd : false = "
432 		    + WildcharMatcher.match("[!abc]*a[def]", "xyzbd"));
433 		System.out.println("[!abc]*a[def] == xyzad : true = "
434 		    + WildcharMatcher.match("[!abc]*a[def]", "xyzad"));
435 		System.out.println("[a-g]l*i?n == florian : true = "
436 		    + WildcharMatcher.match("[a-g]l*i?n", "florian"));
437 		System.out.println("[!abc]*e == smile : true = "
438 		    + WildcharMatcher.match("[!abc]*e", "smile"));
439 		System.out.println("[-z] == a : true = "
440 		    + WildcharMatcher.match("[-z]", "a"));
441 		System.out.println("[] == '' : false = "
442 		    + WildcharMatcher.match("[]", ""));
443 		System.out.println("[a-z]* == java : true = "
444 		    + WildcharMatcher.match("[a-z]*", "java"));
445 		System.out.println("*.* == command.com : true = "
446 		    + WildcharMatcher.match("*.*", "command.com"));
447 		System.out.println("*.* == /var/etc : false = "
448 		    + WildcharMatcher.match("*.*", "/var/etc"));
449 		System.out.println("**?*x*[abh-]*Q == XYZxabbauuZQ : true = "
450 		    + WildcharMatcher.match("**?*x*[abh-]*Q", "XYZxabbauuZQ"));
451 	}
452 }
453 
454 /* __oOo__ */