1 /*
2 * Copyright (c) 2003-2008 by Cosylab d. d.
3 *
4 * This file is part of Java-Common.
5 *
6 * Java-Common is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Java-Common is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Java-Common. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 package com.cosylab.util;
21
22 /**
23 * A Unix-like wildchar matcher. Supported wild-characters: '', '?'; sets:
24 * [a-z], '!' negation Examples: '[a-g]li?n' matches 'florian' '[!abc]e'
25 * matches 'smile' '[-z] matches 'a' Rules for sets: RegEx definition of the
26 * valid set is: [!]?(-.)?((.-.)|(.))(.-)? a-z : match any letter between 'a'
27 * and 'z' inclusively [-a : match everything up to and including 'a' (only
28 * valid at beginning) a-] : match everything from 'a' (only valid at the end)
29 * a : match exactly 'a' !a : not operator, match everything except 'a'
30 * (only allowed at beginning) \a : treat a literally (useful for specifying
31 * '!]-' in sets. Note that \t\b\n... are not processed. Wildchar rules:
32 * : match any number (0..inf) number of occurences of any character ? :
33 * match exactly and only one occurence of any character ab : match exactly
34 * 'ab' [..]: same as , but character must match the set.
35 *
36 * @author <a href="mailto:ales.pucelj@cosylab.com">Ales Pucelj</a>
37 * @version $id$
38 */
39 public class WildcharMatcher
40 {
41 private static final boolean DEBUG = false;
42
43 /** Value of initial state */
44 private static final int INITIAL = 0;
45
46 /** Value of final state */
47 private static final int FINAL = 2;
48
49 /** Value of error state */
50 private static final int ERROR = 99;
51
52 /** Any character (except control, unless escaped) */
53 private static final int TOKEN_CHAR = 0;
54
55 /** Token for end of set: ] */
56 private static final int TOKEN_END = 1;
57
58 /** Token for negation: */
59 private static final int TOKEN_NOT = 2;
60
61 /** Token for range specification: - */
62 private static final int TOKEN_MINUS = 3;
63
64 /**
65 * Transition table holds the nextState used in set parsing. Rows define
66 * states, columns define tokens. transitions[1][3] = 5 means: if in state
67 * 1 next token is 3, goto state 5
68 */
69 private static final int[][] TRANSITIONS = {
70 { 1, FINAL, 3, 4 },
71 { 1, FINAL, ERROR, 5 },
72 { ERROR, ERROR, ERROR, ERROR },
73 { 1, FINAL, ERROR, 4 },
74 { 6, ERROR, ERROR, ERROR },
75 { 6, FINAL, ERROR, ERROR },
76 { 1, FINAL, ERROR, ERROR }
77 };
78
79 private static int getToken(final char ch)
80 {
81 switch (ch) {
82 case ']':
83 return TOKEN_END;
84
85 case '!':
86 return TOKEN_NOT;
87
88 case '-':
89 return TOKEN_MINUS;
90
91 default:
92 return TOKEN_CHAR;
93 }
94 }
95
96 /**
97 * DFA for parsing set strings. DFA was obtained from JFlex using the rule
98 * : macro: CHAR = [^-\]\!] (everything except ], ! and - rule :
99 * [!]?(-{CHAR})?(({CHAR}-{CHAR})|({CHAR}))({CHAR}-)?\] Result of
100 * optimized NDFA is Character classes: class 0: [0-'
101 * ']['"'-',']['.'-'\']['^'-65535] class 1: [']'] class 2: ['!'] class
102 * 3: ['-'] Transition graph (for class goto state) State 0: 0 -> 1, 1 ->
103 * 2, 2 -> 3, 3 -> 4 State 1: 0 -> 1, 1 -> 2, 3 -> 5 State [FINAL] State
104 * 3: 0 -> 1, 1 -> 2, 3 -> 4 State 4: 0 -> 6 State 5: 0 -> 6, 1 -> 2 State
105 * 6: 0 -> 1, 1 -> 2
106 *
107 * @param pattern DOCUMENT ME!
108 * @param offset DOCUMENT ME!
109 * @param ch DOCUMENT ME!
110 *
111 * @return DOCUMENT ME!
112 */
113 public static boolean testSet(final String pattern, int offset,
114 final char ch)
115 {
116 final int n = pattern.length();
117
118 int state = INITIAL;
119 int nextToken = ' ';
120 char nextChar = ' ';
121 char ch1 = ' ';
122
123 boolean found = false;
124
125 boolean negate = false;
126
127 while (!found) {
128 // Check for offset in case of final state, which is over the limit,
129 // if ] is at the end of the string.
130 if (offset < n) {
131 nextChar = pattern.charAt(offset);
132
133 if (nextChar == '\\') {
134 // Any escaped sequence is two characters, otherwise error will
135 // be throws, since this is an invalid sequence anyway
136 nextChar = pattern.charAt(offset + 1);
137 nextToken = TOKEN_CHAR;
138 offset++;
139 } else {
140 nextToken = getToken(nextChar);
141 }
142 }
143
144 switch (state) {
145 case INITIAL:
146
147 if (nextToken == TOKEN_NOT) {
148 negate = true;
149
150 break;
151 }
152
153 // No break, states 0, 1, 3, 6 have same next condition.
154 case 1:
155
156 if (nextToken == TOKEN_END) {
157 return true;
158 }
159
160 case 3:
161 case 6:
162
163 if (nextToken == TOKEN_CHAR) {
164 found = (ch == nextChar);
165 ch1 = nextChar;
166 }
167
168 break;
169
170 case 4:
171
172 // condition [-a...
173 found = (ch <= nextChar);
174
175 break;
176
177 case 5:
178
179 if (nextToken == TOKEN_CHAR) {
180 // condition ...a-z...
181 found = ((ch >= ch1) && (ch <= nextChar));
182 }
183
184 if (nextToken == TOKEN_END) {
185 // condition ...a-]
186 found = (ch >= ch1);
187 }
188
189 break;
190
191 default:}
192
193 if (DEBUG) {
194 System.out.println("( " + state + " -> "
195 + TRANSITIONS[state][nextToken] + " ) token = " + nextToken
196 + " char = " + nextChar + ", found = " + found
197 + ", negate = " + negate);
198 }
199
200 // Lookup next state in transition table and check for valid pattern
201 state = TRANSITIONS[state][nextToken];
202
203 if (state == ERROR) {
204 return false;
205
206 // don't bother, this is a no match anyway
207 // throw new RuntimeException("Invalid pattern");
208 }
209
210 if (state == FINAL) {
211 return found ^ negate;
212 }
213
214 offset++;
215 }
216
217 return found ^ negate;
218 }
219
220 /**
221 * Recursive method for parsing the string. To avoid copying the strings,
222 * the method accepts offset indices into both parameters.
223 *
224 * @param pattern Pattern used in parsing
225 * @param ofp Offset into pattern string (ofp > 0)
226 * @param str String to test
227 * @param ofs Offset into test string (ofs > 0);
228 *
229 * @return boolean Do the strings match
230 */
231 public static boolean parse(final String pattern, final int ofp,
232 final String str, final int ofs)
233 {
234 final int lp = pattern.length();
235 final int ls = str.length();
236
237 // index into pattern string
238 int ip = ofp;
239
240 // index into test string;
241 int is = ofs;
242
243 char chp;
244 char chs;
245
246 if (DEBUG) {
247 if ((ip > -1) && (is > -1) && (ip < lp) && (is < ls)) {
248 System.out.println("parse: " + pattern.substring(ip) + " "
249 + str.substring(is));
250 }
251 }
252
253 // Match happens only, if we parse both strings exactly to the end
254 while ((ip < lp)) {
255 chp = pattern.charAt(ip);
256
257 if (DEBUG) {
258 if ((ip > -1) && (is > -1) && (ip < lp) && (is < ls)) {
259 System.out.println(pattern.substring(ip) + " "
260 + str.substring(is));
261 }
262 }
263
264 switch (chp) {
265 case '[':
266
267 // System.out.println("[ "+chp+", "+chs);
268 // Each set must be close with a ], otherwise it is invalid.
269 int end = pattern.indexOf("]", ip);
270
271 if (end == -1) {
272 return false;
273 }
274
275 // Is this set followed by a *
276 boolean isWildchar = ((end + 1) < lp)
277 && (pattern.charAt(end + 1) == '*');
278
279 if (is < ls) {
280 chs = str.charAt(is);
281 } else {
282 return parse(pattern, end + 2, str, is);
283 }
284
285 // Does this character match
286 boolean thisChar = testSet(pattern, ip + 1, chs);
287
288 // Check for single character match only if there is no
289 // * at the end.
290 if (!thisChar && !isWildchar) {
291 // Return only if this character does not match
292 return false;
293 }
294
295 if (isWildchar) {
296 // If this character does not match, maybe this set
297 // can be skipped entirely
298 if (!thisChar) {
299 ip = end + 2;
300
301 break;
302 }
303
304 // Special case when this character matches, although
305 // it should not: a[a-z]*z == az
306 if (parse(pattern, end + 2, str, is)) {
307 return true;
308 }
309
310 // Try to match next character
311 if (parse(pattern, ip, str, is + 1)) {
312 return true;
313 }
314 }
315
316 // Single character matched, set was processed, since
317 // no * was at the end.
318 ip = end + 1;
319 is++;
320
321 break;
322
323 case '?':
324
325 // Obvious
326 ip++;
327 is++;
328
329 break;
330
331 case '*':
332
333 // Trailing asterisk means that string matches till the end.
334 // Also, checks if this is last char in the string
335 if (ip + 1 == lp) {
336 return true;
337 }
338
339 // Skip the *
340 do {
341 ip++;
342 chp = pattern.charAt(ip);
343 } while ((ip + 1 < lp) && (chp == '*'));
344
345 // But perform a special check and solve it by recursing
346 // from new position
347 if (chp == '?') {
348 if (parse(pattern, ip, str, is)) {
349 return true;
350 }
351 }
352
353 // Iterate through all possible matches in the test string
354 int i = is;
355
356 while (i < ls) {
357 /*
358 * Would be nice to skip unmatchable characters,
359 * but it's too much fuss
360 while ((i < ls) && (str.charAt(i) != chp)) {
361 i++;
362 if (i == ls) {
363 return false;
364 }
365 }
366 */
367
368 // Stupid brute force, but isn't as bad as it seems.
369 // Try all possible matches in the test string.
370 if (parse(pattern, ip, str, i)) {
371 return true;
372 }
373
374 i++;
375 }
376
377 break;
378
379 default:
380
381 // Literal match
382 if (is == ls || pattern.charAt(ip) != str.charAt(is)) {
383 return false;
384 }
385
386 ip++;
387 is++;
388 }
389 }
390
391 // There could be several * at the end of the pattern, although the
392 // test string is at the end.
393 while ((ip < lp) && ((pattern.charAt(ip)) == '*')) {
394 ip++;
395 }
396
397 // Same condition as with while loop
398 return (is == ls) && (ip == lp);
399 }
400
401 /**
402 * DOCUMENT ME!
403 *
404 * @param pattern DOCUMENT ME!
405 * @param str DOCUMENT ME!
406 *
407 * @return DOCUMENT ME!
408 */
409 public static boolean match(final String pattern, final String str)
410 {
411 return parse(pattern, 0, str, 0);
412 }
413
414 /**
415 * Run test applet.
416 *
417 * @param args command line parameters
418 */
419 public static void main(String[] args)
420 {
421 System.out.println("[-az-]* == 01 abAZ : true = "
422 + WildcharMatcher.match("[-aa-]*", "01 abAZ"));
423 System.out.println("[\\!a\\-bc]* == !!!b-bb- : true = "
424 + WildcharMatcher.match("[\\!a\\-bc]*", "!!!b-bb-"));
425
426 System.out.println("*zz == zz : true = "
427 + WildcharMatcher.match("*zz", "zz"));
428 System.out.println("[abc]*zz == zz : true = "
429 + WildcharMatcher.match("[abc]*zz", "zz"));
430
431 System.out.println("[!abc]*a[def] == xyzbd : false = "
432 + WildcharMatcher.match("[!abc]*a[def]", "xyzbd"));
433 System.out.println("[!abc]*a[def] == xyzad : true = "
434 + WildcharMatcher.match("[!abc]*a[def]", "xyzad"));
435 System.out.println("[a-g]l*i?n == florian : true = "
436 + WildcharMatcher.match("[a-g]l*i?n", "florian"));
437 System.out.println("[!abc]*e == smile : true = "
438 + WildcharMatcher.match("[!abc]*e", "smile"));
439 System.out.println("[-z] == a : true = "
440 + WildcharMatcher.match("[-z]", "a"));
441 System.out.println("[] == '' : false = "
442 + WildcharMatcher.match("[]", ""));
443 System.out.println("[a-z]* == java : true = "
444 + WildcharMatcher.match("[a-z]*", "java"));
445 System.out.println("*.* == command.com : true = "
446 + WildcharMatcher.match("*.*", "command.com"));
447 System.out.println("*.* == /var/etc : false = "
448 + WildcharMatcher.match("*.*", "/var/etc"));
449 System.out.println("**?*x*[abh-]*Q == XYZxabbauuZQ : true = "
450 + WildcharMatcher.match("**?*x*[abh-]*Q", "XYZxabbauuZQ"));
451 }
452 }
453
454 /* __oOo__ */