VPR-7.0
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros
slre.c
Go to the documentation of this file.
1 // Copyright (c) 2004-2012 Sergey Lyubka <valenok@gmail.com>
2 // All rights reserved
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 // THE SOFTWARE.
21 
22 #include <stdio.h>
23 #include <assert.h>
24 #include <ctype.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include "slre.h"
30 
31 // Compiled regular expression
32 struct slre {
33  unsigned char code[256];
34  unsigned char data[256];
35  int code_size;
36  int data_size;
37  int num_caps; // Number of bracket pairs
38  int anchored; // Must match from string start
40  const char *error_string; // Error string
41 };
42 
43 // Captured substring
44 struct cap {
45  const char *ptr; // Pointer to the substring
46  int len; // Substring length
47 };
48 
49 enum {
52 };
53 
54 // Commands and operands are all unsigned char (1 byte long). All code offsets
55 // are relative to current address, and positive (always point forward). Data
56 // offsets are absolute. Commands with operands:
57 //
58 // BRANCH offset1 offset2
59 // Try to match the code block that follows the BRANCH instruction
60 // (code block ends with END). If no match, try to match code block that
61 // starts at offset1. If either of these match, jump to offset2.
62 //
63 // EXACT data_offset data_length
64 // Try to match exact string. String is recorded in data section from
65 // data_offset, and has length data_length.
66 //
67 // OPEN capture_number
68 // CLOSE capture_number
69 // If the user have passed 'struct cap' array for captures, OPEN
70 // records the beginning of the matched substring (cap->ptr), CLOSE
71 // sets the length (cap->len) for respective capture_number.
72 //
73 // STAR code_offset
74 // PLUS code_offset
75 // QUEST code_offset
76 // *, +, ?, respectively. Try to gobble as much as possible from the
77 // matched buffer while code block that follows these instructions
78 // matches. When the longest possible string is matched,
79 // jump to code_offset
80 //
81 // STARQ, PLUSQ are non-greedy versions of STAR and PLUS.
82 
83 static const char *meta_characters = "|.*+?()[\\";
84 static const char *error_no_match = "No match";
85 
86 static void set_jump_offset(struct slre *r, int pc, int offset) {
87  assert(offset < r->code_size);
88  if (r->code_size - offset > 0xff) {
89  r->error_string = "Jump offset is too big";
90  } else {
91  r->code[pc] = (unsigned char) (r->code_size - offset);
92  }
93 }
94 
95 static void emit(struct slre *r, int code) {
96  if (r->code_size >= (int) (sizeof(r->code) / sizeof(r->code[0]))) {
97  r->error_string = "RE is too long (code overflow)";
98  } else {
99  r->code[r->code_size++] = (unsigned char) code;
100  }
101 }
102 
103 static void store_char_in_data(struct slre *r, int ch) {
104  if (r->data_size >= (int) sizeof(r->data)) {
105  r->error_string = "RE is too long (data overflow)";
106  } else {
107  r->data[r->data_size++] = ch;
108  }
109 }
110 
111 static void exact(struct slre *r, const char **re) {
112  int old_data_size = r->data_size;
113 
114  while (**re != '\0' && (strchr(meta_characters, **re)) == NULL) {
115  store_char_in_data(r, *(*re)++);
116  }
117 
118  emit(r, EXACT);
119  emit(r, old_data_size);
120  emit(r, r->data_size - old_data_size);
121 }
122 
123 static int get_escape_char(const char **re) {
124  int res;
125 
126  switch (*(*re)++) {
127  case 'n': res = '\n'; break;
128  case 'r': res = '\r'; break;
129  case 't': res = '\t'; break;
130  case '0': res = 0; break;
131  case 'S': res = NONSPACE << 8; break;
132  case 's': res = SPACE << 8; break;
133  case 'd': res = DIGIT << 8; break;
134  default: res = (*re)[-1]; break;
135  }
136 
137  return res;
138 }
139 
140 static void anyof(struct slre *r, const char **re) {
141  int esc, old_data_size = r->data_size, op = ANYOF;
142 
143  while (**re != '\0')
144 
145  switch (*(*re)++) {
146  case ']':
147  emit(r, op);
148  emit(r, old_data_size);
149  emit(r, r->data_size - old_data_size);
150  return;
151  // NOTREACHED
152  break;
153  case '\\':
154  esc = get_escape_char(re);
155  if ((esc & 0xff) == 0) {
156  store_char_in_data(r, 0);
157  store_char_in_data(r, esc >> 8);
158  } else {
159  store_char_in_data(r, esc);
160  }
161  break;
162  default:
163  store_char_in_data(r, (*re)[-1]);
164  break;
165  }
166 
167  r->error_string = "No closing ']' bracket";
168 }
169 
170 static void relocate(struct slre *r, int begin, int shift) {
171  emit(r, END);
172  memmove(r->code + begin + shift, r->code + begin, r->code_size - begin);
173  r->code_size += shift;
174 }
175 
176 static void quantifier(struct slre *r, int prev, int op) {
177  if (r->code[prev] == EXACT && r->code[prev + 2] > 1) {
178  r->code[prev + 2]--;
179  emit(r, EXACT);
180  emit(r, r->code[prev + 1] + r->code[prev + 2]);
181  emit(r, 1);
182  prev = r->code_size - 3;
183  }
184  relocate(r, prev, 2);
185  r->code[prev] = op;
186  set_jump_offset(r, prev + 1, prev);
187 }
188 
189 static void exact_one_char(struct slre *r, int ch) {
190  emit(r, EXACT);
191  emit(r, r->data_size);
192  emit(r, 1);
193  store_char_in_data(r, ch);
194 }
195 
196 static void fixup_branch(struct slre *r, int fixup) {
197  if (fixup > 0) {
198  emit(r, END);
199  set_jump_offset(r, fixup, fixup - 2);
200  }
201 }
202 
203 static void compile(struct slre *r, const char **re) {
204  int op, esc, branch_start, last_op, fixup, cap_no, level;
205 
206  fixup = 0;
207  level = r->num_caps;
208  branch_start = last_op = r->code_size;
209 
210  for (;;)
211  switch (*(*re)++) {
212 
213  case '\0':
214  (*re)--;
215  return;
216  // NOTREACHED
217  break;
218 
219  case '.':
220  last_op = r->code_size;
221  emit(r, ANY);
222  break;
223 
224  case '[':
225  last_op = r->code_size;
226  anyof(r, re);
227  break;
228 
229  case '\\':
230  last_op = r->code_size;
231  esc = get_escape_char(re);
232  if (esc & 0xff00) {
233  emit(r, esc >> 8);
234  } else {
235  exact_one_char(r, esc);
236  }
237  break;
238 
239  case '(':
240  last_op = r->code_size;
241  cap_no = ++r->num_caps;
242  emit(r, OPEN);
243  emit(r, cap_no);
244 
245  compile(r, re);
246  if (*(*re)++ != ')') {
247  r->error_string = "No closing bracket";
248  return;
249  }
250 
251  emit(r, CLOSE);
252  emit(r, cap_no);
253  break;
254 
255  case ')':
256  (*re)--;
257  fixup_branch(r, fixup);
258  if (level == 0) {
259  r->error_string = "Unbalanced brackets";
260  return;
261  }
262  return;
263  // NOTREACHED
264  break;
265 
266  case '+':
267  case '*':
268  op = (*re)[-1] == '*' ? STAR: PLUS;
269  if (**re == '?') {
270  (*re)++;
271  op = op == STAR ? STARQ : PLUSQ;
272  }
273  quantifier(r, last_op, op);
274  break;
275 
276  case '?':
277  quantifier(r, last_op, QUEST);
278  break;
279 
280  case '|':
281  fixup_branch(r, fixup);
282  relocate(r, branch_start, 3);
283  r->code[branch_start] = BRANCH;
284  set_jump_offset(r, branch_start + 1, branch_start);
285  fixup = branch_start + 2;
286  r->code[fixup] = 0xff;
287  break;
288 
289  default:
290  (*re)--;
291  last_op = r->code_size;
292  exact(r, re);
293  break;
294  }
295 }
296 
297 // Compile regular expression. If success, 1 is returned.
298 // If error, 0 is returned and slre.error_string points to the error message.
299 static const char *compile2(struct slre *r, const char *re) {
300  r->error_string = NULL;
301  r->code_size = r->data_size = r->num_caps = r->anchored = 0;
302 
303  emit(r, OPEN); // This will capture what matches full RE
304  emit(r, 0);
305 
306  while (*re != '\0') {
307  compile(r, &re);
308  }
309 
310  if (r->code[2] == BRANCH) {
311  fixup_branch(r, 4);
312  }
313 
314  emit(r, CLOSE);
315  emit(r, 0);
316  emit(r, END);
317 
318 #if 0
319  static void dump(const struct slre *, FILE *);
320  dump(r, stdout);
321 #endif
322 
323  return r->error_string;
324 }
325 
326 static const char *match(const struct slre *, int, const char *, int, int *,
327  struct cap *);
328 
329 static void loop_greedy(const struct slre *r, int pc, const char *s, int len,
330  int *ofs) {
331  int saved_offset, matched_offset;
332 
333  saved_offset = matched_offset = *ofs;
334 
335  while (!match(r, pc + 2, s, len, ofs, NULL)) {
336  saved_offset = *ofs;
337  if (!match(r, pc + r->code[pc + 1], s, len, ofs, NULL)) {
338  matched_offset = saved_offset;
339  }
340  *ofs = saved_offset;
341  }
342 
343  *ofs = matched_offset;
344 }
345 
346 static void loop_non_greedy(const struct slre *r, int pc, const char *s,
347  int len, int *ofs) {
348  int saved_offset = *ofs;
349 
350  while (!match(r, pc + 2, s, len, ofs, NULL)) {
351  saved_offset = *ofs;
352  if (!match(r, pc + r->code[pc + 1], s, len, ofs, NULL))
353  break;
354  }
355 
356  *ofs = saved_offset;
357 }
358 
359 static int is_any_of(const unsigned char *p, int len, const char *s, int *ofs) {
360  int i, ch;
361 
362  ch = s[*ofs];
363 
364  for (i = 0; i < len; i++)
365  if (p[i] == ch) {
366  (*ofs)++;
367  return 1;
368  }
369 
370  return 0;
371 }
372 
373 static int is_any_but(const unsigned char *p, int len, const char *s,
374  int *ofs) {
375  int i, ch;
376 
377  ch = s[*ofs];
378 
379  for (i = 0; i < len; i++)
380  if (p[i] == ch) {
381  return 0;
382  }
383 
384  (*ofs)++;
385  return 1;
386 }
387 
388 static int lowercase(const char *s) {
389  return tolower(* (const unsigned char *) s);
390 }
391 
392 static int casecmp(const void *p1, const void *p2, size_t len) {
393  const char *s1 = (const char *) p1, *s2 = (const char *) p2;
394  int diff = 0;
395 
396  if (len > 0)
397  do {
398  diff = lowercase(s1++) - lowercase(s2++);
399  } while (diff == 0 && s1[-1] != '\0' && --len > 0);
400 
401  return diff;
402 }
403 
404 static const char *match(const struct slre *r, int pc, const char *s, int len,
405  int *ofs, struct cap *caps) {
406  int n, saved_offset;
407  const char *error_string = NULL;
408  int (*cmp)(const void *string1, const void *string2, size_t len);
409 
410  while (error_string == NULL && r->code[pc] != END) {
411 
412  assert(pc < r->code_size);
413  assert(pc < (int) (sizeof(r->code) / sizeof(r->code[0])));
414 
415  switch (r->code[pc]) {
416  case BRANCH:
417  saved_offset = *ofs;
418  error_string = match(r, pc + 3, s, len, ofs, caps);
419  if (error_string != NULL) {
420  *ofs = saved_offset;
421  error_string = match(r, pc + r->code[pc + 1], s, len, ofs, caps);
422  }
423  pc += r->code[pc + 2];
424  break;
425 
426  case EXACT:
427  error_string = error_no_match;
428  n = r->code[pc + 2]; // String length
429  cmp = r->options & SLRE_CASE_INSENSITIVE ? casecmp : memcmp;
430  if (n <= len - *ofs && !cmp(s + *ofs, r->data + r->code[pc + 1], n)) {
431  (*ofs) += n;
432  error_string = NULL;
433  }
434  pc += 3;
435  break;
436 
437  case QUEST:
438  error_string = NULL;
439  saved_offset = *ofs;
440  if (match(r, pc + 2, s, len, ofs, caps) != NULL) {
441  *ofs = saved_offset;
442  }
443  pc += r->code[pc + 1];
444  break;
445 
446  case STAR:
447  error_string = NULL;
448  loop_greedy(r, pc, s, len, ofs);
449  pc += r->code[pc + 1];
450  break;
451 
452  case STARQ:
453  error_string = NULL;
454  loop_non_greedy(r, pc, s, len, ofs);
455  pc += r->code[pc + 1];
456  break;
457 
458  case PLUS:
459  if ((error_string = match(r, pc + 2, s, len, ofs, caps)) != NULL)
460  break;
461 
462  loop_greedy(r, pc, s, len, ofs);
463  pc += r->code[pc + 1];
464  break;
465 
466  case PLUSQ:
467  if ((error_string = match(r, pc + 2, s, len, ofs, caps)) != NULL)
468  break;
469 
470  loop_non_greedy(r, pc, s, len, ofs);
471  pc += r->code[pc + 1];
472  break;
473 
474  case SPACE:
475  error_string = error_no_match;
476  if (*ofs < len && isspace(((const unsigned char *)s)[*ofs])) {
477  (*ofs)++;
478  error_string = NULL;
479  }
480  pc++;
481  break;
482 
483  case NONSPACE:
484  error_string = error_no_match;
485  if (*ofs <len && !isspace(((const unsigned char *)s)[*ofs])) {
486  (*ofs)++;
487  error_string = NULL;
488  }
489  pc++;
490  break;
491 
492  case DIGIT:
493  error_string = error_no_match;
494  if (*ofs < len && isdigit(((const unsigned char *)s)[*ofs])) {
495  (*ofs)++;
496  error_string = NULL;
497  }
498  pc++;
499  break;
500 
501  case ANY:
502  error_string = error_no_match;
503  if (*ofs < len) {
504  (*ofs)++;
505  error_string = NULL;
506  }
507  pc++;
508  break;
509 
510  case ANYOF:
511  error_string = error_no_match;
512  if (*ofs < len)
513  error_string = is_any_of(r->data + r->code[pc + 1], r->code[pc + 2],
514  s, ofs) ? NULL : error_no_match;
515  pc += 3;
516  break;
517 
518  case ANYBUT:
519  error_string = error_no_match;
520  if (*ofs < len)
521  error_string = is_any_but(r->data + r->code[pc + 1], r->code[pc + 2],
522  s, ofs) ? NULL : error_no_match;
523  pc += 3;
524  break;
525 
526  case BOL:
527  error_string = *ofs == 0 ? NULL : error_no_match;
528  pc++;
529  break;
530 
531  case EOL:
532  error_string = *ofs == len ? NULL : error_no_match;
533  pc++;
534  break;
535 
536  case OPEN:
537  if (caps != NULL)
538  caps[r->code[pc + 1]].ptr = s + *ofs;
539  pc += 2;
540  break;
541 
542  case CLOSE:
543  if (caps != NULL)
544  caps[r->code[pc + 1]].len = (s + *ofs) -
545  caps[r->code[pc + 1]].ptr;
546  pc += 2;
547  break;
548 
549  case END:
550  pc++;
551  break;
552 
553  default:
554  printf("unknown cmd (%d) at %d\n", r->code[pc], pc);
555  assert(0);
556  break;
557  }
558  }
559 
560  return error_string;
561 }
562 
563 // Return 1 if match, 0 if no match.
564 // If 'captured_substrings' array is not NULL, then it is filled with the
565 // values of captured substrings. captured_substrings[0] element is always
566 // a full matched substring. The round bracket captures start from
567 // captured_substrings[1].
568 // It is assumed that the size of captured_substrings array is enough to
569 // hold all captures. The caller function must make sure it is! So, the
570 // array_size = number_of_round_bracket_pairs + 1
571 static const char *match2(const struct slre *r, const char *buf, int len,
572  struct cap *caps) {
573  int i, ofs = 0;
574  const char *error_string = error_no_match;
575 
576  if (r->anchored) {
577  error_string = match(r, 0, buf, len, &ofs, caps);
578  } else {
579  for (i = 0; i < len && error_string != NULL; i++) {
580  ofs = i;
581  error_string = match(r, 0, buf, len, &ofs, caps);
582  }
583  }
584 
585  return error_string;
586 }
587 
588 static const char *capture_float(const struct cap *cap, void *p, size_t len) {
589  const char *fmt;
590  char buf[20];
591 
592  switch (len) {
593  case sizeof(float): fmt = "f"; break;
594  case sizeof(double): fmt = "lf"; break;
595  default: return "SLRE_FLOAT: unsupported size";
596  }
597 
598  sprintf(buf, "%%%d%s", cap->len, fmt);
599  return sscanf(cap->ptr, buf, p) == 1 ? NULL : "SLRE_FLOAT: capture failed";
600 }
601 
602 static const char *capture_string(const struct cap *cap, void *p, size_t len) {
603  if ((int) len <= cap->len) {
604  return "SLRE_STRING: buffer size too small";
605  }
606  memcpy(p, cap->ptr, cap->len);
607  ((char *) p)[cap->len] = '\0';
608  return NULL;
609 }
610 
611 static const char *capture_int(const struct cap *cap, void *p, size_t len) {
612  const char *fmt;
613  char buf[20];
614 
615  switch (len) {
616  case sizeof(char): fmt = "hh"; break;
617  case sizeof(short): fmt = "h"; break;
618  case sizeof(int): fmt = "d"; break;
619  default: return "SLRE_INT: unsupported size";
620  }
621 
622  sprintf(buf, "%%%d%s", cap->len, fmt);
623  return sscanf(cap->ptr, buf, p) == 1 ? NULL : "SLRE_INT: capture failed";
624 }
625 
626 static const char *capture(const struct cap *caps, int num_caps, va_list ap) {
627  int i, type;
628  size_t size;
629  void *p;
630  const char *err = NULL;
631 
632  for (i = 0; i < num_caps; i++) {
633  type = va_arg(ap, int);
634  size = va_arg(ap, size_t);
635  p = va_arg(ap, void *);
636  switch (type) {
637  case SLRE_INT: err = capture_int(&caps[i], p, size); break;
638  case SLRE_FLOAT: err = capture_float(&caps[i], p, size); break;
639  case SLRE_STRING: err = capture_string(&caps[i], p, size); break;
640  default: err = "Unknown type, expected SLRE_(INT|FLOAT|STRING)"; break;
641  }
642  }
643  return err;
644 }
645 
646 const char *slre_match(enum slre_option options, const char *re,
647  const char *buf, int buf_len, ...) {
648  struct slre slre;
649  struct cap caps[20];
650  va_list ap;
651  const char *error_string = NULL;
652 
653  slre.options = options;
654  if ((error_string = compile2(&slre, re)) == NULL &&
655  (error_string = match2(&slre, buf, buf_len, caps)) == NULL) {
656  va_start(ap, buf_len);
657  error_string = capture(caps + 1, slre.num_caps, ap);
658  va_end(ap);
659  }
660 
661  return error_string;
662 }
Definition: slre.c:51
static int casecmp(const void *p1, const void *p2, size_t len)
Definition: slre.c:392
const char * slre_match(enum slre_option options, const char *re, const char *buf, int buf_len,...)
Definition: slre.c:646
static const char * match2(const struct slre *r, const char *buf, int len, struct cap *caps)
Definition: slre.c:571
const char * ptr
Definition: slre.c:45
Definition: slre.c:50
Definition: slre.c:50
Definition: slre.c:50
int code_size
Definition: slre.c:35
Definition: slre.c:50
int num_caps
Definition: slre.c:37
Definition: slre.c:51
slre_option
Definition: slre.h:80
static void exact_one_char(struct slre *r, int ch)
Definition: slre.c:189
static const char * capture_int(const struct cap *cap, void *p, size_t len)
Definition: slre.c:611
static void quantifier(struct slre *r, int prev, int op)
Definition: slre.c:176
Definition: slre.c:50
int data_size
Definition: slre.c:36
unsigned char data[256]
Definition: slre.c:34
static const char * error_no_match
Definition: slre.c:84
static const char * capture_float(const struct cap *cap, void *p, size_t len)
Definition: slre.c:588
static int is_any_but(const unsigned char *p, int len, const char *s, int *ofs)
Definition: slre.c:373
Definition: slre.c:50
int anchored
Definition: slre.c:38
static void fixup_branch(struct slre *r, int fixup)
Definition: slre.c:196
static const char * meta_characters
Definition: slre.c:83
static void emit(struct slre *r, int code)
Definition: slre.c:95
static const char * compile2(struct slre *r, const char *re)
Definition: slre.c:299
static void compile(struct slre *r, const char **re)
Definition: slre.c:203
Definition: slre.c:51
enum slre_option options
Definition: slre.c:39
Definition: slre.c:50
Definition: slre.c:51
static void relocate(struct slre *r, int begin, int shift)
Definition: slre.c:170
static const char * capture(const struct cap *caps, int num_caps, va_list ap)
Definition: slre.c:626
int len
Definition: slre.c:46
static int lowercase(const char *s)
Definition: slre.c:388
unsigned char code[256]
Definition: slre.c:33
Definition: slre.c:51
static void loop_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
Definition: slre.c:329
Definition: slre.c:50
Definition: slre.c:44
static int get_escape_char(const char **re)
Definition: slre.c:123
Definition: slre.c:50
static void loop_non_greedy(const struct slre *r, int pc, const char *s, int len, int *ofs)
Definition: slre.c:346
const char * error_string
Definition: slre.c:40
Definition: slre.c:50
Definition: slre.c:51
static void anyof(struct slre *r, const char **re)
Definition: slre.c:140
static const char * capture_string(const struct cap *cap, void *p, size_t len)
Definition: slre.c:602
Definition: slre.c:50
static const char * match(const struct slre *, int, const char *, int, int *, struct cap *)
Definition: slre.c:404
Definition: slre.h:81
static int is_any_of(const unsigned char *p, int len, const char *s, int *ofs)
Definition: slre.c:359
Definition: slre.c:50
static void exact(struct slre *r, const char **re)
Definition: slre.c:111
static void store_char_in_data(struct slre *r, int ch)
Definition: slre.c:103
Definition: slre.c:32
static void set_jump_offset(struct slre *r, int pc, int offset)
Definition: slre.c:86