summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBobby Bingham <koorogi@koorogi.info>2017-07-23 22:24:32 -0500
committerBobby Bingham <koorogi@koorogi.info>2017-07-24 21:19:10 -0500
commitc8a0b7157d544f2359f2373160dcc69cdef8f4de (patch)
tree2fa9c2cc4646cab3886025a9663289b177ad4c55
Initial commit
-rw-r--r--.gitignore5
-rw-r--r--Makefile31
-rw-r--r--ast.c99
-rw-r--r--ast.h40
-rw-r--r--lex-pattern.l58
-rw-r--r--parse-pattern.y96
-rw-r--r--pattern2regex.c20
7 files changed, 349 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7677c59
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.o
+lex-*.c
+lex-*.h
+parse-*.c
+parse-*.h
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e62c90c
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,31 @@
+OPTFLAGS = -O2
+CFLAGS = -std=c99 -D_POSIX_C_SOURCE=200809L $(OPTFLAGS)
+
+TARGETS = pattern2regex
+LEXERS = $(wildcard *.l)
+PARSERS = $(wildcard *.y)
+SOURCES = $(wildcard *.c)
+OBJECTS = $(sort $(SOURCES:.c=.o) $(LEXERS:.l=.o) $(PARSERS:.y=.o))
+
+all: $(TARGETS)
+
+clean:
+ rm -f $(TARGETS)
+ rm -f $(OBJECTS)
+ rm -f $(LEXERS:.l=.c) $(LEXERS:.l=.h)
+ rm -f $(PARSERS:.y=.c) $(PARSERS:.y=.h)
+
+lex-%.c lex-%.h: lex-%.l
+ flex $<
+
+parse-%.c parse-%.h: parse-%.y
+ bison $<
+
+%.o: %.c
+ $(CC) $(CFLAGS) -c $< -o $@
+
+pattern2regex.o: lex-pattern.h parse-pattern.h
+pattern2regex: pattern2regex.o lex-pattern.o parse-pattern.o ast.o
+
+$(TARGETS):
+ $(CC) $(CFLAGS) $^ -o $@
diff --git a/ast.c b/ast.c
new file mode 100644
index 0000000..d261b9d
--- /dev/null
+++ b/ast.c
@@ -0,0 +1,99 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ast.h"
+
+struct atom *ast;
+
+struct atom *mkatom(const struct atom *src)
+{
+ struct atom *ret = malloc(sizeof *ret);
+ if (ret) *ret = *src;
+ return ret;
+}
+
+static void ivprintf(int level, const char *fmt, va_list ap)
+{
+ printf("%*s", 4 * level, "");
+ vprintf(fmt, ap);
+}
+
+static void iprintf(int level, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ ivprintf(level, fmt, ap);
+ va_end(ap);
+}
+
+static void dump_atom_indent(const struct atom *a, int level);
+
+static void dump_atom_children(const struct atom *a, int level, const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ ivprintf(level, fmt, ap);
+ va_end(ap);
+
+ iprintf(level, "{\n");
+ dump_atom_indent(a->u.children[0], level + 1);
+ dump_atom_indent(a->u.children[1], level + 1);
+ iprintf(level, "}\n");
+}
+
+const char *fmt_repeat(char *buf, const struct repeat *r)
+{
+ if (r->min == r->max)
+ sprintf(buf, "%ld", r->min);
+ else if (r->max < 0)
+ sprintf(buf, "%ld+", r->min);
+ else
+ sprintf(buf, "%ld - %ld", r->min, r->max);
+ return buf;
+}
+
+#define FMT_REPEAT(r) fmt_repeat((char[64]){0}, (r))
+
+static void dump_atom_indent(const struct atom *a, int level)
+{
+ switch (a->type) {
+ case ATOM_ALTERNATION:
+ dump_atom_children(a, level, "alternation\n");
+ break;
+
+ case ATOM_SEQUENCE:
+ dump_atom_indent(a->u.children[0], level);
+ dump_atom_indent(a->u.children[1], level);
+ break;
+
+ case ATOM_REPETITION:
+ if (a->u.repeat.counts.min == 1 && a->u.repeat.counts.max == 1) {
+ dump_atom_indent(a->u.repeat.child, level);
+ } else {
+ iprintf(level, "repeat %s times\n", FMT_REPEAT(&a->u.repeat.counts));
+ iprintf(level, "{\n");
+ dump_atom_indent(a->u.repeat.child, level + 1);
+ iprintf(level, "}\n");
+ }
+ break;
+
+ case ATOM_LITERAL:
+ iprintf(level, "literal: %s\n", a->u.literal);
+ break;
+
+ case ATOM_ALPHABETIC: iprintf(level, "[:alpha:]\n"); break;
+ case ATOM_NUMERIC: iprintf(level, "[:digit:]\n"); break;
+ case ATOM_UPPERCASE: iprintf(level, "[:upper:]\n"); break;
+ case ATOM_LOWERCASE: iprintf(level, "[:lower:]\n"); break;
+ case ATOM_CONTROL: iprintf(level, "[:cntrl:]\n"); break;
+ case ATOM_PUNCTUATION: iprintf(level, "[:punct:]\n"); break;
+ case ATOM_EVERYTHING: iprintf(level, "any\n"); break;
+ }
+}
+
+void dump_atom(const struct atom *a)
+{
+ dump_atom_indent(a, 0);
+}
+
diff --git a/ast.h b/ast.h
new file mode 100644
index 0000000..4694c33
--- /dev/null
+++ b/ast.h
@@ -0,0 +1,40 @@
+#ifndef AST_H
+#define AST_H
+
+struct repeat {
+ long min, max;
+};
+
+enum {
+ ATOM_ALTERNATION,
+ ATOM_SEQUENCE,
+ ATOM_REPETITION,
+ ATOM_ALPHABETIC,
+ ATOM_NUMERIC,
+ ATOM_UPPERCASE,
+ ATOM_LOWERCASE,
+ ATOM_CONTROL,
+ ATOM_PUNCTUATION,
+ ATOM_EVERYTHING,
+ ATOM_LITERAL,
+};
+
+struct atom {
+ int type;
+ union {
+ struct {
+ struct repeat counts;
+ struct atom *child;
+ } repeat;
+ const struct atom *children[2];
+ const char *literal;
+ } u;
+};
+
+struct atom *mkatom(const struct atom *src);
+void dump_atom(const struct atom *a);
+
+extern struct atom *ast;
+
+#endif
+
diff --git a/lex-pattern.l b/lex-pattern.l
new file mode 100644
index 0000000..d091c74
--- /dev/null
+++ b/lex-pattern.l
@@ -0,0 +1,58 @@
+%{
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "ast.h"
+#include "parse-pattern.h"
+
+static int intlit(long *out, const char *in)
+{
+ errno = 0;
+ *out = strtol(in, NULL, 10);
+ return errno == ERANGE ? T_BADLIT : T_INTEGER;
+}
+
+static int strlit(char **out, const char *in, size_t inlen)
+{
+ in += 1;
+ inlen -= 2;
+
+ size_t len = 0;
+ for (const char *c = in; c < in + inlen; c++, len++) {
+ if (*c == '\"') c++;
+ }
+
+ char *str;
+ if (!(*out = str = malloc(len + 1))) return T_NOMEM;
+
+ for (const char *c = in; c < in + inlen; *str++ = *c++) {
+ if (*c == '\"') c++;
+ }
+ *str = 0;
+
+ return T_STRING;
+}
+
+%}
+
+%option outfile="lex-pattern.c" header-file="lex-pattern.h"
+%option noyywrap never-interactive
+
+%%
+
+\( { return T_LPAREN; }
+\) { return T_RPAREN; }
+, { return T_COMMA; }
+\. { return T_PERIOD; }
+A { return T_ALPHABETIC; }
+N { return T_NUMERIC; }
+U { return T_UPPERCASE; }
+L { return T_LOWERCASE; }
+C { return T_CONTROL; }
+P { return T_PUNCTUATION; }
+E { return T_EVERYTHING; }
+[0-9]+ { return intlit(&yylval.lval, yytext); }
+\"(\"\"|[^"])*\" { return strlit(&yylval.sval, yytext, yyleng); }
+
+%%
diff --git a/parse-pattern.y b/parse-pattern.y
new file mode 100644
index 0000000..e5acd3c
--- /dev/null
+++ b/parse-pattern.y
@@ -0,0 +1,96 @@
+%{
+
+#include <stdio.h>
+
+#include "ast.h"
+#include "lex-pattern.h"
+
+void yyerror(const char *msg)
+{
+ fprintf(stderr, "%s\n", msg);
+}
+
+%}
+
+%output "parse-pattern.c"
+%defines "parse-pattern.h"
+
+%union {
+ long lval;
+ char *sval;
+ struct atom *atom;
+ struct repeat repeat;
+}
+
+%token END 0 "end of file"
+
+%token T_BADLIT
+%token T_NOMEM
+
+%token T_ALPHABETIC
+%token T_NUMERIC
+%token T_UPPERCASE
+%token T_LOWERCASE
+%token T_CONTROL
+%token T_PUNCTUATION
+%token T_EVERYTHING
+
+%token T_LPAREN "("
+%token T_RPAREN ")"
+%token T_COMMA ","
+%token T_PERIOD "."
+%token <lval> T_INTEGER
+%token <sval> T_STRING
+
+%type <atom> alternation
+%type <atom> atom
+%type <atom> input
+%type <atom> sequence
+%type <atom> molecule
+
+%type <repeat> repeat
+
+%%
+
+start:
+ input END { ast = $1; }
+ ;
+
+input:
+ molecule
+ | input molecule { $$ = mkatom(&(struct atom) { .type = ATOM_SEQUENCE, .u = { .children = { $1, $2 } } }); }
+ ;
+
+molecule:
+ repeat sequence { $$ = mkatom(&(struct atom) { .type = ATOM_REPETITION, .u = { .repeat = { .counts = $1, .child = $2 } } }); }
+ ;
+
+repeat:
+ T_INTEGER T_PERIOD T_INTEGER { $$ = (struct repeat) { $1, $3 }; }
+ | T_INTEGER T_PERIOD { $$ = (struct repeat) { $1, -1 }; }
+ | T_PERIOD T_INTEGER { $$ = (struct repeat) { 0, $2 }; }
+ | T_PERIOD { $$ = (struct repeat) { 0, -1 }; }
+ | T_INTEGER { $$ = (struct repeat) { $1, $1 }; }
+ ;
+
+sequence:
+ atom
+ | sequence atom { $$ = mkatom(&(struct atom) { .type = ATOM_SEQUENCE, .u = { .children = { $1, $2 } } }); }
+ ;
+
+atom:
+ T_ALPHABETIC { $$ = mkatom(&(struct atom) { .type = ATOM_ALPHABETIC }); }
+ | T_NUMERIC { $$ = mkatom(&(struct atom) { .type = ATOM_NUMERIC }); }
+ | T_UPPERCASE { $$ = mkatom(&(struct atom) { .type = ATOM_UPPERCASE }); }
+ | T_LOWERCASE { $$ = mkatom(&(struct atom) { .type = ATOM_LOWERCASE }); }
+ | T_CONTROL { $$ = mkatom(&(struct atom) { .type = ATOM_CONTROL }); }
+ | T_PUNCTUATION { $$ = mkatom(&(struct atom) { .type = ATOM_PUNCTUATION }); }
+ | T_EVERYTHING { $$ = mkatom(&(struct atom) { .type = ATOM_EVERYTHING }); }
+ | T_STRING { $$ = mkatom(&(struct atom) { .type = ATOM_LITERAL, .u = { .literal = $1 } }); }
+ | T_LPAREN alternation T_RPAREN { $$ = $2; }
+ ;
+
+alternation:
+ molecule
+ | alternation T_COMMA molecule { $$ = mkatom(&(struct atom) { .type = ATOM_ALTERNATION, .u = { .children = { $1, $3 } } }); }
+ ;
diff --git a/pattern2regex.c b/pattern2regex.c
new file mode 100644
index 0000000..669206a
--- /dev/null
+++ b/pattern2regex.c
@@ -0,0 +1,20 @@
+#include <stdio.h>
+
+#include "ast.h"
+#include "lex-pattern.h"
+#include "parse-pattern.h"
+
+int main(int argc, char **argv)
+{
+ if (argc >= 2) {
+ yyin = fmemopen(argv[1], strlen(argv[1]), "r");
+ } else {
+ yyin = stdin;
+ }
+
+ if (!yyparse()) dump_atom(ast);
+
+ fclose(yyin);
+ return 0;
+}
+