From c8a0b7157d544f2359f2373160dcc69cdef8f4de Mon Sep 17 00:00:00 2001 From: Bobby Bingham Date: Sun, 23 Jul 2017 22:24:32 -0500 Subject: Initial commit --- .gitignore | 5 +++ Makefile | 31 ++++++++++++++++++ ast.c | 99 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ast.h | 40 +++++++++++++++++++++++ lex-pattern.l | 58 +++++++++++++++++++++++++++++++++ parse-pattern.y | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ pattern2regex.c | 20 ++++++++++++ 7 files changed, 349 insertions(+) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 ast.c create mode 100644 ast.h create mode 100644 lex-pattern.l create mode 100644 parse-pattern.y create mode 100644 pattern2regex.c diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7677c59 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.o +lex-*.c +lex-*.h +parse-*.c +parse-*.h diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e62c90c --- /dev/null +++ b/Makefile @@ -0,0 +1,31 @@ +OPTFLAGS = -O2 +CFLAGS = -std=c99 -D_POSIX_C_SOURCE=200809L $(OPTFLAGS) + +TARGETS = pattern2regex +LEXERS = $(wildcard *.l) +PARSERS = $(wildcard *.y) +SOURCES = $(wildcard *.c) +OBJECTS = $(sort $(SOURCES:.c=.o) $(LEXERS:.l=.o) $(PARSERS:.y=.o)) + +all: $(TARGETS) + +clean: + rm -f $(TARGETS) + rm -f $(OBJECTS) + rm -f $(LEXERS:.l=.c) $(LEXERS:.l=.h) + rm -f $(PARSERS:.y=.c) $(PARSERS:.y=.h) + +lex-%.c lex-%.h: lex-%.l + flex $< + +parse-%.c parse-%.h: parse-%.y + bison $< + +%.o: %.c + $(CC) $(CFLAGS) -c $< -o $@ + +pattern2regex.o: lex-pattern.h parse-pattern.h +pattern2regex: pattern2regex.o lex-pattern.o parse-pattern.o ast.o + +$(TARGETS): + $(CC) $(CFLAGS) $^ -o $@ diff --git a/ast.c b/ast.c new file mode 100644 index 0000000..d261b9d --- /dev/null +++ b/ast.c @@ -0,0 +1,99 @@ +#include +#include +#include + +#include "ast.h" + +struct atom *ast; + +struct atom *mkatom(const struct atom *src) +{ + struct atom *ret = malloc(sizeof *ret); + if (ret) *ret = *src; + return ret; +} + +static void ivprintf(int level, const char *fmt, va_list ap) +{ + printf("%*s", 4 * level, ""); + vprintf(fmt, ap); +} + +static void iprintf(int level, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + ivprintf(level, fmt, ap); + va_end(ap); +} + +static void dump_atom_indent(const struct atom *a, int level); + +static void dump_atom_children(const struct atom *a, int level, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + ivprintf(level, fmt, ap); + va_end(ap); + + iprintf(level, "{\n"); + dump_atom_indent(a->u.children[0], level + 1); + dump_atom_indent(a->u.children[1], level + 1); + iprintf(level, "}\n"); +} + +const char *fmt_repeat(char *buf, const struct repeat *r) +{ + if (r->min == r->max) + sprintf(buf, "%ld", r->min); + else if (r->max < 0) + sprintf(buf, "%ld+", r->min); + else + sprintf(buf, "%ld - %ld", r->min, r->max); + return buf; +} + +#define FMT_REPEAT(r) fmt_repeat((char[64]){0}, (r)) + +static void dump_atom_indent(const struct atom *a, int level) +{ + switch (a->type) { + case ATOM_ALTERNATION: + dump_atom_children(a, level, "alternation\n"); + break; + + case ATOM_SEQUENCE: + dump_atom_indent(a->u.children[0], level); + dump_atom_indent(a->u.children[1], level); + break; + + case ATOM_REPETITION: + if (a->u.repeat.counts.min == 1 && a->u.repeat.counts.max == 1) { + dump_atom_indent(a->u.repeat.child, level); + } else { + iprintf(level, "repeat %s times\n", FMT_REPEAT(&a->u.repeat.counts)); + iprintf(level, "{\n"); + dump_atom_indent(a->u.repeat.child, level + 1); + iprintf(level, "}\n"); + } + break; + + case ATOM_LITERAL: + iprintf(level, "literal: %s\n", a->u.literal); + break; + + case ATOM_ALPHABETIC: iprintf(level, "[:alpha:]\n"); break; + case ATOM_NUMERIC: iprintf(level, "[:digit:]\n"); break; + case ATOM_UPPERCASE: iprintf(level, "[:upper:]\n"); break; + case ATOM_LOWERCASE: iprintf(level, "[:lower:]\n"); break; + case ATOM_CONTROL: iprintf(level, "[:cntrl:]\n"); break; + case ATOM_PUNCTUATION: iprintf(level, "[:punct:]\n"); break; + case ATOM_EVERYTHING: iprintf(level, "any\n"); break; + } +} + +void dump_atom(const struct atom *a) +{ + dump_atom_indent(a, 0); +} + diff --git a/ast.h b/ast.h new file mode 100644 index 0000000..4694c33 --- /dev/null +++ b/ast.h @@ -0,0 +1,40 @@ +#ifndef AST_H +#define AST_H + +struct repeat { + long min, max; +}; + +enum { + ATOM_ALTERNATION, + ATOM_SEQUENCE, + ATOM_REPETITION, + ATOM_ALPHABETIC, + ATOM_NUMERIC, + ATOM_UPPERCASE, + ATOM_LOWERCASE, + ATOM_CONTROL, + ATOM_PUNCTUATION, + ATOM_EVERYTHING, + ATOM_LITERAL, +}; + +struct atom { + int type; + union { + struct { + struct repeat counts; + struct atom *child; + } repeat; + const struct atom *children[2]; + const char *literal; + } u; +}; + +struct atom *mkatom(const struct atom *src); +void dump_atom(const struct atom *a); + +extern struct atom *ast; + +#endif + diff --git a/lex-pattern.l b/lex-pattern.l new file mode 100644 index 0000000..d091c74 --- /dev/null +++ b/lex-pattern.l @@ -0,0 +1,58 @@ +%{ + +#include +#include + +#include "ast.h" +#include "parse-pattern.h" + +static int intlit(long *out, const char *in) +{ + errno = 0; + *out = strtol(in, NULL, 10); + return errno == ERANGE ? T_BADLIT : T_INTEGER; +} + +static int strlit(char **out, const char *in, size_t inlen) +{ + in += 1; + inlen -= 2; + + size_t len = 0; + for (const char *c = in; c < in + inlen; c++, len++) { + if (*c == '\"') c++; + } + + char *str; + if (!(*out = str = malloc(len + 1))) return T_NOMEM; + + for (const char *c = in; c < in + inlen; *str++ = *c++) { + if (*c == '\"') c++; + } + *str = 0; + + return T_STRING; +} + +%} + +%option outfile="lex-pattern.c" header-file="lex-pattern.h" +%option noyywrap never-interactive + +%% + +\( { return T_LPAREN; } +\) { return T_RPAREN; } +, { return T_COMMA; } +\. { return T_PERIOD; } +A { return T_ALPHABETIC; } +N { return T_NUMERIC; } +U { return T_UPPERCASE; } +L { return T_LOWERCASE; } +C { return T_CONTROL; } +P { return T_PUNCTUATION; } +E { return T_EVERYTHING; } +[0-9]+ { return intlit(&yylval.lval, yytext); } +\"(\"\"|[^"])*\" { return strlit(&yylval.sval, yytext, yyleng); } + +%% diff --git a/parse-pattern.y b/parse-pattern.y new file mode 100644 index 0000000..e5acd3c --- /dev/null +++ b/parse-pattern.y @@ -0,0 +1,96 @@ +%{ + +#include + +#include "ast.h" +#include "lex-pattern.h" + +void yyerror(const char *msg) +{ + fprintf(stderr, "%s\n", msg); +} + +%} + +%output "parse-pattern.c" +%defines "parse-pattern.h" + +%union { + long lval; + char *sval; + struct atom *atom; + struct repeat repeat; +} + +%token END 0 "end of file" + +%token T_BADLIT +%token T_NOMEM + +%token T_ALPHABETIC +%token T_NUMERIC +%token T_UPPERCASE +%token T_LOWERCASE +%token T_CONTROL +%token T_PUNCTUATION +%token T_EVERYTHING + +%token T_LPAREN "(" +%token T_RPAREN ")" +%token T_COMMA "," +%token T_PERIOD "." +%token T_INTEGER +%token T_STRING + +%type alternation +%type atom +%type input +%type sequence +%type molecule + +%type repeat + +%% + +start: + input END { ast = $1; } + ; + +input: + molecule + | input molecule { $$ = mkatom(&(struct atom) { .type = ATOM_SEQUENCE, .u = { .children = { $1, $2 } } }); } + ; + +molecule: + repeat sequence { $$ = mkatom(&(struct atom) { .type = ATOM_REPETITION, .u = { .repeat = { .counts = $1, .child = $2 } } }); } + ; + +repeat: + T_INTEGER T_PERIOD T_INTEGER { $$ = (struct repeat) { $1, $3 }; } + | T_INTEGER T_PERIOD { $$ = (struct repeat) { $1, -1 }; } + | T_PERIOD T_INTEGER { $$ = (struct repeat) { 0, $2 }; } + | T_PERIOD { $$ = (struct repeat) { 0, -1 }; } + | T_INTEGER { $$ = (struct repeat) { $1, $1 }; } + ; + +sequence: + atom + | sequence atom { $$ = mkatom(&(struct atom) { .type = ATOM_SEQUENCE, .u = { .children = { $1, $2 } } }); } + ; + +atom: + T_ALPHABETIC { $$ = mkatom(&(struct atom) { .type = ATOM_ALPHABETIC }); } + | T_NUMERIC { $$ = mkatom(&(struct atom) { .type = ATOM_NUMERIC }); } + | T_UPPERCASE { $$ = mkatom(&(struct atom) { .type = ATOM_UPPERCASE }); } + | T_LOWERCASE { $$ = mkatom(&(struct atom) { .type = ATOM_LOWERCASE }); } + | T_CONTROL { $$ = mkatom(&(struct atom) { .type = ATOM_CONTROL }); } + | T_PUNCTUATION { $$ = mkatom(&(struct atom) { .type = ATOM_PUNCTUATION }); } + | T_EVERYTHING { $$ = mkatom(&(struct atom) { .type = ATOM_EVERYTHING }); } + | T_STRING { $$ = mkatom(&(struct atom) { .type = ATOM_LITERAL, .u = { .literal = $1 } }); } + | T_LPAREN alternation T_RPAREN { $$ = $2; } + ; + +alternation: + molecule + | alternation T_COMMA molecule { $$ = mkatom(&(struct atom) { .type = ATOM_ALTERNATION, .u = { .children = { $1, $3 } } }); } + ; diff --git a/pattern2regex.c b/pattern2regex.c new file mode 100644 index 0000000..669206a --- /dev/null +++ b/pattern2regex.c @@ -0,0 +1,20 @@ +#include + +#include "ast.h" +#include "lex-pattern.h" +#include "parse-pattern.h" + +int main(int argc, char **argv) +{ + if (argc >= 2) { + yyin = fmemopen(argv[1], strlen(argv[1]), "r"); + } else { + yyin = stdin; + } + + if (!yyparse()) dump_atom(ast); + + fclose(yyin); + return 0; +} + -- cgit v1.2.3