Initial commit

author: Bobby Bingham <koorogi@koorogi.info> 2017-07-23 22:24:32 -0500
committer: Bobby Bingham <koorogi@koorogi.info> 2017-07-24 21:19:10 -0500
commit: c8a0b7157d544f2359f2373160dcc69cdef8f4de (patch)
tree: 2fa9c2cc4646cab3886025a9663289b177ad4c55
7 files changed, 349 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7677c59
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.o
+lex-*.c
+lex-*.h
+parse-*.c
+parse-*.h
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e62c90c
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,31 @@
+OPTFLAGS = -O2
+CFLAGS   = -std=c99 -D_POSIX_C_SOURCE=200809L $(OPTFLAGS)
+
+TARGETS  = pattern2regex
+LEXERS   = $(wildcard *.l)
+PARSERS  = $(wildcard *.y)
+SOURCES  = $(wildcard *.c)
+OBJECTS  = $(sort $(SOURCES:.c=.o) $(LEXERS:.l=.o) $(PARSERS:.y=.o))
+
+all: $(TARGETS)
+
+clean:
+	rm -f $(TARGETS)
+	rm -f $(OBJECTS)
+	rm -f $(LEXERS:.l=.c) $(LEXERS:.l=.h)
+	rm -f $(PARSERS:.y=.c) $(PARSERS:.y=.h)
+
+lex-%.c lex-%.h: lex-%.l
+	flex $<
+
+parse-%.c parse-%.h: parse-%.y
+	bison $<
+
+%.o: %.c
+	$(CC) $(CFLAGS) -c $< -o $@
+
+pattern2regex.o: lex-pattern.h parse-pattern.h
+pattern2regex: pattern2regex.o lex-pattern.o parse-pattern.o ast.o
+
+$(TARGETS):
+	$(CC) $(CFLAGS) $^ -o $@
diff --git a/ast.c b/ast.c
new file mode 100644
index 0000000..d261b9d
--- /dev/null
+++ b/ast.c
@@ -0,0 +1,99 @@
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "ast.h"
+
+struct atom *ast;
+
+struct atom *mkatom(const struct atom *src)
+{
+	struct atom *ret = malloc(sizeof *ret);
+	if (ret) *ret = *src;
+	return ret;
+}
+
+static void ivprintf(int level, const char *fmt, va_list ap)
+{
+	printf("%*s", 4 * level, "");
+	vprintf(fmt, ap);
+}
+
+static void iprintf(int level, const char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	ivprintf(level, fmt, ap);
+	va_end(ap);
+}
+
+static void dump_atom_indent(const struct atom *a, int level);
+
+static void dump_atom_children(const struct atom *a, int level, const char *fmt, ...)
+{
+	va_list ap;
+	va_start(ap, fmt);
+	ivprintf(level, fmt, ap);
+	va_end(ap);
+
+	iprintf(level, "{\n");
+	dump_atom_indent(a->u.children[0], level + 1);
+	dump_atom_indent(a->u.children[1], level + 1);
+	iprintf(level, "}\n");
+}
+
+const char *fmt_repeat(char *buf, const struct repeat *r)
+{
+	if (r->min == r->max)
+		sprintf(buf, "%ld", r->min);
+	else if (r->max < 0)
+		sprintf(buf, "%ld+", r->min);
+	else
+		sprintf(buf, "%ld - %ld", r->min, r->max);
+	return buf;
+}
+
+#define FMT_REPEAT(r) fmt_repeat((char[64]){0}, (r))
+
+static void dump_atom_indent(const struct atom *a, int level)
+{
+	switch (a->type) {
+	case ATOM_ALTERNATION:
+		dump_atom_children(a, level, "alternation\n");
+		break;
+
+	case ATOM_SEQUENCE:
+		dump_atom_indent(a->u.children[0], level);
+		dump_atom_indent(a->u.children[1], level);
+		break;
+
+	case ATOM_REPETITION:
+		if (a->u.repeat.counts.min == 1 && a->u.repeat.counts.max == 1) {
+			dump_atom_indent(a->u.repeat.child, level);
+		} else {
+			iprintf(level, "repeat %s times\n", FMT_REPEAT(&a->u.repeat.counts));
+			iprintf(level, "{\n");
+			dump_atom_indent(a->u.repeat.child, level + 1);
+			iprintf(level, "}\n");
+		}
+		break;
+
+	case ATOM_LITERAL:
+		iprintf(level, "literal: %s\n", a->u.literal);
+		break;
+
+	case ATOM_ALPHABETIC:  iprintf(level, "[:alpha:]\n"); break;
+	case ATOM_NUMERIC:     iprintf(level, "[:digit:]\n"); break;
+	case ATOM_UPPERCASE:   iprintf(level, "[:upper:]\n"); break;
+	case ATOM_LOWERCASE:   iprintf(level, "[:lower:]\n"); break;
+	case ATOM_CONTROL:     iprintf(level, "[:cntrl:]\n"); break;
+	case ATOM_PUNCTUATION: iprintf(level, "[:punct:]\n"); break;
+	case ATOM_EVERYTHING:  iprintf(level, "any\n");       break;
+	}
+}
+
+void dump_atom(const struct atom *a)
+{
+	dump_atom_indent(a, 0);
+}
+
diff --git a/ast.h b/ast.h
new file mode 100644
index 0000000..4694c33
--- /dev/null
+++ b/ast.h
@@ -0,0 +1,40 @@
+#ifndef AST_H
+#define AST_H
+
+struct repeat {
+	long min, max;
+};
+
+enum {
+	ATOM_ALTERNATION,
+	ATOM_SEQUENCE,
+	ATOM_REPETITION,
+	ATOM_ALPHABETIC,
+	ATOM_NUMERIC,
+	ATOM_UPPERCASE,
+	ATOM_LOWERCASE,
+	ATOM_CONTROL,
+	ATOM_PUNCTUATION,
+	ATOM_EVERYTHING,
+	ATOM_LITERAL,
+};
+
+struct atom {
+	int type;
+	union {
+		struct {
+			struct repeat counts;
+			struct atom *child;
+		} repeat;
+		const struct atom *children[2];
+		const char *literal;
+	} u;
+};
+
+struct atom *mkatom(const struct atom *src);
+void dump_atom(const struct atom *a);
+
+extern struct atom *ast;
+
+#endif
+
diff --git a/lex-pattern.l b/lex-pattern.l
new file mode 100644
index 0000000..d091c74
--- /dev/null
+++ b/lex-pattern.l
@@ -0,0 +1,58 @@
+%{
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "ast.h"
+#include "parse-pattern.h"
+
+static int intlit(long *out, const char *in)
+{
+	errno = 0;
+	*out = strtol(in, NULL, 10);
+	return errno == ERANGE ? T_BADLIT : T_INTEGER;
+}
+
+static int strlit(char **out, const char *in, size_t inlen)
+{
+	in    += 1;
+	inlen -= 2;
+
+	size_t len = 0;
+	for (const char *c = in; c < in + inlen; c++, len++) {
+		if (*c == '\"') c++;
+	}
+
+	char *str;
+	if (!(*out = str = malloc(len + 1))) return T_NOMEM;
+
+	for (const char *c = in; c < in + inlen; *str++ = *c++) {
+		if (*c == '\"') c++;
+	}
+	*str = 0;
+
+	return T_STRING;
+}
+
+%}
+
+%option outfile="lex-pattern.c" header-file="lex-pattern.h"
+%option noyywrap never-interactive
+
+%%
+
+\(               { return T_LPAREN;      }
+\)               { return T_RPAREN;      }
+,                { return T_COMMA;       }
+\.               { return T_PERIOD;      }
+A                { return T_ALPHABETIC;  }
+N                { return T_NUMERIC;     }
+U                { return T_UPPERCASE;   }
+L                { return T_LOWERCASE;   }
+C                { return T_CONTROL;     }
+P                { return T_PUNCTUATION; }
+E                { return T_EVERYTHING;  }
+[0-9]+           { return intlit(&yylval.lval, yytext); }
+\"(\"\"|[^"])*\" { return strlit(&yylval.sval, yytext, yyleng); }
+
+%%
diff --git a/parse-pattern.y b/parse-pattern.y
new file mode 100644
index 0000000..e5acd3c
--- /dev/null
+++ b/parse-pattern.y
@@ -0,0 +1,96 @@
+%{
+
+#include <stdio.h>
+
+#include "ast.h"
+#include "lex-pattern.h"
+
+void yyerror(const char *msg)
+{
+	fprintf(stderr, "%s\n", msg);
+}
+
+%}
+
+%output  "parse-pattern.c"
+%defines "parse-pattern.h"
+
+%union {
+	long  lval;
+	char *sval;
+	struct atom  *atom;
+	struct repeat repeat;
+}
+
+%token END 0 "end of file"
+
+%token T_BADLIT
+%token T_NOMEM
+
+%token T_ALPHABETIC
+%token T_NUMERIC
+%token T_UPPERCASE
+%token T_LOWERCASE
+%token T_CONTROL
+%token T_PUNCTUATION
+%token T_EVERYTHING
+
+%token          T_LPAREN  "("
+%token          T_RPAREN  ")"
+%token          T_COMMA   ","
+%token          T_PERIOD  "."
+%token <lval>   T_INTEGER
+%token <sval>   T_STRING
+
+%type  <atom>   alternation
+%type  <atom>   atom
+%type  <atom>   input
+%type  <atom>   sequence
+%type  <atom>   molecule
+
+%type  <repeat> repeat
+
+%%
+
+start:
+		input END                     { ast = $1; }
+	;
+
+input:
+		molecule
+	|	input molecule                { $$ = mkatom(&(struct atom) { .type = ATOM_SEQUENCE,   .u = { .children = { $1, $2 } } }); }
+	;
+
+molecule:
+		repeat sequence               { $$ = mkatom(&(struct atom) { .type = ATOM_REPETITION, .u = { .repeat = { .counts = $1, .child = $2 } } }); }
+	;
+
+repeat:
+		T_INTEGER T_PERIOD T_INTEGER  { $$ = (struct repeat) { $1, $3 }; }
+	|	T_INTEGER T_PERIOD            { $$ = (struct repeat) { $1, -1 }; }
+	|	T_PERIOD T_INTEGER            { $$ = (struct repeat) {  0, $2 }; }
+	|	T_PERIOD                      { $$ = (struct repeat) {  0, -1 }; }
+	|	T_INTEGER                     { $$ = (struct repeat) { $1, $1 }; }
+	;
+
+sequence:
+		atom
+	|	sequence atom                 { $$ = mkatom(&(struct atom) { .type = ATOM_SEQUENCE, .u = { .children = { $1, $2 } } }); }
+	;
+
+atom:
+		T_ALPHABETIC                  { $$ = mkatom(&(struct atom) { .type = ATOM_ALPHABETIC  }); }
+	|	T_NUMERIC                     { $$ = mkatom(&(struct atom) { .type = ATOM_NUMERIC     }); }
+	|	T_UPPERCASE                   { $$ = mkatom(&(struct atom) { .type = ATOM_UPPERCASE   }); }
+	|	T_LOWERCASE                   { $$ = mkatom(&(struct atom) { .type = ATOM_LOWERCASE   }); }
+	|	T_CONTROL                     { $$ = mkatom(&(struct atom) { .type = ATOM_CONTROL     }); }
+	|	T_PUNCTUATION                 { $$ = mkatom(&(struct atom) { .type = ATOM_PUNCTUATION }); }
+	|	T_EVERYTHING                  { $$ = mkatom(&(struct atom) { .type = ATOM_EVERYTHING  }); }
+	|	T_STRING                      { $$ = mkatom(&(struct atom) { .type = ATOM_LITERAL, .u = { .literal = $1 } }); }
+	|	T_LPAREN alternation T_RPAREN { $$ = $2; }
+	;
+
+alternation:
+		molecule
+	|	alternation T_COMMA molecule  { $$ = mkatom(&(struct atom) { .type = ATOM_ALTERNATION, .u = { .children = { $1, $3 } } }); }
+	;
diff --git a/pattern2regex.c b/pattern2regex.c
new file mode 100644
index 0000000..669206a
--- /dev/null
+++ b/pattern2regex.c
@@ -0,0 +1,20 @@
+#include <stdio.h>
+
+#include "ast.h"
+#include "lex-pattern.h"
+#include "parse-pattern.h"
+
+int main(int argc, char **argv)
+{
+	if (argc >= 2) {
+		yyin = fmemopen(argv[1], strlen(argv[1]), "r");
+	} else {
+		yyin = stdin;
+	}
+
+	if (!yyparse()) dump_atom(ast);
+
+	fclose(yyin);
+	return 0;
+}
+
author	Bobby Bingham <koorogi@koorogi.info>	2017-07-23 22:24:32 -0500
committer	Bobby Bingham <koorogi@koorogi.info>	2017-07-24 21:19:10 -0500
commit	c8a0b7157d544f2359f2373160dcc69cdef8f4de (patch)
tree	2fa9c2cc4646cab3886025a9663289b177ad4c55