// Package treerack provides a parser generator for defining and interacting with arbitrary syntaxes.
//
// Treerack allows developers to define grammars — programmatically or via a syntax definition language
// derivative of EBNF — and generate recursive descent parsers. These parsers process input content and produce
// an Abstract Syntax Tree (AST) representation.
//
// The library supports two primary workflows:
//
//  1. Dynamic (Runtime): Loading or defining syntaxes programmatically at runtime to parse input immediately.
//
//  2. Static (Generation): Defining syntaxes during development and generating Go source code to be compiled
//     into the application.
//
// For detailed syntax definition rules and the command-line tool usage, please refer to the repository
// documentation: https://code.squareroundforest.org/arpio/treerack
package treerack
|
|
|
|
import (
|
|
"code.squareroundforest.org/arpio/treerack/internal/self"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
)
|
|
|
|
// SequenceItem represents a single element within a sequence definition, referencing another parser by name.
//
// Cardinality logic for SequenceItem:
//
//   - If Min=0 and Max=0: Matches exactly once (equivalent to Min=1, Max=1).
//   - If Max <= 0: Unbounded upper limit (matches Min or more times).
//   - If Min <= 0: No lower limit (matches 0 to Max times).
type SequenceItem struct {

	// Name is the identifier of the referenced parser definition.
	Name string

	// Min specifies the minimum required occurrences of the item.
	Min int

	// Max specifies the maximum accepted occurrences of the item.
	Max int
}
|
|
|
|
// Syntax represents a complete grammar definition consisting of multiple named parsers.
|
|
//
|
|
// The lifecycle of a Syntax instance consists of three phases:
|
|
//
|
|
// 1. Definition: Define parsers using methods like AnyChar, Sequence, and Choice, or load a definition via
|
|
// ReadSyntax.
|
|
//
|
|
// 2. Initialization: Call Init() to validate definitions, resolve references, and seal the syntax.
|
|
//
|
|
// 3. Execution: Use Parse() to process input or Generate() to create Go source code.
|
|
type Syntax struct {
|
|
registry *registry
|
|
initialized bool
|
|
errInitFailed error
|
|
explicitRoot bool
|
|
keywords []definition
|
|
root definition
|
|
}
|
|
|
|
// GeneratorOptions control the behavior of the Go code generator.
type GeneratorOptions struct {

	// PackageName sets the package name for the generated source file. Defaults to main.
	PackageName string

	// Export determines whether the generated Parse function is exported (public) or unexported (private)
	// within the package.
	Export bool
}
|
|
|
|
// generator is implemented by the parsers and builders that can emit their own Go source code.
//
// It is applied in a non-type-checked way: the code generation asserts the values to this interface at
// runtime instead of requiring it statically, so a parser or builder that does not implement it causes a
// panic during generation.
type generator interface {

	// generate writes the source code of the receiver to the writer. The map is shared between the
	// calls of one generation pass.
	// NOTE(review): the map presumably records which items have already been emitted — confirm against
	// the implementations.
	generate(io.Writer, map[string]bool) error
}
|
|
|
|
type definition interface {
|
|
nodeName() string
|
|
setName(string)
|
|
nodeID() int
|
|
setID(int)
|
|
commitType() CommitType
|
|
setCommitType(CommitType)
|
|
preinit()
|
|
validate(*registry) error
|
|
init(*registry)
|
|
addGeneralization(int)
|
|
parser() parser
|
|
builder() builder
|
|
format(*registry, formatFlags) string
|
|
}
|
|
|
|
// Sentinel errors returned by the Syntax methods. Compare with errors.Is.
var (

	// ErrSyntaxInitialized is returned when attempting to modify a syntax that has already been initialized.
	ErrSyntaxInitialized = errors.New("syntax initialized")

	// ErrNoParsersDefined is returned when attempting to initialize a syntax containing no parser definitions.
	ErrNoParsersDefined = errors.New("no parsers defined")

	// ErrMultipleRoots is returned when a syntax definition contains multiple explicit root parsers.
	ErrMultipleRoots = errors.New("multiple roots")

	// ErrInvalidSymbolName is returned when a named parser is assigned an invalid identifier.
	ErrInvalidSymbolName = errors.New("invalid symbol name")
)
|
|
|
|
func (ct CommitType) String() string {
|
|
switch ct {
|
|
case None:
|
|
return "none"
|
|
case Alias:
|
|
return "alias"
|
|
case Whitespace:
|
|
return "whitespace"
|
|
case NoWhitespace:
|
|
return "no-whitespace"
|
|
case Keyword:
|
|
return "keyword"
|
|
case NoKeyword:
|
|
return "no-keyword"
|
|
case FailPass:
|
|
return "fail-pass"
|
|
case Root:
|
|
return "root"
|
|
default:
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
// duplicateDefinition creates the error reporting that a definition with the provided name has been
// defined more than once.
func duplicateDefinition(name string) error {
	return fmt.Errorf("duplicate definition: %s", name)
}
|
|
|
|
// parserNotFound creates the error reporting that no parser is defined with the provided name.
func parserNotFound(name string) error {
	return fmt.Errorf("parser not found: %s", name)
}
|
|
|
|
// symbolChars lists the whitespace and punctuation characters that isValidSymbol checks every character
// of a name against.
// NOTE(review): despite the name, this appears to be a deny-list — the third matchChar argument passed
// by isValidSymbol looks like the negation flag. Confirm against matchChar.
var symbolChars = []rune("\\ \n\t\b\f\r\v/.[]\"{}^+*?|():=;")
|
|
|
|
func isValidSymbol(n string) bool {
|
|
runes := []rune(n)
|
|
for _, r := range runes {
|
|
if !matchChar(symbolChars, nil, true, r) {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
// intsContain tells whether i is an element of is. It returns false for a nil or empty slice.
func intsContain(is []int, i int) bool {
	for _, ii := range is {
		if ii == i {
			return true
		}
	}

	return false
}
|
|
|
|
var incompatibleCommitTypes = map[CommitType][]CommitType{
|
|
Alias: {Root},
|
|
Whitespace: {Keyword, NoKeyword, FailPass, Root},
|
|
Keyword: {NoKeyword, Root},
|
|
FailPass: {Root},
|
|
}
|
|
|
|
func (s *Syntax) checkCommitType(d definition) error {
|
|
for ct, ict := range incompatibleCommitTypes {
|
|
if d.commitType()&ct == 0 {
|
|
continue
|
|
}
|
|
|
|
for _, cti := range ict {
|
|
if d.commitType()&cti == 0 {
|
|
continue
|
|
}
|
|
|
|
return fmt.Errorf(
|
|
"incompatible commit types in %s: %v and %v",
|
|
d.nodeName(),
|
|
ct,
|
|
cti,
|
|
)
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (s *Syntax) applyRoot(d definition) error {
|
|
explicitRoot := d.commitType()&Root != 0
|
|
if explicitRoot && s.explicitRoot {
|
|
return ErrMultipleRoots
|
|
}
|
|
|
|
if s.root != nil && (explicitRoot || !s.explicitRoot) {
|
|
s.root.setCommitType(s.root.commitType() &^ Root)
|
|
}
|
|
|
|
if explicitRoot || !s.explicitRoot {
|
|
s.root = d
|
|
s.root.setCommitType(s.root.commitType() | Root)
|
|
}
|
|
|
|
if explicitRoot {
|
|
s.explicitRoot = true
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (s *Syntax) register(d definition) error {
|
|
if s.initialized {
|
|
return ErrSyntaxInitialized
|
|
}
|
|
|
|
if s.registry == nil {
|
|
s.registry = newRegistry()
|
|
}
|
|
|
|
if err := s.checkCommitType(d); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := s.applyRoot(d); err != nil {
|
|
return err
|
|
}
|
|
|
|
if d.commitType()&Keyword != 0 {
|
|
s.keywords = append(s.keywords, d)
|
|
}
|
|
|
|
return s.registry.setDefinition(d)
|
|
}
|
|
|
|
func (s *Syntax) anyChar(name string, ct CommitType) error {
|
|
return s.class(name, ct, true, nil, nil)
|
|
}
|
|
|
|
// AnyChar registers a parser that accepts any single character (a wildcard).
|
|
func (s *Syntax) AnyChar(name string, ct CommitType) error {
|
|
if !isValidSymbol(name) {
|
|
return ErrInvalidSymbolName
|
|
}
|
|
|
|
return s.anyChar(name, ct|userDefined)
|
|
}
|
|
|
|
// childName derives the name of the generated child definition at childIndex of the definition called
// name, in the form "name:index".
func childName(name string, childIndex int) string {
	return fmt.Sprintf("%s:%d", name, childIndex)
}
|
|
|
|
func namesToSequenceItems(n []string) []SequenceItem {
|
|
si := make([]SequenceItem, len(n))
|
|
for i := range n {
|
|
si[i] = SequenceItem{Name: n[i]}
|
|
}
|
|
|
|
return si
|
|
}
|
|
|
|
func (s *Syntax) class(name string, ct CommitType, not bool, chars []rune, ranges [][]rune) error {
|
|
cname := childName(name, 0)
|
|
if err := s.register(newChar(cname, not, chars, ranges)); err != nil {
|
|
return err
|
|
}
|
|
|
|
return s.sequence(name, ct, SequenceItem{Name: cname})
|
|
}
|
|
|
|
// Class registers a character class parser, accepting characters defined in the specific list or ranges. If
|
|
// 'not' is true, it matches any character *except* those defined.
|
|
func (s *Syntax) Class(name string, ct CommitType, not bool, chars []rune, ranges [][]rune) error {
|
|
if !isValidSymbol(name) {
|
|
return ErrInvalidSymbolName
|
|
}
|
|
|
|
return s.class(name, ct|userDefined, not, chars, ranges)
|
|
}
|
|
|
|
func (s *Syntax) charSequence(name string, ct CommitType, chars []rune) error {
|
|
var refs []string
|
|
for i, ci := range chars {
|
|
ref := childName(name, i)
|
|
refs = append(refs, ref)
|
|
if err := s.register(newChar(ref, false, []rune{ci}, nil)); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
return s.sequence(name, ct|NoWhitespace, namesToSequenceItems(refs)...)
|
|
}
|
|
|
|
// CharSequence registers a parser that matches a specific string literal (e.g., "foo").
|
|
func (s *Syntax) CharSequence(name string, ct CommitType, chars []rune) error {
|
|
if !isValidSymbol(name) {
|
|
return ErrInvalidSymbolName
|
|
}
|
|
|
|
return s.charSequence(name, ct|userDefined, chars)
|
|
}
|
|
|
|
func (s *Syntax) sequence(name string, ct CommitType, items ...SequenceItem) error {
|
|
return s.register(newSequence(name, ct, items))
|
|
}
|
|
|
|
// Sequence registers a parser that matches a specific order of other named parsers (defined as SequenceItems).
|
|
func (s *Syntax) Sequence(name string, ct CommitType, items ...SequenceItem) error {
|
|
if !isValidSymbol(name) {
|
|
return ErrInvalidSymbolName
|
|
}
|
|
|
|
return s.sequence(name, ct|userDefined, items...)
|
|
}
|
|
|
|
func (s *Syntax) choice(name string, ct CommitType, options ...string) error {
|
|
return s.register(newChoice(name, ct, options))
|
|
}
|
|
|
|
// Choice registers a parser that matches exactly one of the provided named options.
|
|
func (s *Syntax) Choice(name string, ct CommitType, options ...string) error {
|
|
if !isValidSymbol(name) {
|
|
return ErrInvalidSymbolName
|
|
}
|
|
|
|
return s.choice(name, ct|userDefined, options...)
|
|
}
|
|
|
|
// ReadSyntax loads a grammar definition from a reader using the Treerack syntax format.
|
|
func (s *Syntax) ReadSyntax(r io.Reader) error {
|
|
if s.initialized {
|
|
return ErrSyntaxInitialized
|
|
}
|
|
|
|
sn, err := self.Parse(r)
|
|
|
|
var sperr *self.ParseError
|
|
if errors.As(err, &sperr) {
|
|
var perr ParseError
|
|
perr.Input = sperr.Input
|
|
perr.Offset = sperr.Offset
|
|
perr.Line = sperr.Line
|
|
perr.Column = sperr.Column
|
|
perr.Definition = sperr.Definition
|
|
return &perr
|
|
}
|
|
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
n := mapSelfNode(sn)
|
|
return define(s, n)
|
|
}
|
|
|
|
// Init validates, initializes, and seals the syntax. This method must be called exactly once before Parsing or
|
|
// Generating.
|
|
func (s *Syntax) Init() error {
|
|
if s.errInitFailed != nil {
|
|
return s.errInitFailed
|
|
}
|
|
|
|
if s.initialized {
|
|
return nil
|
|
}
|
|
|
|
if s.root == nil {
|
|
return ErrNoParsersDefined
|
|
}
|
|
|
|
if err := s.checkCommitType(s.root); err != nil {
|
|
return err
|
|
}
|
|
|
|
defs := s.registry.definitions
|
|
for i := range defs {
|
|
defs[i].preinit()
|
|
}
|
|
|
|
if hasWhitespace(defs) {
|
|
defs, s.root = applyWhitespace(defs)
|
|
s.registry = newRegistry(defs...)
|
|
}
|
|
|
|
for i := range s.keywords {
|
|
if err := s.keywords[i].validate(s.registry); err != nil {
|
|
s.errInitFailed = err
|
|
return err
|
|
}
|
|
}
|
|
|
|
if err := s.root.validate(s.registry); err != nil {
|
|
s.errInitFailed = err
|
|
return err
|
|
}
|
|
|
|
for i := range s.keywords {
|
|
s.keywords[i].init(s.registry)
|
|
}
|
|
|
|
s.root.init(s.registry)
|
|
s.initialized = true
|
|
return nil
|
|
}
|
|
|
|
func (s *Syntax) keywordParsers() []parser {
|
|
var p []parser
|
|
for _, kw := range s.keywords {
|
|
p = append(p, kw.parser())
|
|
}
|
|
|
|
return p
|
|
}
|
|
|
|
// Generate writes Go source code implementing the parser to the provided writer.
|
|
func (s *Syntax) Generate(o GeneratorOptions, w io.Writer) error {
|
|
if err := s.Init(); err != nil {
|
|
return err
|
|
}
|
|
|
|
if o.PackageName == "" {
|
|
o.PackageName = "main"
|
|
}
|
|
|
|
var err error
|
|
fprintf := func(f string, args ...interface{}) {
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
_, err = fmt.Fprintf(w, f, args...)
|
|
}
|
|
|
|
fprint := func(args ...interface{}) {
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
_, err = fmt.Fprint(w, args...)
|
|
}
|
|
|
|
fprintln := func() {
|
|
fprint("\n")
|
|
}
|
|
|
|
fprint(gendoc)
|
|
fprintln()
|
|
fprintln()
|
|
|
|
fprintf("package %s", o.PackageName)
|
|
fprintln()
|
|
fprintln()
|
|
|
|
// generate headCode with scripts/createhead.go
|
|
hc := headCode
|
|
if o.Export {
|
|
hc = headCodeExported
|
|
}
|
|
|
|
fprint("// head")
|
|
fprintln()
|
|
fprint(hc)
|
|
fprintln()
|
|
fprint("// eo head")
|
|
fprintln()
|
|
fprintln()
|
|
|
|
if o.Export {
|
|
fprint(`func Parse(r io.Reader) (*Node, error) {`)
|
|
} else {
|
|
fprint(`func parse(r io.Reader) (*node, error) {`)
|
|
}
|
|
|
|
fprintln()
|
|
|
|
done := make(map[string]bool)
|
|
for _, p := range s.keywordParsers() {
|
|
if err := p.(generator).generate(w, done); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
fprintln()
|
|
|
|
if err := s.root.parser().(generator).generate(w, done); err != nil {
|
|
return err
|
|
}
|
|
|
|
done = make(map[string]bool)
|
|
if err := s.root.builder().(generator).generate(w, done); err != nil {
|
|
return err
|
|
}
|
|
|
|
fprintln()
|
|
fprintln()
|
|
fprint(`var keywords = []parser{`)
|
|
for i := range s.keywords {
|
|
fprintf(`&p%d, `, s.keywords[i].nodeID())
|
|
}
|
|
fprint(`}`)
|
|
|
|
fprintln()
|
|
fprintln()
|
|
fprintf(`return parseInput(r, &p%d, &b%d, keywords)`, s.root.parser().nodeID(), s.root.builder().nodeID())
|
|
fprintln()
|
|
fprint(`}`)
|
|
fprintln()
|
|
|
|
return nil
|
|
}
|
|
|
|
// Parse reads from the input stream and constructs an AST based on the defined syntax.
|
|
func (s *Syntax) Parse(r io.Reader) (*Node, error) {
|
|
if err := s.Init(); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
return parseInput(r, s.root.parser(), s.root.builder(), s.keywordParsers())
|
|
}
|