1
0
treerack/syntax.go

526 lines
12 KiB
Go
Raw Normal View History

2026-01-18 22:52:27 +01:00
// Package treerack provides a parser generator for defining and interacting with arbitrary syntaxes.
//
// Treerack allows developers to define grammars — programmatically or via a syntax definition language
// derived from EBNF — and generate recursive descent parsers. These parsers process input content and produce
// an Abstract Syntax Tree (AST) representation.
//
// The library supports two primary workflows:
//
// 1. Dynamic (Runtime): Loading or defining syntaxes programmatically at runtime to parse input immediately.
//
// 2. Static (Generation): Defining syntaxes during development and generating Go source code to be compiled
// into the application.
//
// For detailed syntax definition rules and the command-line tool usage, please refer to the repository
// documentation: https://code.squareroundforest.org/arpio/treerack
2017-07-15 21:49:08 +02:00
package treerack
2017-06-25 17:51:08 +02:00
import (
2026-01-16 01:03:43 +01:00
"code.squareroundforest.org/arpio/treerack/internal/self"
2017-06-25 17:51:08 +02:00
"errors"
"fmt"
"io"
)
2026-01-18 22:52:27 +01:00
// SequenceItem describes one element of a sequence definition. It refers to another parser by name and
// states how many times that parser may match at this position of the sequence.
//
// Min and Max are interpreted together:
//
//   - Min=0 and Max=0: the item matches exactly once (equivalent to Min=1, Max=1).
//
//   - Max <= 0: there is no upper limit (the item matches Min or more times).
//
//   - Min <= 0: there is no lower limit (the item matches 0 to Max times).
type SequenceItem struct {
	// Name identifies the parser definition that this item refers to.
	Name string

	// Min is the minimum number of occurrences required for the item.
	Min int

	// Max is the maximum number of occurrences accepted for the item.
	Max int
}
2026-01-18 22:52:27 +01:00
// Syntax represents a complete grammar definition consisting of multiple named parsers.
//
// The lifecycle of a Syntax instance consists of three phases:
//
// 1. Definition: Define parsers using methods like AnyChar, Sequence, and Choice, or load a definition via
// ReadSyntax.
//
// 2. Initialization: Call Init() to validate definitions, resolve references, and seal the syntax.
//
// 3. Execution: Use Parse() to process input or Generate() to create Go source code.
2017-06-25 17:51:08 +02:00
type Syntax struct {
2025-08-20 00:45:32 +02:00
registry *registry
initialized bool
errInitFailed error
explicitRoot bool
keywords []definition
root definition
2017-06-25 17:51:08 +02:00
}
2026-01-18 22:52:27 +01:00
// GeneratorOptions configures the Go source code emitted by Syntax.Generate.
type GeneratorOptions struct {
	// PackageName is the package name written into the generated source file. When empty, main is used.
	PackageName string

	// Export selects whether the generated parse function is exported (Parse) or unexported (parse)
	// within the package.
	Export bool
}
// generator is implemented by the parsers and builders that can emit their own Go source code. The
// map argument is shared between the generate calls of one pass.
//
// The interface is applied in a non-type-checked way: Generate obtains it with a plain type assertion.
type generator interface {
	generate(io.Writer, map[string]bool) error
}
2017-11-01 03:54:53 +01:00
type definition interface {
nodeName() string
2017-11-05 03:28:36 +01:00
setName(string)
2017-11-01 03:54:53 +01:00
nodeID() int
2017-11-05 03:28:36 +01:00
setID(int)
2017-11-01 03:54:53 +01:00
commitType() CommitType
setCommitType(CommitType)
2017-11-04 22:49:42 +01:00
preinit()
2017-11-01 03:54:53 +01:00
validate(*registry) error
init(*registry)
2017-11-02 22:19:03 +01:00
addGeneralization(int)
2017-11-01 03:54:53 +01:00
parser() parser
builder() builder
2017-11-26 01:49:22 +01:00
format(*registry, formatFlags) string
2017-11-01 03:54:53 +01:00
}
2017-06-25 17:51:08 +02:00
// Sentinel errors returned while defining or initializing a syntax.
var (
	// ErrSyntaxInitialized is returned when a syntax is modified after it has been initialized.
	ErrSyntaxInitialized = errors.New("syntax initialized")

	// ErrNoParsersDefined is returned when a syntax without any parser definition is initialized.
	ErrNoParsersDefined = errors.New("no parsers defined")

	// ErrMultipleRoots is returned when more than one explicit root parser is defined in a syntax.
	ErrMultipleRoots = errors.New("multiple roots")

	// ErrInvalidSymbolName is returned when a parser is defined with an invalid identifier as its name.
	ErrInvalidSymbolName = errors.New("invalid symbol name")
)
2019-02-02 18:07:10 +01:00
// String returns the textual name of a single commit type value. Any value that is not exactly one of
// the known commit types, including combinations of multiple flags, is reported as "unknown".
func (ct CommitType) String() string {
	name := "unknown"
	switch ct {
	case None:
		name = "none"
	case Alias:
		name = "alias"
	case Whitespace:
		name = "whitespace"
	case NoWhitespace:
		name = "no-whitespace"
	case Keyword:
		name = "keyword"
	case NoKeyword:
		name = "no-keyword"
	case FailPass:
		name = "fail-pass"
	case Root:
		name = "root"
	}

	return name
}
2017-11-01 03:54:53 +01:00
// duplicateDefinition creates the error reporting that a definition with the given name exists already.
func duplicateDefinition(name string) error {
	const format = "duplicate definition: %s"
	return fmt.Errorf(format, name)
}
// parserNotFound creates the error reporting that no parser is defined with the given name.
func parserNotFound(name string) error {
	const format = "parser not found: %s"
	return fmt.Errorf(format, name)
}
2018-01-06 21:30:07 +01:00
// symbolChars is the set of characters that isValidSymbol checks the runes of a name against: the
// backslash with the whitespace and control characters first, followed by the punctuation characters.
var symbolChars = []rune(
	"\\ \n\t\b\f\r\v" +
		"/.[]\"{}^+*?|():=;",
)
2017-10-31 21:53:09 +01:00
func isValidSymbol(n string) bool {
runes := []rune(n)
for _, r := range runes {
2018-01-06 21:30:07 +01:00
if !matchChar(symbolChars, nil, true, r) {
2017-10-31 21:53:09 +01:00
return false
}
2017-06-25 17:51:08 +02:00
}
2017-10-31 21:53:09 +01:00
return true
2017-06-25 17:51:08 +02:00
}
2017-11-01 03:54:53 +01:00
// intsContain reports whether the value i occurs in the slice is.
func intsContain(is []int, i int) bool {
	for pos := 0; pos < len(is); pos++ {
		if is[pos] == i {
			return true
		}
	}

	return false
}
2019-02-02 18:07:10 +01:00
// incompatibleCommitTypes lists, for a commit type flag, the other flags that must not be set on the
// same definition. It is consulted by checkCommitType.
var incompatibleCommitTypes = map[CommitType][]CommitType{
	Alias: {
		Root,
	},
	Whitespace: {
		Keyword,
		NoKeyword,
		FailPass,
		Root,
	},
	Keyword: {
		NoKeyword,
		Root,
	},
	FailPass: {
		Root,
	},
}
// checkCommitType verifies that the commit type flags of d do not contain a combination listed in
// incompatibleCommitTypes. It returns nil when the flags are compatible, otherwise an error naming
// the definition and the first conflicting pair of flags.
//
// The table is checked in ascending order of its keys, so a definition with more than one conflict
// is always reported with the same error, independent of the randomized map iteration order.
func (s *Syntax) checkCommitType(d definition) error {
	flags := d.commitType()

	// Collect and order the keys of the table. The table is tiny, a simple insertion sort keeps the
	// check deterministic without requiring any additional import.
	keys := make([]CommitType, 0, len(incompatibleCommitTypes))
	for ct := range incompatibleCommitTypes {
		keys = append(keys, ct)
	}

	for i := 1; i < len(keys); i++ {
		for j := i; j > 0 && keys[j] < keys[j-1]; j-- {
			keys[j], keys[j-1] = keys[j-1], keys[j]
		}
	}

	for _, ct := range keys {
		if flags&ct == 0 {
			continue
		}

		for _, cti := range incompatibleCommitTypes[ct] {
			if flags&cti == 0 {
				continue
			}

			return fmt.Errorf(
				"incompatible commit types in %s: %v and %v",
				d.nodeName(),
				ct,
				cti,
			)
		}
	}

	return nil
}
2017-11-01 00:19:29 +01:00
func (s *Syntax) applyRoot(d definition) error {
explicitRoot := d.commitType()&Root != 0
if explicitRoot && s.explicitRoot {
return ErrMultipleRoots
2017-06-25 17:51:08 +02:00
}
2017-11-01 00:19:29 +01:00
if s.root != nil && (explicitRoot || !s.explicitRoot) {
s.root.setCommitType(s.root.commitType() &^ Root)
2017-10-31 21:53:09 +01:00
}
2017-11-01 00:19:29 +01:00
if explicitRoot || !s.explicitRoot {
2017-06-25 17:51:08 +02:00
s.root = d
2017-10-28 22:54:15 +02:00
s.root.setCommitType(s.root.commitType() | Root)
2017-11-01 00:19:29 +01:00
}
if explicitRoot {
2017-07-15 21:49:08 +02:00
s.explicitRoot = true
2017-11-01 00:19:29 +01:00
}
2017-10-28 22:54:15 +02:00
2017-11-01 00:19:29 +01:00
return nil
}
func (s *Syntax) register(d definition) error {
if s.initialized {
return ErrSyntaxInitialized
}
if s.registry == nil {
s.registry = newRegistry()
2017-06-25 17:51:08 +02:00
}
2019-02-02 18:07:10 +01:00
if err := s.checkCommitType(d); err != nil {
return err
}
2017-11-01 00:19:29 +01:00
if err := s.applyRoot(d); err != nil {
return err
}
2017-10-28 22:54:15 +02:00
2019-02-02 18:07:10 +01:00
if d.commitType()&Keyword != 0 {
s.keywords = append(s.keywords, d)
}
2017-06-25 17:51:08 +02:00
return s.registry.setDefinition(d)
}
2017-10-31 21:53:09 +01:00
// anyChar defines the parser name as a character class with the negation flag set and with no
// characters or ranges listed.
func (s *Syntax) anyChar(name string, ct CommitType) error {
	const not = true
	return s.class(name, ct, not, nil, nil)
}
2026-01-18 22:52:27 +01:00
// AnyChar registers a parser that accepts any single character (a wildcard).
2017-06-25 17:51:08 +02:00
func (s *Syntax) AnyChar(name string, ct CommitType) error {
2017-10-31 21:53:09 +01:00
if !isValidSymbol(name) {
return ErrInvalidSymbolName
}
2017-11-25 17:37:05 +01:00
return s.anyChar(name, ct|userDefined)
2017-06-25 17:51:08 +02:00
}
// childName derives the name of an implicitly created child definition from the name of its parent
// and the index of the child, in the form "name:index".
func childName(name string, childIndex int) string {
	const format = "%s:%d"
	return fmt.Sprintf(format, name, childIndex)
}
2017-10-29 16:46:17 +01:00
// namesToSequenceItems wraps every name into a SequenceItem with the Min and Max fields left at zero.
// The order of the names is preserved.
func namesToSequenceItems(n []string) []SequenceItem {
	items := make([]SequenceItem, 0, len(n))
	for _, name := range n {
		items = append(items, SequenceItem{Name: name})
	}

	return items
}
2017-10-31 21:53:09 +01:00
func (s *Syntax) class(name string, ct CommitType, not bool, chars []rune, ranges [][]rune) error {
2017-07-17 01:41:38 +02:00
cname := childName(name, 0)
2017-07-29 16:25:17 +02:00
if err := s.register(newChar(cname, not, chars, ranges)); err != nil {
2017-07-17 01:41:38 +02:00
return err
}
2017-10-31 21:53:09 +01:00
return s.sequence(name, ct, SequenceItem{Name: cname})
2017-07-17 01:41:38 +02:00
}
2026-01-18 22:52:27 +01:00
// Class registers a character class parser, accepting characters defined in the specific list or ranges. If
// 'not' is true, it matches any character *except* those defined.
2017-10-31 21:53:09 +01:00
func (s *Syntax) Class(name string, ct CommitType, not bool, chars []rune, ranges [][]rune) error {
if !isValidSymbol(name) {
return ErrInvalidSymbolName
}
2017-11-25 17:37:05 +01:00
return s.class(name, ct|userDefined, not, chars, ranges)
2017-10-31 21:53:09 +01:00
}
func (s *Syntax) charSequence(name string, ct CommitType, chars []rune) error {
2017-06-25 17:51:08 +02:00
var refs []string
for i, ci := range chars {
ref := childName(name, i)
refs = append(refs, ref)
2017-07-29 16:25:17 +02:00
if err := s.register(newChar(ref, false, []rune{ci}, nil)); err != nil {
2017-06-25 17:51:08 +02:00
return err
}
}
2017-10-31 21:53:09 +01:00
return s.sequence(name, ct|NoWhitespace, namesToSequenceItems(refs)...)
2017-06-25 17:51:08 +02:00
}
2026-01-18 22:52:27 +01:00
// CharSequence registers a parser that matches a specific string literal (e.g., "foo").
2017-10-31 21:53:09 +01:00
func (s *Syntax) CharSequence(name string, ct CommitType, chars []rune) error {
if !isValidSymbol(name) {
return ErrInvalidSymbolName
}
2017-11-25 17:37:05 +01:00
return s.charSequence(name, ct|userDefined, chars)
2017-10-31 21:53:09 +01:00
}
func (s *Syntax) sequence(name string, ct CommitType, items ...SequenceItem) error {
2017-06-25 17:51:08 +02:00
return s.register(newSequence(name, ct, items))
}
2026-01-18 22:52:27 +01:00
// Sequence registers a parser that matches a specific order of other named parsers (defined as SequenceItems).
2017-10-31 21:53:09 +01:00
func (s *Syntax) Sequence(name string, ct CommitType, items ...SequenceItem) error {
if !isValidSymbol(name) {
return ErrInvalidSymbolName
}
2017-11-25 17:37:05 +01:00
return s.sequence(name, ct|userDefined, items...)
2017-10-31 21:53:09 +01:00
}
2017-11-02 22:19:03 +01:00
func (s *Syntax) choice(name string, ct CommitType, options ...string) error {
return s.register(newChoice(name, ct, options))
2017-06-25 17:51:08 +02:00
}
2026-01-18 22:52:27 +01:00
// Choice registers a parser that matches exactly one of the provided named options.
2017-11-02 22:19:03 +01:00
func (s *Syntax) Choice(name string, ct CommitType, options ...string) error {
2017-10-31 21:53:09 +01:00
if !isValidSymbol(name) {
return ErrInvalidSymbolName
}
2017-11-25 17:37:05 +01:00
return s.choice(name, ct|userDefined, options...)
2017-10-31 21:53:09 +01:00
}
2026-01-18 22:52:27 +01:00
// ReadSyntax loads a grammar definition from a reader using the Treerack syntax format.
2018-01-04 18:36:59 +01:00
func (s *Syntax) ReadSyntax(r io.Reader) error {
2017-06-25 17:51:08 +02:00
if s.initialized {
return ErrSyntaxInitialized
}
2018-01-05 19:06:10 +01:00
sn, err := self.Parse(r)
2026-01-15 23:33:40 +01:00
var sperr *self.ParseError
if errors.As(err, &sperr) {
var perr ParseError
perr.Input = sperr.Input
perr.Offset = sperr.Offset
perr.Line = sperr.Line
perr.Column = sperr.Column
perr.Definition = sperr.Definition
return &perr
}
2018-01-04 18:36:59 +01:00
if err != nil {
return err
}
2018-01-05 19:06:10 +01:00
n := mapSelfNode(sn)
2018-01-04 18:36:59 +01:00
return define(s, n)
2017-06-25 17:51:08 +02:00
}
2026-01-18 22:52:27 +01:00
// Init validates, initializes, and seals the syntax. This method must be called exactly once before Parsing or
// Generating.
2017-06-25 17:51:08 +02:00
func (s *Syntax) Init() error {
2025-08-20 00:45:32 +02:00
if s.errInitFailed != nil {
return s.errInitFailed
2017-06-25 17:51:08 +02:00
}
if s.initialized {
return nil
}
if s.root == nil {
return ErrNoParsersDefined
}
2019-02-02 18:07:10 +01:00
if err := s.checkCommitType(s.root); err != nil {
return err
2017-12-31 16:14:56 +01:00
}
2018-01-09 03:53:20 +01:00
defs := s.registry.definitions
2019-02-02 18:07:10 +01:00
for i := range defs {
2017-11-04 22:49:42 +01:00
defs[i].preinit()
}
2017-10-28 22:54:15 +02:00
2017-11-01 02:43:46 +01:00
if hasWhitespace(defs) {
defs, s.root = applyWhitespace(defs)
s.registry = newRegistry(defs...)
2017-08-06 20:43:52 +02:00
}
2019-02-02 18:07:10 +01:00
for i := range s.keywords {
if err := s.keywords[i].validate(s.registry); err != nil {
2025-08-20 00:45:32 +02:00
s.errInitFailed = err
2019-02-02 18:07:10 +01:00
return err
}
}
2017-11-01 02:43:46 +01:00
if err := s.root.validate(s.registry); err != nil {
2025-08-20 00:45:32 +02:00
s.errInitFailed = err
2017-06-25 17:51:08 +02:00
return err
}
2019-02-02 18:07:10 +01:00
for i := range s.keywords {
s.keywords[i].init(s.registry)
}
2017-11-01 02:43:46 +01:00
2019-02-02 18:07:10 +01:00
s.root.init(s.registry)
2017-06-25 17:51:08 +02:00
s.initialized = true
return nil
}
2019-02-02 21:27:01 +01:00
// keywordParsers returns the parsers of the keyword definitions, in the order of their registration.
// The result is nil when the syntax has no keywords.
func (s *Syntax) keywordParsers() []parser {
	var parsers []parser
	for i := range s.keywords {
		parsers = append(parsers, s.keywords[i].parser())
	}

	return parsers
}
2026-01-18 22:52:27 +01:00
// Generate writes Go source code implementing the parser to the provided writer.
2018-01-05 19:06:10 +01:00
func (s *Syntax) Generate(o GeneratorOptions, w io.Writer) error {
2017-06-25 17:51:08 +02:00
if err := s.Init(); err != nil {
return err
}
2018-01-05 19:06:10 +01:00
if o.PackageName == "" {
o.PackageName = "main"
}
2018-01-04 18:36:59 +01:00
2018-01-05 19:06:10 +01:00
var err error
fprintf := func(f string, args ...interface{}) {
if err != nil {
return
}
2018-01-04 18:36:59 +01:00
2018-01-05 19:06:10 +01:00
_, err = fmt.Fprintf(w, f, args...)
2018-01-04 18:36:59 +01:00
}
2018-01-05 19:06:10 +01:00
fprint := func(args ...interface{}) {
if err != nil {
return
}
_, err = fmt.Fprint(w, args...)
2018-01-04 18:36:59 +01:00
}
2018-01-05 19:06:10 +01:00
fprintln := func() {
fprint("\n")
2018-01-04 18:36:59 +01:00
}
2018-01-05 19:06:10 +01:00
fprint(gendoc)
fprintln()
fprintln()
2018-01-04 18:36:59 +01:00
2018-01-05 19:06:10 +01:00
fprintf("package %s", o.PackageName)
fprintln()
fprintln()
2018-01-04 18:36:59 +01:00
2018-01-05 19:06:10 +01:00
// generate headCode with scripts/createhead.go
2025-08-20 03:22:39 +02:00
hc := headCode
if o.Export {
hc = headCodeExported
}
2025-08-20 03:44:05 +02:00
fprint("// head")
fprintln()
2025-08-20 03:22:39 +02:00
fprint(hc)
2018-01-05 19:06:10 +01:00
fprintln()
2025-08-20 03:44:05 +02:00
fprint("// eo head")
fprintln()
2018-01-05 19:06:10 +01:00
fprintln()
2018-01-04 18:36:59 +01:00
2018-01-07 01:45:56 +01:00
if o.Export {
fprint(`func Parse(r io.Reader) (*Node, error) {`)
} else {
2025-08-20 03:30:46 +02:00
fprint(`func parse(r io.Reader) (*node, error) {`)
2018-01-07 01:45:56 +01:00
}
2018-01-05 19:06:10 +01:00
fprintln()
2018-01-04 18:36:59 +01:00
2018-01-05 19:06:10 +01:00
done := make(map[string]bool)
2019-02-02 21:27:01 +01:00
for _, p := range s.keywordParsers() {
if err := p.(generator).generate(w, done); err != nil {
return err
}
}
fprintln()
2019-02-02 18:07:10 +01:00
if err := s.root.parser().(generator).generate(w, done); err != nil {
2018-01-05 19:06:10 +01:00
return err
}
2018-01-04 18:36:59 +01:00
2018-01-05 19:06:10 +01:00
done = make(map[string]bool)
2019-02-02 18:07:10 +01:00
if err := s.root.builder().(generator).generate(w, done); err != nil {
2018-01-04 18:36:59 +01:00
return err
}
2018-01-05 19:06:10 +01:00
fprintln()
fprintln()
2019-02-02 18:07:10 +01:00
fprint(`var keywords = []parser{`)
for i := range s.keywords {
fprintf(`&p%d, `, s.keywords[i].nodeID())
}
fprint(`}`)
fprintln()
fprintln()
fprintf(`return parseInput(r, &p%d, &b%d, keywords)`, s.root.parser().nodeID(), s.root.builder().nodeID())
2018-01-05 19:06:10 +01:00
fprintln()
fprint(`}`)
fprintln()
2018-01-04 18:36:59 +01:00
return nil
2017-06-25 17:51:08 +02:00
}
2026-01-18 22:52:27 +01:00
// Parse reads from the input stream and constructs an AST based on the defined syntax.
2017-06-25 17:51:08 +02:00
func (s *Syntax) Parse(r io.Reader) (*Node, error) {
if err := s.Init(); err != nil {
return nil, err
}
2019-02-02 18:07:10 +01:00
return parseInput(r, s.root.parser(), s.root.builder(), s.keywordParsers())
2017-06-25 17:51:08 +02:00
}