add whitespace support

2017-10-28 22:54:15 +02:00 · 2017-10-28 22:54:15 +02:00 · 5fd3d6b7ba
commit 5fd3d6b7ba
parent 99246ff28b
10 changed files with 498 additions and 146 deletions
--- a/char.go
+++ b/char.go
@ -24,9 +24,11 @@ func newChar(
 }
 // Accessors and no-op hooks of charParser. The name and the id are plain
 // fields that can be read and overwritten. The commit type is fixed:
 // commitType always reports Alias and setCommitType ignores its argument.
 // validate and normalize do no work for a char parser and always return nil.
 func (p *charParser) nodeName() string                  { return p.name }
 func (p *charParser) setNodeName(n string)              { p.name = n }
 func (p *charParser) nodeID() int                       { return p.id }
 func (p *charParser) setID(id int)                      { p.id = id }
 func (p *charParser) commitType() CommitType            { return Alias }
 func (p *charParser) setCommitType(ct CommitType)       {}
 func (p *charParser) validate(*registry, *idSet) error  { return nil }
 func (p *charParser) normalize(*registry, *idSet) error { return nil }
--- a/choice.go
+++ b/choice.go
@ -33,10 +33,12 @@ func newChoice(name string, ct CommitType, elements []string) *choiceDefinition
 	}
 }
-func (d *choiceDefinition) nodeName() string       { return d.name }
+func (d *choiceDefinition) nodeName() string            { return d.name }
-func (d *choiceDefinition) nodeID() int            { return d.id }
+func (d *choiceDefinition) setNodeName(n string)        { d.name = n }
-func (d *choiceDefinition) setID(id int)           { d.id = id }
+func (d *choiceDefinition) nodeID() int                 { return d.id }
-func (d *choiceDefinition) commitType() CommitType { return d.commit }
+func (d *choiceDefinition) setID(id int)                { d.id = id }
 func (d *choiceDefinition) commitType() CommitType      { return d.commit }
 func (d *choiceDefinition) setCommitType(ct CommitType) { d.commit = ct }
 func (d *choiceDefinition) validate(r *registry, path *idSet) error {
 	for i := range d.elements {
--- a/define.go
+++ b/define.go
@ -83,6 +83,10 @@ func flagsToCommitType(n []*Node) CommitType {
 		switch ni.Name {
 		case "alias":
 			ct |= Alias
 		case "ws":
 			ct |= Whitespace
 		case "nows":
 			ct |= NoWhitespace
 		case "doc":
 			ct |= Documentation
 		case "root":
--- a/notes.txt
+++ b/notes.txt
@ -1,14 +1,3 @@
 [whitespace]
 1. merge whitespaces
 2. set ws to alias
 3. apply whitespace to expressions
 - a a -> a ws* a
 - a | b -> a | b
 - a? -> a{0, 1} -> a{0, 1}
 - a+ -> a{1,} -> a (ws* a){,}
 - a* -> a{0,} -> (a (ws* a){,}){,}
 - root -> ws* root ws*
 error reporting
 - longest parse
 - count the lines
@ -19,9 +8,10 @@ read, with error reporting
 what was the bug with the large json from eskip?
 [next]
-optimization
+missing tests, coverage:
-why normalization failed
+- validation
-why normalization was slower?
+- error cases
 - whitespace cases
 error reporting
 coverage
 custom tokens
--- a/parse.go
+++ b/parse.go
@ -4,8 +4,10 @@ import "fmt"
 type definition interface {
 	nodeName() string
 	setNodeName(string)
 	nodeID() int
 	commitType() CommitType
 	setCommitType(CommitType)
 	setID(int)
 	validate(*registry, *idSet) error
 	normalize(*registry, *idSet) error
--- a/sequence.go
+++ b/sequence.go
@ -39,10 +39,12 @@ func newSequence(name string, ct CommitType, items []SequenceItem) *sequenceDefi
 	}
 }
-func (d *sequenceDefinition) nodeName() string       { return d.name }
+func (d *sequenceDefinition) nodeName() string            { return d.name }
-func (d *sequenceDefinition) nodeID() int            { return d.id }
+func (d *sequenceDefinition) setNodeName(n string)        { d.name = n }
-func (d *sequenceDefinition) setID(id int)           { d.id = id }
+func (d *sequenceDefinition) nodeID() int                 { return d.id }
-func (d *sequenceDefinition) commitType() CommitType { return d.commit }
+func (d *sequenceDefinition) setID(id int)                { d.id = id }
 func (d *sequenceDefinition) commitType() CommitType      { return d.commit }
 func (d *sequenceDefinition) setCommitType(ct CommitType) { d.commit = ct }
 func (d *sequenceDefinition) validate(r *registry, path *idSet) error {
 	for i := range d.items {
@ -153,7 +155,6 @@ func (d *sequenceDefinition) setIncludedBy(r *registry, includedBy int, parsers
 }
 func (d *sequenceDefinition) parser(r *registry, parsers *idSet) (parser, error) {
 	// TODO: what is this for? test with sequence containing a sequence through a choice
 	if parsers.has(d.id) {
 		panic(cannotIncludeParsers(d.name))
 	}
--- a/syntax.go
+++ b/syntax.go
@ -12,6 +12,8 @@ type CommitType int
 // CommitType flags. None is the empty value; every other constant is a
 // single bit, so the flags can be combined with | and tested with &.
 //
 // Note that iota counts the specs from the start of the const block, so it
 // is already 1 on the Alias line: Alias == 2, Whitespace == 4,
 // NoWhitespace == 8, Documentation == 16 and Root == 32. Inserting or
 // reordering lines in this block changes these numeric values.
 const (
 	None  CommitType = 0
 	Alias CommitType = 1 << iota
 	Whitespace
 	NoWhitespace
 	Documentation
 	Root
 )
@ -42,7 +44,9 @@ var (
 	ErrUnexpectedCharacter     = errors.New("unexpected character")
 	ErrInvalidSyntax           = errors.New("invalid syntax")
 	ErrRootAlias               = errors.New("root node cannot be an alias")
 	ErrRootWhitespace          = errors.New("root node cannot be a whitespace")
 	ErrNotImplemented          = errors.New("not implemented")
 	ErrMultipleRoots           = errors.New("multiple roots")
 )
 func duplicateDefinition(name string) error {
@ -70,12 +74,29 @@ func (s *Syntax) register(d definition) error {
 	}
 	if d.commitType()&Root != 0 {
 		if s.explicitRoot {
 			return ErrMultipleRoots
 		}
 		if s.root != nil {
 			s.root.setCommitType(s.root.commitType() &^ Root)
 		}
 		s.root = d
 		s.root.setCommitType(s.root.commitType() | Root)
 		s.explicitRoot = true
 	} else if !s.explicitRoot {
 		if s.root != nil {
 			s.root.setCommitType(s.root.commitType() &^ Root)
 		}
 		s.root = d
 		s.root.setCommitType(s.root.commitType() | Root)
 	}
 	// TODO: verify that definition names match the symbol criteria, or figure a better naming for the
 	// whitespace
 	return s.registry.setDefinition(d)
 }
@ -144,6 +165,19 @@ func (s *Syntax) Init() error {
 		return ErrRootAlias
 	}
 	if s.root.commitType()&Whitespace != 0 {
 		return ErrRootWhitespace
 	}
 	s.registry = initWhitespace(s.registry)
 	for _, def := range s.registry.definitions {
 		if def.commitType()&Root != 0 {
 			s.root = def
 			break
 		}
 	}
 	if err := s.root.validate(s.registry, &idSet{}); err != nil {
 		return err
 	}
--- a/syntax.parser
+++ b/syntax.parser
@ -1,5 +1,5 @@
-ws:alias  = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v";
+wschar:alias  = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v";
-wsc:alias = ws | comment;
+wsc:alias     = wschar | comment;
 block-comment:alias   = "/*" ("*" [^/] | [^*])* "*/";
 line-comment:alias    = "//" [^\n]*;
@ -57,9 +57,11 @@ expression:alias = terminal
                 | choice;
 alias      = "alias";
 ws         = "ws";
 nows       = "nows";
 doc        = "doc";
 root       = "root";
-flag:alias = alias | doc | root;
+flag:alias = alias | ws | nows | doc | root;
 definition = symbol (":" flag)* wsc* "=" wsc* expression;
 definitions:alias = definition (wsc* ";" (wsc | ";")* definition)*;
--- a/whitespace.go
+++ b/whitespace.go
@ -0,0 +1,172 @@
 package treerack
 import "fmt"
 const whitespaceName = ":ws"
 // brokenRegistryError decorates err with a "broken registry" prefix and
 // returns the result as a new error value.
 func brokenRegistryError(err error) error {
 	const format = "broken registry: %v"
 	return fmt.Errorf(format, err)
 }
 // splitWhitespaceDefs partitions the definitions in all into those flagged
 // with Whitespace (first result) and the remaining ones (second result).
 // Every whitespace definition additionally gets the Alias flag set in place.
 // The order of both results follows the map iteration and is therefore not
 // stable between calls.
 func splitWhitespaceDefs(all map[string]definition) ([]definition, []definition) {
 	var (
 		ws    []definition
 		other []definition
 	)
 	for _, d := range all {
 		ct := d.commitType()
 		if ct&Whitespace == 0 {
 			other = append(other, d)
 			continue
 		}
 		d.setCommitType(ct | Alias)
 		ws = append(ws, d)
 	}
 	return ws, other
 }
 // splitRoot separates the definition flagged with Root from the others. The
 // first result is the root, or nil when no definition carries the flag (if
 // several do, the last one wins and the earlier ones are dropped); the second
 // result holds the non-root definitions in their original order.
 func splitRoot(defs []definition) (definition, []definition) {
 	var root definition
 	var others []definition
 	for i := range defs {
 		if d := defs[i]; d.commitType()&Root == 0 {
 			others = append(others, d)
 		} else {
 			root = d
 		}
 	}
 	return root, others
 }
 // mergeWhitespaceDefs builds the alias choice named whitespaceName whose
 // elements are the names of the definitions in ws, in the order given.
 func mergeWhitespaceDefs(ws []definition) definition {
 	var elements []string
 	for i := range ws {
 		elements = append(elements, ws[i].nodeName())
 	}
 	merged := newChoice(whitespaceName, Alias, elements)
 	return merged
 }
 // applyWhitespaceToSeq returns the definitions that replace the sequence s
 // once optional whitespace is accepted between its items.
 //
 // Between every two items of s an item referencing whitespaceName with Min 0
 // and Max -1 is inserted. Items whose Max is 0 or 1 are kept unchanged. Any
 // other item x is rewritten so that whitespace is also accepted between its
 // repetitions:
 //
 //   - x:wsrest is an alias sequence of whitespace{0,-1} followed by x{1,1};
 //   - when Min > 0, the item becomes x{1,1} x:wsrest{Min-1, Max-1};
 //   - otherwise the item becomes x:wsopt{0,1}, where x:wsopt is an alias
 //     sequence of x{1,1} x:wsrest{0, Max-1}.
 //
 // A negative Max keeps the upper bound of x:wsrest at -1, the value this
 // function also uses for the unlimited whitespace items.
 // NOTE(review): confirm the exact meaning of Max -1 against SequenceItem.
 //
 // The result lists the generated helper definitions followed by a new
 // sequence with the name and commit type of s; s itself is left untouched.
 //
 // NOTE(review): the helper names depend only on the item name, so an item
 // that is repeated in more than one place yields helpers with equal names.
 //
 // TODO: validate min and max
 func applyWhitespaceToSeq(s *sequenceDefinition) []definition {
 	var (
 		defs  []definition
 		items []SequenceItem
 	)
 	for i, item := range s.items {
 		if i > 0 {
 			items = append(items, SequenceItem{Name: whitespaceName, Min: 0, Max: -1})
 		}
 		// At most one occurrence: there is no gap inside the item where
 		// whitespace could appear.
 		if item.Max >= 0 && item.Max <= 1 {
 			items = append(items, item)
 			continue
 		}
 		singleItem := SequenceItem{Name: item.Name, Min: 1, Max: 1}
 		restName := item.Name + ":wsrest"
 		restDef := newSequence(restName, Alias, []SequenceItem{{Name: whitespaceName, Min: 0, Max: -1}, singleItem})
 		defs = append(defs, restDef)
 		// The first occurrence is matched by singleItem, hence both bounds
 		// of the remaining repetitions are reduced by one.
 		restItems := SequenceItem{Name: restName, Min: 0, Max: -1}
 		if item.Min > 0 {
 			restItems.Min = item.Min - 1
 		}
 		if item.Max > 0 {
 			// This used to overwrite Min, which dropped the upper bound
 			// and raised the lower bound to Max-1.
 			restItems.Max = item.Max - 1
 		}
 		if item.Min > 0 {
 			items = append(items, singleItem, restItems)
 			continue
 		}
 		// Zero occurrences are allowed: wrap the whole expansion into an
 		// optional helper sequence.
 		optName := item.Name + ":wsopt"
 		optDef := newSequence(optName, Alias, []SequenceItem{singleItem, restItems})
 		defs = append(defs, optDef)
 		items = append(items, SequenceItem{Name: optName, Min: 0, Max: 1})
 	}
 	s = newSequence(s.nodeName(), s.commitType(), items)
 	defs = append(defs, s)
 	return defs
 }
 // applyWhitespace returns defs with every sequence definition replaced by
 // the result of applyWhitespaceToSeq. Definitions flagged with NoWhitespace
 // and definitions that are not sequences are passed through unchanged.
 func applyWhitespace(defs []definition) []definition {
 	var patched []definition
 	for _, d := range defs {
 		switch seq, isSeq := d.(*sequenceDefinition); {
 		case d.commitType()&NoWhitespace != 0, !isSeq:
 			patched = append(patched, d)
 		default:
 			patched = append(patched, applyWhitespaceToSeq(seq)...)
 		}
 	}
 	return patched
 }
 // applyWhitespaceRoot wraps the root definition so that whitespace is
 // accepted before and after it. The passed in definition is modified in
 // place: it is renamed to ":wsroot:" + its name, loses the Root flag and
 // gains the Alias flag. A new Root sequence takes over the original name and
 // consists of whitespace{0,-1}, the renamed definition{1,1} and
 // whitespace{0,-1}.
 //
 // It returns the renamed original first and the new root second.
 func applyWhitespaceRoot(root definition) (definition, definition) {
 	rootName := root.nodeName()
 	innerName := ":wsroot:" + rootName
 	root.setNodeName(innerName)
 	root.setCommitType(root.commitType() &^ Root)
 	root.setCommitType(root.commitType() | Alias)
 	anyWS := SequenceItem{Name: whitespaceName, Min: 0, Max: -1}
 	inner := SequenceItem{Name: innerName, Min: 1, Max: 1}
 	wrapper := newSequence(rootName, Root, []SequenceItem{anyWS, inner, anyWS})
 	return root, wrapper
 }
 // registerPatched stores defs in r, in the order given. A definition that
 // the registry refuses is treated as a programming error: the function
 // panics with the refusal wrapped by brokenRegistryError.
 func registerPatched(r *registry, defs ...definition) {
 	for i := range defs {
 		err := r.setDefinition(defs[i])
 		if err == nil {
 			continue
 		}
 		panic(brokenRegistryError(err))
 	}
 }
 // initWhitespace applies the whitespace definitions of r to the rest of the
 // syntax. When r has no definition flagged as whitespace, r is returned as
 // is. Otherwise a new registry is built that contains, in this order:
 //
 //   - the choice merging all whitespace definitions,
 //   - the whitespace definitions themselves,
 //   - the non-root definitions, with whitespace applied to the sequences,
 //   - the renamed original root and the new root wrapping it.
 //
 // The definitions taken over from r are modified in place by the helper
 // steps. A failing registration panics (see registerPatched).
 func initWhitespace(r *registry) *registry {
 	wsDefs, others := splitWhitespaceDefs(r.definitions)
 	if len(wsDefs) == 0 {
 		return r
 	}
 	merged := mergeWhitespaceDefs(wsDefs)
 	root, rest := splitRoot(applyWhitespace(others))
 	innerRoot, outerRoot := applyWhitespaceRoot(root)
 	patched := newRegistry()
 	registerPatched(patched, merged)
 	registerPatched(patched, wsDefs...)
 	registerPatched(patched, rest...)
 	registerPatched(patched, innerRoot, outerRoot)
 	return patched
 }
--- a/whitespace_test.go
+++ b/whitespace_test.go
@ -4,137 +4,280 @@ import "testing"
 const (
 	csvWithoutWhitespaceSupport = `
-		ws:alias = [ \t]*;
+		ws:alias        = [ \t];
 		word-char:alias = [^\n, \t];
 		cell            = (word-char (ws* word-char)*)?;
 		rest-cell:alias = "," ws* cell;
 		line            = cell ws* (rest-cell (ws* rest-cell)*)?;
 		rest-line:alias = "\n" ws* line;
 		document        = ws* (line ws* (rest-line (ws* rest-line)*)?)? ws*;
 	`
 	csvWithWhitespaceSupport = `
 		ws:ws    = [ \t];
 		cell     = [^\n, \t]*;
-		line     = ws cell (ws "," ws cell)* ws;
+		line     = cell ("," cell)*;
 		document = (line ("\n" line)*)?;
 	`
 )
 func TestCSVWhitespace(t *testing.T) {
-	t.Run("wihout whitespace support", func(t *testing.T) {
+	tests := []testItem{{
 		title: "empty",
 		node: &Node{
 			Name: "document",
 		},
 	}, {
 		title:          "only a cell",
 		text:           "abc",
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 			Nodes: []*Node{{
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}},
 			}},
 		},
 	}, {
 		title:          "single line",
 		text:           `a, b, c`,
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 			Nodes: []*Node{{
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}},
 			}},
 		},
 	}, {
 		title: "regular csv",
 		text: `a, b, c
 			       d, e, f`,
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 			Nodes: []*Node{{
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}},
 			}, {
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}},
 			}},
 		},
 	}, {
 		title: "irregular csv",
 		text: `a,, b, c, 
 			       d, ,,,,`,
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 			Nodes: []*Node{{
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}},
 			}, {
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}},
 			}},
 		},
 	}, {
 		title:          "too many commas",
 		text:           `a,,`,
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 			Nodes: []*Node{{
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}},
 			}},
 		},
 	}, {
 		title:          "csv with tabs",
 		text:           "a,\tb, c",
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 			Nodes: []*Node{{
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}, {
 					Name: "cell",
 				}},
 			}},
 		},
 	}, {
 		title: "whitespace between lines",
 		text:  " a, b, c \n d, e, f ",
 		node: &Node{
 			Name: "document",
 			To:   19,
 			Nodes: []*Node{{
 				Name: "line",
 				From: 1,
 				To:   8,
 				Nodes: []*Node{{
 					Name: "cell",
 					From: 1,
 					To:   2,
 				}, {
 					Name: "cell",
 					From: 4,
 					To:   5,
 				}, {
 					Name: "cell",
 					From: 7,
 					To:   8,
 				}},
 			}, {
 				Name: "line",
 				From: 11,
 				To:   18,
 				Nodes: []*Node{{
 					Name: "cell",
 					From: 11,
 					To:   12,
 				}, {
 					Name: "cell",
 					From: 14,
 					To:   15,
 				}, {
 					Name: "cell",
 					From: 17,
 					To:   18,
 				}},
 			}},
 		},
 	}, {
 		title:          "just a space",
 		text:           " ",
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 		},
 	}, {
 		title: "cell with spaces in it",
 		text:  "cell content 1/1, cell content 1/2\ncell content 2/1, cell content 2/2",
 		node: &Node{
 			Name: "document",
 			To:   69,
 			Nodes: []*Node{{
 				Name: "line",
 				To:   34,
 				Nodes: []*Node{{
 					Name: "cell",
 					To:   16,
 				}, {
 					Name: "cell",
 					From: 18,
 					To:   34,
 				}},
 			}, {
 				Name: "line",
 				From: 35,
 				To:   69,
 				Nodes: []*Node{{
 					Name: "cell",
 					From: 35,
 					To:   51,
 				}, {
 					Name: "cell",
 					From: 53,
 					To:   69,
 				}},
 			}},
 		},
 	}, {
 		title:          "multiple empty lines",
 		text:           "\n\n",
 		ignorePosition: true,
 		node: &Node{
 			Name: "document",
 			Nodes: []*Node{{
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}},
 			}, {
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}},
 			}, {
 				Name: "line",
 				Nodes: []*Node{{
 					Name: "cell",
 				}},
 			}},
 		},
 	}}
 	t.Run("without whitespace support", func(t *testing.T) {
 		s, err := openSyntaxString(csvWithoutWhitespaceSupport)
 		if err != nil {
 			t.Error(err)
 			return
 		}
-		runTestsSyntax(t, s, []testItem{{
+		runTestsSyntax(t, s, tests)
-			title: "empty",
+	})
-			node: &Node{
+
-				Name: "document",
+	t.Run("with whitespace support", func(t *testing.T) {
-			},
+		s, err := openSyntaxString(csvWithWhitespaceSupport)
-		}, {
+		if err != nil {
-			title:          "only a cell",
+			t.Error(err)
-			text:           "abc",
+			return
-			ignorePosition: true,
+		}
-			node: &Node{
+
-				Name: "document",
+		runTestsSyntax(t, s, tests)
 				Nodes: []*Node{{
 					Name: "line",
 					Nodes: []*Node{{
 						Name: "cell",
 					}},
 				}},
 			},
 		}, {
 			title:          "single line",
 			text:           `a, b, c`,
 			ignorePosition: true,
 			node: &Node{
 				Name: "document",
 				Nodes: []*Node{{
 					Name: "line",
 					Nodes: []*Node{{
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}},
 				}},
 			},
 		}, {
 			title: "regular csv",
 			text: `a, b, c
 			       d, e, f`,
 			ignorePosition: true,
 			node: &Node{
 				Name: "document",
 				Nodes: []*Node{{
 					Name: "line",
 					Nodes: []*Node{{
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}},
 				}, {
 					Name: "line",
 					Nodes: []*Node{{
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}},
 				}},
 			},
 		}, {
 			title: "irregular csv",
 			text: `a,, b, c, 
 			       d, ,,,,`,
 			ignorePosition: true,
 			node: &Node{
 				Name: "document",
 				Nodes: []*Node{{
 					Name: "line",
 					Nodes: []*Node{{
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}},
 				}, {
 					Name: "line",
 					Nodes: []*Node{{
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}},
 				}},
 			},
 		}, {
 			title:          "too many commas",
 			text:           `a,,`,
 			ignorePosition: true,
 			node: &Node{
 				Name: "document",
 				Nodes: []*Node{{
 					Name: "line",
 					Nodes: []*Node{{
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}, {
 						Name: "cell",
 					}},
 				}},
 			},
 		}})
 	})
 }