From 5fd3d6b7bab6ddb31c8b2e79253b8f03d85d137a Mon Sep 17 00:00:00 2001
From: Arpad Ryszka <arpad.ryszka@gmail.com>
Date: Sat, 28 Oct 2017 22:54:15 +0200
Subject: [PATCH] add whitespace support

---
 char.go            |   2 +
 choice.go          |  10 +-
 define.go          |   4 +
 notes.txt          |  18 +--
 parse.go           |   2 +
 sequence.go        |  11 +-
 syntax.go          |  34 ++++
 syntax.parser      |   8 +-
 whitespace.go      | 172 ++++++++++++++++++++
 whitespace_test.go | 383 +++++++++++++++++++++++++++++++--------------
 10 files changed, 498 insertions(+), 146 deletions(-)
 create mode 100644 whitespace.go

diff --git a/char.go b/char.go
index ff2c184..750085d 100644
--- a/char.go
+++ b/char.go
@@ -24,9 +24,11 @@ func newChar(
 }
 
 func (p *charParser) nodeName() string                  { return p.name }
+func (p *charParser) setNodeName(n string)              { p.name = n }
 func (p *charParser) nodeID() int                       { return p.id }
 func (p *charParser) setID(id int)                      { p.id = id }
 func (p *charParser) commitType() CommitType            { return Alias }
+func (p *charParser) setCommitType(ct CommitType)       {}
 func (p *charParser) validate(*registry, *idSet) error  { return nil }
 func (p *charParser) normalize(*registry, *idSet) error { return nil }
 
diff --git a/choice.go b/choice.go
index 4b2aa36..9789280 100644
--- a/choice.go
+++ b/choice.go
@@ -33,10 +33,12 @@ func newChoice(name string, ct CommitType, elements []string) *choiceDefinition
 	}
 }
 
-func (d *choiceDefinition) nodeName() string       { return d.name }
-func (d *choiceDefinition) nodeID() int            { return d.id }
-func (d *choiceDefinition) setID(id int)           { d.id = id }
-func (d *choiceDefinition) commitType() CommitType { return d.commit }
+func (d *choiceDefinition) nodeName() string            { return d.name }
+func (d *choiceDefinition) setNodeName(n string)        { d.name = n }
+func (d *choiceDefinition) nodeID() int                 { return d.id }
+func (d *choiceDefinition) setID(id int)                { d.id = id }
+func (d *choiceDefinition) commitType() CommitType      { return d.commit }
+func (d *choiceDefinition) setCommitType(ct CommitType) { d.commit = ct }
 
 func (d *choiceDefinition) validate(r *registry, path *idSet) error {
 	for i := range d.elements {
diff --git a/define.go b/define.go
index 78ed61e..f744acb 100644
--- a/define.go
+++ b/define.go
@@ -83,6 +83,10 @@ func flagsToCommitType(n []*Node) CommitType {
 		switch ni.Name {
 		case "alias":
 			ct |= Alias
+		case "ws":
+			ct |= Whitespace
+		case "nows":
+			ct |= NoWhitespace
 		case "doc":
 			ct |= Documentation
 		case "root":
diff --git a/notes.txt b/notes.txt
index cc6ffdb..ad64be7 100644
--- a/notes.txt
+++ b/notes.txt
@@ -1,14 +1,3 @@
-[whitespace]
-1. merge whitespaces
-2. set ws to alias
-3. apply whitespace to expressions
-- a a -> a ws* a
-- a | b -> a | b
-- a? -> a{0, 1} -> a{0, 1}
-- a+ -> a{1,} -> a (ws* a){,}
-- a* -> a{0,} -> (a (ws* a){,}){,}
-- root -> ws* root ws*
-
 error reporting
 - longest parse
 - count the lines
@@ -19,9 +8,10 @@ read, with error reporting
 what was the bug with the large json from eskip?
 
 [next]
-optimization
-why normalization failed
-why normalization was slower?
+missing tests, coverage:
+- validation
+- error cases
+- whitespace cases
 error reporting
 coverage
 custom tokens
diff --git a/parse.go b/parse.go
index b2f9509..1601a43 100644
--- a/parse.go
+++ b/parse.go
@@ -4,8 +4,10 @@ import "fmt"
 
 type definition interface {
 	nodeName() string
+	setNodeName(string)
 	nodeID() int
 	commitType() CommitType
+	setCommitType(CommitType)
 	setID(int)
 	validate(*registry, *idSet) error
 	normalize(*registry, *idSet) error
diff --git a/sequence.go b/sequence.go
index 4b3cbc6..9b6d401 100644
--- a/sequence.go
+++ b/sequence.go
@@ -39,10 +39,12 @@ func newSequence(name string, ct CommitType, items []SequenceItem) *sequenceDefi
 	}
 }
 
-func (d *sequenceDefinition) nodeName() string       { return d.name }
-func (d *sequenceDefinition) nodeID() int            { return d.id }
-func (d *sequenceDefinition) setID(id int)           { d.id = id }
-func (d *sequenceDefinition) commitType() CommitType { return d.commit }
+func (d *sequenceDefinition) nodeName() string            { return d.name }
+func (d *sequenceDefinition) setNodeName(n string)        { d.name = n }
+func (d *sequenceDefinition) nodeID() int                 { return d.id }
+func (d *sequenceDefinition) setID(id int)                { d.id = id }
+func (d *sequenceDefinition) commitType() CommitType      { return d.commit }
+func (d *sequenceDefinition) setCommitType(ct CommitType) { d.commit = ct }
 
 func (d *sequenceDefinition) validate(r *registry, path *idSet) error {
 	for i := range d.items {
@@ -153,7 +155,6 @@ func (d *sequenceDefinition) setIncludedBy(r *registry, includedBy int, parsers
 }
 
 func (d *sequenceDefinition) parser(r *registry, parsers *idSet) (parser, error) {
-	// TODO: what is this for? test with sequence containing a sequence through a choice
 	if parsers.has(d.id) {
 		panic(cannotIncludeParsers(d.name))
 	}
diff --git a/syntax.go b/syntax.go
index a9637e6..c5b1183 100644
--- a/syntax.go
+++ b/syntax.go
@@ -12,6 +12,8 @@ type CommitType int
 const (
 	None  CommitType = 0
 	Alias CommitType = 1 << iota
+	Whitespace
+	NoWhitespace
 	Documentation
 	Root
 )
@@ -42,7 +44,9 @@ var (
 	ErrUnexpectedCharacter     = errors.New("unexpected character")
 	ErrInvalidSyntax           = errors.New("invalid syntax")
 	ErrRootAlias               = errors.New("root node cannot be an alias")
+	ErrRootWhitespace          = errors.New("root node cannot be a whitespace")
 	ErrNotImplemented          = errors.New("not implemented")
+	ErrMultipleRoots           = errors.New("multiple roots")
 )
 
 func duplicateDefinition(name string) error {
@@ -70,12 +74,29 @@ func (s *Syntax) register(d definition) error {
 	}
 
 	if d.commitType()&Root != 0 {
+		if s.explicitRoot {
+			return ErrMultipleRoots
+		}
+
+		if s.root != nil {
+			s.root.setCommitType(s.root.commitType() &^ Root)
+		}
+
 		s.root = d
+		s.root.setCommitType(s.root.commitType() | Root)
 		s.explicitRoot = true
 	} else if !s.explicitRoot {
+		if s.root != nil {
+			s.root.setCommitType(s.root.commitType() &^ Root)
+		}
+
 		s.root = d
+		s.root.setCommitType(s.root.commitType() | Root)
 	}
 
+	// TODO: verify that definition names match the symbol criteria, or figure out a better name for the
+	// whitespace definition
+
 	return s.registry.setDefinition(d)
 }
 
@@ -144,6 +165,19 @@ func (s *Syntax) Init() error {
 		return ErrRootAlias
 	}
 
+	if s.root.commitType()&Whitespace != 0 {
+		return ErrRootWhitespace
+	}
+
+	s.registry = initWhitespace(s.registry)
+
+	for _, def := range s.registry.definitions {
+		if def.commitType()&Root != 0 {
+			s.root = def
+			break
+		}
+	}
+
 	if err := s.root.validate(s.registry, &idSet{}); err != nil {
 		return err
 	}
diff --git a/syntax.parser b/syntax.parser
index 7c3fd86..5fccf15 100644
--- a/syntax.parser
+++ b/syntax.parser
@@ -1,5 +1,5 @@
-ws:alias  = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v";
-wsc:alias = ws | comment;
+wschar:alias  = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v";
+wsc:alias     = wschar | comment;
 
 block-comment:alias   = "/*" ("*" [^/] | [^*])* "*/";
 line-comment:alias    = "//" [^\n]*;
@@ -57,9 +57,11 @@ expression:alias = terminal
                  | choice;
 
 alias      = "alias";
+ws         = "ws";
+nows       = "nows";
 doc        = "doc";
 root       = "root";
-flag:alias = alias | doc | root;
+flag:alias = alias | ws | nows | doc | root;
 definition = symbol (":" flag)* wsc* "=" wsc* expression;
 
 definitions:alias = definition (wsc* ";" (wsc | ";")* definition)*;
diff --git a/whitespace.go b/whitespace.go
new file mode 100644
index 0000000..aad0843
--- /dev/null
+++ b/whitespace.go
@@ -0,0 +1,172 @@
+package treerack
+
+import "fmt"
+
+const whitespaceName = ":ws"
+
+func brokenRegistryError(err error) error {
+	return fmt.Errorf("broken registry: %v", err) // keeps only err's text, prefixed with the context
+}
+
+// splitWhitespaceDefs partitions all by the Whitespace flag, also flagging the whitespace ones as Alias.
+func splitWhitespaceDefs(all map[string]definition) ([]definition, []definition) {
+	var ws, rest []definition
+	for _, d := range all {
+		if d.commitType()&Whitespace != 0 {
+			d.setCommitType(d.commitType() | Alias)
+			ws = append(ws, d)
+		} else {
+			rest = append(rest, d)
+		}
+	}
+
+	return ws, rest
+}
+
+// splitRoot separates the definition flagged as Root from the rest. The returned root is nil when no
+// definition carries the flag; if several do, the last one wins and the earlier ones are dropped.
+func splitRoot(defs []definition) (definition, []definition) {
+	var root definition
+	var others []definition
+
+	for _, d := range defs {
+		if d.commitType()&Root == 0 {
+			others = append(others, d)
+			continue
+		}
+
+		root = d
+	}
+
+	return root, others
+}
+
+// mergeWhitespaceDefs builds the whitespaceName alias choice from the names of the definitions in ws.
+func mergeWhitespaceDefs(ws []definition) definition {
+	var names []string
+	for i := range ws {
+		names = append(names, ws[i].nodeName())
+	}
+	return newChoice(whitespaceName, Alias, names)
+}
+
+// TODO: validate min and max
+
+// applyWhitespaceToSeq returns the definitions replacing s: a new sequence with the name and commit type
+// of s that has ws* between its items, preceded by the :wsrest/:wsopt aliases generated for repeated items.
+func applyWhitespaceToSeq(s *sequenceDefinition) []definition {
+	var defs []definition
+	var items []SequenceItem
+
+	whitespace := SequenceItem{Name: whitespaceName, Min: 0, Max: -1}
+	for i, item := range s.items {
+		if i > 0 {
+			items = append(items, whitespace)
+		}
+
+		if item.Max >= 0 && item.Max <= 1 {
+			items = append(items, item)
+			continue
+		}
+
+		// a{min,max} becomes a (ws* a){min-1,max-1}, wrapped into an optional alias when min is 0.
+		singleItem := SequenceItem{Name: item.Name, Min: 1, Max: 1}
+		restName := item.Name + ":wsrest"
+		restDef := newSequence(restName, Alias, []SequenceItem{whitespace, singleItem})
+		defs = append(defs, restDef)
+
+		restItems := SequenceItem{Name: restName, Min: 0, Max: -1}
+		if item.Min > 0 {
+			restItems.Min = item.Min - 1
+		}
+		if item.Max > 0 {
+			// singleItem takes one occurrence, so the bound on the remaining ones is Max-1.
+			restItems.Max = item.Max - 1
+		}
+
+		if item.Min > 0 {
+			items = append(items, singleItem, restItems)
+			continue
+		}
+
+		optName := item.Name + ":wsopt"
+		optDef := newSequence(optName, Alias, []SequenceItem{singleItem, restItems})
+		defs = append(defs, optDef)
+		items = append(items, SequenceItem{Name: optName, Min: 0, Max: 1})
+	}
+
+	return append(defs, newSequence(s.nodeName(), s.commitType(), items))
+}
+
+// applyWhitespace rewrites the sequence definitions in defs with applyWhitespaceToSeq, which may add
+// generated helper definitions to the result. Definitions flagged as NoWhitespace and definitions
+// that are not sequences are passed through as they are.
+func applyWhitespace(defs []definition) []definition {
+	var patched []definition
+	for _, d := range defs {
+		switch seq, isSeq := d.(*sequenceDefinition); {
+		case d.commitType()&NoWhitespace != 0:
+			patched = append(patched, d)
+		case !isSeq:
+			// only sequences have adjacent items to put whitespace between
+			patched = append(patched, d)
+		default:
+			patched = append(patched, applyWhitespaceToSeq(seq)...)
+		}
+	}
+
+	return patched
+}
+
+// applyWhitespaceRoot wraps root so that whitespace is accepted before and after it. The passed in
+// definition is renamed to ":wsroot:" + its name and flagged as Alias instead of Root, while a new
+// Root sequence of the form ws* original ws* takes over the original name. It returns the renamed
+// original and the new root, in this order.
+func applyWhitespaceRoot(root definition) (definition, definition) {
+	name := root.nodeName()
+	innerName := ":wsroot:" + name
+	anyWhitespace := SequenceItem{Name: whitespaceName, Min: 0, Max: -1}
+	innerItem := SequenceItem{Name: innerName, Min: 1, Max: 1}
+
+	// Demote the original root to an alias under the internal name.
+	root.setNodeName(innerName)
+	root.setCommitType(root.commitType() &^ Root)
+	root.setCommitType(root.commitType() | Alias)
+
+	// The wrapper takes over the public name and the Root flag.
+	wrapper := newSequence(name, Root, []SequenceItem{
+		anyWhitespace,
+		innerItem,
+		anyWhitespace,
+	})
+
+	return root, wrapper
+}
+
+func registerPatched(r *registry, defs ...definition) {
+	for _, def := range defs {
+		if err := r.setDefinition(def); err != nil {
+			panic(brokenRegistryError(err)) // a rejected definition is not returned as an error, it panics
+		}
+	}
+}
+
+// initWhitespace returns r itself when it holds no definitions flagged as Whitespace. Otherwise it
+// builds a new registry from the merged whitespace alias and the whitespace-patched definitions.
+func initWhitespace(r *registry) *registry {
+	wsDefs, regular := splitWhitespaceDefs(r.definitions)
+	if len(wsDefs) == 0 {
+		return r
+	}
+
+	merged := mergeWhitespaceDefs(wsDefs)
+	root, rest := splitRoot(applyWhitespace(regular))
+	innerRoot, outerRoot := applyWhitespaceRoot(root)
+
+	patched := newRegistry()
+	registerPatched(patched, merged)
+	registerPatched(patched, wsDefs...)
+	registerPatched(patched, rest...)
+	registerPatched(patched, innerRoot, outerRoot)
+	return patched
+}
diff --git a/whitespace_test.go b/whitespace_test.go
index 6dba6f1..e29ad54 100644
--- a/whitespace_test.go
+++ b/whitespace_test.go
@@ -4,137 +4,280 @@ import "testing"
 
 const (
 	csvWithoutWhitespaceSupport = `
-		ws:alias = [ \t]*;
+		ws:alias        = [ \t];
+		word-char:alias = [^\n, \t];
+		cell            = (word-char (ws* word-char)*)?;
+		rest-cell:alias = "," ws* cell;
+		line            = cell ws* (rest-cell (ws* rest-cell)*)?;
+		rest-line:alias = "\n" ws* line;
+		document        = ws* (line ws* (rest-line (ws* rest-line)*)?)? ws*;
+	`
+
+	csvWithWhitespaceSupport = `
+		ws:ws    = [ \t];
 		cell     = [^\n, \t]*;
-		line     = ws cell (ws "," ws cell)* ws;
+		line     = cell ("," cell)*;
 		document = (line ("\n" line)*)?;
 	`
 )
 
 func TestCSVWhitespace(t *testing.T) {
-	t.Run("wihout whitespace support", func(t *testing.T) {
+	tests := []testItem{{
+		title: "empty",
+		node: &Node{
+			Name: "document",
+		},
+	}, {
+		title:          "only a cell",
+		text:           "abc",
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+			Nodes: []*Node{{
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}},
+			}},
+		},
+	}, {
+		title:          "single line",
+		text:           `a, b, c`,
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+			Nodes: []*Node{{
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}},
+			}},
+		},
+	}, {
+		title: "regular csv",
+		text: `a, b, c
+			       d, e, f`,
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+			Nodes: []*Node{{
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}},
+			}, {
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}},
+			}},
+		},
+	}, {
+		title: "irregular csv",
+		text: `a,, b, c, 
+			       d, ,,,,`,
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+			Nodes: []*Node{{
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}},
+			}, {
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}},
+			}},
+		},
+	}, {
+		title:          "too many commas",
+		text:           `a,,`,
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+			Nodes: []*Node{{
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}},
+			}},
+		},
+	}, {
+		title:          "csv with tabs",
+		text:           "a,\tb, c",
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+			Nodes: []*Node{{
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}, {
+					Name: "cell",
+				}},
+			}},
+		},
+	}, {
+		title: "whitespace between lines",
+		text:  " a, b, c \n d, e, f ",
+		node: &Node{
+			Name: "document",
+			To:   19,
+			Nodes: []*Node{{
+				Name: "line",
+				From: 1,
+				To:   8,
+				Nodes: []*Node{{
+					Name: "cell",
+					From: 1,
+					To:   2,
+				}, {
+					Name: "cell",
+					From: 4,
+					To:   5,
+				}, {
+					Name: "cell",
+					From: 7,
+					To:   8,
+				}},
+			}, {
+				Name: "line",
+				From: 11,
+				To:   18,
+				Nodes: []*Node{{
+					Name: "cell",
+					From: 11,
+					To:   12,
+				}, {
+					Name: "cell",
+					From: 14,
+					To:   15,
+				}, {
+					Name: "cell",
+					From: 17,
+					To:   18,
+				}},
+			}},
+		},
+	}, {
+		title:          "just a space",
+		text:           " ",
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+		},
+	}, {
+		title: "cell with spaces in it",
+		text:  "cell content 1/1, cell content 1/2\ncell content 2/1, cell content 2/2",
+		node: &Node{
+			Name: "document",
+			To:   69,
+			Nodes: []*Node{{
+				Name: "line",
+				To:   34,
+				Nodes: []*Node{{
+					Name: "cell",
+					To:   16,
+				}, {
+					Name: "cell",
+					From: 18,
+					To:   34,
+				}},
+			}, {
+				Name: "line",
+				From: 35,
+				To:   69,
+				Nodes: []*Node{{
+					Name: "cell",
+					From: 35,
+					To:   51,
+				}, {
+					Name: "cell",
+					From: 53,
+					To:   69,
+				}},
+			}},
+		},
+	}, {
+		title:          "multiple empty lines",
+		text:           "\n\n",
+		ignorePosition: true,
+		node: &Node{
+			Name: "document",
+			Nodes: []*Node{{
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}},
+			}, {
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}},
+			}, {
+				Name: "line",
+				Nodes: []*Node{{
+					Name: "cell",
+				}},
+			}},
+		},
+	}}
+
+	t.Run("without whitespace support", func(t *testing.T) {
 		s, err := openSyntaxString(csvWithoutWhitespaceSupport)
 		if err != nil {
 			t.Error(err)
 			return
 		}
 
-		runTestsSyntax(t, s, []testItem{{
-			title: "empty",
-			node: &Node{
-				Name: "document",
-			},
-		}, {
-			title:          "only a cell",
-			text:           "abc",
-			ignorePosition: true,
-			node: &Node{
-				Name: "document",
-				Nodes: []*Node{{
-					Name: "line",
-					Nodes: []*Node{{
-						Name: "cell",
-					}},
-				}},
-			},
-		}, {
-			title:          "single line",
-			text:           `a, b, c`,
-			ignorePosition: true,
-			node: &Node{
-				Name: "document",
-				Nodes: []*Node{{
-					Name: "line",
-					Nodes: []*Node{{
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}},
-				}},
-			},
-		}, {
-			title: "regular csv",
-			text: `a, b, c
-			       d, e, f`,
-			ignorePosition: true,
-			node: &Node{
-				Name: "document",
-				Nodes: []*Node{{
-					Name: "line",
-					Nodes: []*Node{{
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}},
-				}, {
-					Name: "line",
-					Nodes: []*Node{{
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}},
-				}},
-			},
-		}, {
-			title: "irregular csv",
-			text: `a,, b, c, 
-			       d, ,,,,`,
-			ignorePosition: true,
-			node: &Node{
-				Name: "document",
-				Nodes: []*Node{{
-					Name: "line",
-					Nodes: []*Node{{
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}},
-				}, {
-					Name: "line",
-					Nodes: []*Node{{
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}},
-				}},
-			},
-		}, {
-			title:          "too many commas",
-			text:           `a,,`,
-			ignorePosition: true,
-			node: &Node{
-				Name: "document",
-				Nodes: []*Node{{
-					Name: "line",
-					Nodes: []*Node{{
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}, {
-						Name: "cell",
-					}},
-				}},
-			},
-		}})
+		runTestsSyntax(t, s, tests)
+	})
+
+	t.Run("with whitespace support", func(t *testing.T) {
+		s, err := openSyntaxString(csvWithWhitespaceSupport)
+		if err != nil {
+			t.Error(err)
+			return
+		}
+
+		runTestsSyntax(t, s, tests)
 	})
 }