From 5fd3d6b7bab6ddb31c8b2e79253b8f03d85d137a Mon Sep 17 00:00:00 2001 From: Arpad Ryszka Date: Sat, 28 Oct 2017 22:54:15 +0200 Subject: [PATCH] add whitespace support --- char.go | 2 + choice.go | 10 +- define.go | 4 + notes.txt | 18 +-- parse.go | 2 + sequence.go | 11 +- syntax.go | 34 ++++ syntax.parser | 8 +- whitespace.go | 172 ++++++++++++++++++++ whitespace_test.go | 383 +++++++++++++++++++++++++++++++-------------- 10 files changed, 498 insertions(+), 146 deletions(-) create mode 100644 whitespace.go diff --git a/char.go b/char.go index ff2c184..750085d 100644 --- a/char.go +++ b/char.go @@ -24,9 +24,11 @@ func newChar( } func (p *charParser) nodeName() string { return p.name } +func (p *charParser) setNodeName(n string) { p.name = n } func (p *charParser) nodeID() int { return p.id } func (p *charParser) setID(id int) { p.id = id } func (p *charParser) commitType() CommitType { return Alias } +func (p *charParser) setCommitType(ct CommitType) {} func (p *charParser) validate(*registry, *idSet) error { return nil } func (p *charParser) normalize(*registry, *idSet) error { return nil } diff --git a/choice.go b/choice.go index 4b2aa36..9789280 100644 --- a/choice.go +++ b/choice.go @@ -33,10 +33,12 @@ func newChoice(name string, ct CommitType, elements []string) *choiceDefinition } } -func (d *choiceDefinition) nodeName() string { return d.name } -func (d *choiceDefinition) nodeID() int { return d.id } -func (d *choiceDefinition) setID(id int) { d.id = id } -func (d *choiceDefinition) commitType() CommitType { return d.commit } +func (d *choiceDefinition) nodeName() string { return d.name } +func (d *choiceDefinition) setNodeName(n string) { d.name = n } +func (d *choiceDefinition) nodeID() int { return d.id } +func (d *choiceDefinition) setID(id int) { d.id = id } +func (d *choiceDefinition) commitType() CommitType { return d.commit } +func (d *choiceDefinition) setCommitType(ct CommitType) { d.commit = ct } func (d *choiceDefinition) validate(r *registry, 
path *idSet) error { for i := range d.elements { diff --git a/define.go b/define.go index 78ed61e..f744acb 100644 --- a/define.go +++ b/define.go @@ -83,6 +83,10 @@ func flagsToCommitType(n []*Node) CommitType { switch ni.Name { case "alias": ct |= Alias + case "ws": + ct |= Whitespace + case "nows": + ct |= NoWhitespace case "doc": ct |= Documentation case "root": diff --git a/notes.txt b/notes.txt index cc6ffdb..ad64be7 100644 --- a/notes.txt +++ b/notes.txt @@ -1,14 +1,3 @@ -[whitespace] -1. merge whitespaces -2. set ws to alias -3. apply whitespace to expressions -- a a -> a ws* a -- a | b -> a | b -- a? -> a{0, 1} -> a{0, 1} -- a+ -> a{1,} -> a (ws* a){,} -- a* -> a{0,} -> (a (ws* a){,}){,} -- root -> ws* root ws* - error reporting - longest parse - count the lines @@ -19,9 +8,10 @@ read, with error reporting what was the bug with the large json from eskip? [next] -optimization -why normalization failed -why normalization was slower? +missing tests, coverage: +- validation +- error cases +- whitespace cases error reporting coverage custom tokens diff --git a/parse.go b/parse.go index b2f9509..1601a43 100644 --- a/parse.go +++ b/parse.go @@ -4,8 +4,10 @@ import "fmt" type definition interface { nodeName() string + setNodeName(string) nodeID() int commitType() CommitType + setCommitType(CommitType) setID(int) validate(*registry, *idSet) error normalize(*registry, *idSet) error diff --git a/sequence.go b/sequence.go index 4b3cbc6..9b6d401 100644 --- a/sequence.go +++ b/sequence.go @@ -39,10 +39,12 @@ func newSequence(name string, ct CommitType, items []SequenceItem) *sequenceDefi } } -func (d *sequenceDefinition) nodeName() string { return d.name } -func (d *sequenceDefinition) nodeID() int { return d.id } -func (d *sequenceDefinition) setID(id int) { d.id = id } -func (d *sequenceDefinition) commitType() CommitType { return d.commit } +func (d *sequenceDefinition) nodeName() string { return d.name } +func (d *sequenceDefinition) setNodeName(n string) { d.name = 
n } +func (d *sequenceDefinition) nodeID() int { return d.id } +func (d *sequenceDefinition) setID(id int) { d.id = id } +func (d *sequenceDefinition) commitType() CommitType { return d.commit } +func (d *sequenceDefinition) setCommitType(ct CommitType) { d.commit = ct } func (d *sequenceDefinition) validate(r *registry, path *idSet) error { for i := range d.items { @@ -153,7 +155,6 @@ func (d *sequenceDefinition) setIncludedBy(r *registry, includedBy int, parsers } func (d *sequenceDefinition) parser(r *registry, parsers *idSet) (parser, error) { - // TODO: what is this for? test with sequence containing a sequence through a choice if parsers.has(d.id) { panic(cannotIncludeParsers(d.name)) } diff --git a/syntax.go b/syntax.go index a9637e6..c5b1183 100644 --- a/syntax.go +++ b/syntax.go @@ -12,6 +12,8 @@ type CommitType int const ( None CommitType = 0 Alias CommitType = 1 << iota + Whitespace + NoWhitespace Documentation Root ) @@ -42,7 +44,9 @@ var ( ErrUnexpectedCharacter = errors.New("unexpected character") ErrInvalidSyntax = errors.New("invalid syntax") ErrRootAlias = errors.New("root node cannot be an alias") + ErrRootWhitespace = errors.New("root node cannot be a whitespace") ErrNotImplemented = errors.New("not implemented") + ErrMultipleRoots = errors.New("multiple roots") ) func duplicateDefinition(name string) error { @@ -70,12 +74,29 @@ func (s *Syntax) register(d definition) error { } if d.commitType()&Root != 0 { + if s.explicitRoot { + return ErrMultipleRoots + } + + if s.root != nil { + s.root.setCommitType(s.root.commitType() &^ Root) + } + s.root = d + s.root.setCommitType(s.root.commitType() | Root) s.explicitRoot = true } else if !s.explicitRoot { + if s.root != nil { + s.root.setCommitType(s.root.commitType() &^ Root) + } + s.root = d + s.root.setCommitType(s.root.commitType() | Root) } + // TODO: verify that definition names match the symbol criteria, or figure a better naming for the + // whitespace + return s.registry.setDefinition(d) } @@ 
-144,6 +165,19 @@ func (s *Syntax) Init() error {
 		return ErrRootAlias
 	}
 
+	if s.root.commitType()&Whitespace != 0 {
+		return ErrRootWhitespace
+	}
+
+	s.registry = initWhitespace(s.registry)
+
+	for _, def := range s.registry.definitions {
+		if def.commitType()&Root != 0 {
+			s.root = def
+			break
+		}
+	}
+
 	if err := s.root.validate(s.registry, &idSet{}); err != nil {
 		return err
 	}
diff --git a/syntax.parser b/syntax.parser
index 7c3fd86..5fccf15 100644
--- a/syntax.parser
+++ b/syntax.parser
@@ -1,5 +1,5 @@
-ws:alias = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v";
-wsc:alias = ws | comment;
+wschar:alias = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v";
+wsc:alias = wschar | comment;
 
 block-comment:alias = "/*" ("*" [^/] | [^*])* "*/";
 line-comment:alias = "//" [^\n]*;
@@ -57,9 +57,11 @@ expression:alias = terminal | choice;
 
 
 alias = "alias";
+ws = "ws";
+nows = "nows";
 doc = "doc";
 root = "root";
-flag:alias = alias | doc | root;
+flag:alias = alias | ws | nows | doc | root;
 definition = symbol (":" flag)* wsc* "=" wsc* expression;
 
 definitions:alias = definition (wsc* ";" (wsc | ";")* definition)*;
diff --git a/whitespace.go b/whitespace.go
new file mode 100644
index 0000000..aad0843
--- /dev/null
+++ b/whitespace.go
@@ -0,0 +1,229 @@
+package treerack
+
+import (
+	"fmt"
+	"sort"
+)
+
+// whitespaceName is the name of the generated choice definition that merges
+// all the definitions flagged as whitespace.
+const whitespaceName = ":ws"
+
+// brokenRegistryError wraps an error that occurred while registering the
+// patched definitions.
+func brokenRegistryError(err error) error {
+	return fmt.Errorf("broken registry: %v", err)
+}
+
+// splitWhitespaceDefs separates the definitions flagged as Whitespace from
+// the rest, and marks the whitespace definitions as Alias. The definitions
+// are visited in the order of the sorted map keys, so that the results do not
+// depend on the random map iteration order.
+func splitWhitespaceDefs(all map[string]definition) ([]definition, []definition) {
+	keys := make([]string, 0, len(all))
+	for key := range all {
+		keys = append(keys, key)
+	}
+
+	sort.Strings(keys)
+
+	var whitespaceDefs, nonWhitespaceDefs []definition
+	for _, key := range keys {
+		def := all[key]
+		if def.commitType()&Whitespace != 0 {
+			def.setCommitType(def.commitType() | Alias)
+			whitespaceDefs = append(whitespaceDefs, def)
+			continue
+		}
+
+		nonWhitespaceDefs = append(nonWhitespaceDefs, def)
+	}
+
+	return whitespaceDefs, nonWhitespaceDefs
+}
+
+// splitRoot returns the last definition flagged as Root, nil when there is
+// none, and the definitions that are not flagged as Root.
+func splitRoot(defs []definition) (definition, []definition) {
+	var (
+		root definition
+		rest []definition
+	)
+
+	for _, def := range defs {
+		if def.commitType()&Root != 0 {
+			root = def
+			continue
+		}
+
+		rest = append(rest, def)
+	}
+
+	return root, rest
+}
+
+// mergeWhitespaceDefs creates the alias choice, named whitespaceName, whose
+// elements are the provided whitespace definitions.
+func mergeWhitespaceDefs(ws []definition) definition {
+	var names []string
+	for _, def := range ws {
+		names = append(names, def.nodeName())
+	}
+
+	return newChoice(whitespaceName, Alias, names)
+}
+
+// TODO: validate min and max
+
+// applyWhitespaceToSeq creates the whitespace tolerant variant of a sequence.
+// It returns the generated helper definitions followed by the replacement of
+// s, which keeps the name and the commit type of s.
+//
+// Optional whitespace is inserted between the items. Items with Max 0 or 1
+// are kept as they are. For every other item, whitespace is also accepted
+// between the repetitions, with the help of generated alias sequences:
+//
+//	a{min,max}, min > 0  ->  a rest{min-1,max-1}
+//	a{0,max}             ->  (a rest{0,max-1})?
+//
+// where rest is (ws* a), and rest stays unbounded when max is negative.
+func applyWhitespaceToSeq(s *sequenceDefinition) []definition {
+	var (
+		defs  []definition
+		items []SequenceItem
+	)
+
+	whitespace := SequenceItem{Name: whitespaceName, Min: 0, Max: -1}
+	for i, item := range s.items {
+		if i > 0 {
+			items = append(items, whitespace)
+		}
+
+		if item.Max >= 0 && item.Max <= 1 {
+			items = append(items, item)
+			continue
+		}
+
+		singleItem := SequenceItem{Name: item.Name, Min: 1, Max: 1}
+
+		// The helper names include the name of the sequence and the index
+		// of the item, because the same item can be repeated with
+		// different ranges in multiple places.
+		helperName := fmt.Sprintf("%s:%d:%s", s.nodeName(), i, item.Name)
+
+		restName := helperName + ":wsrest"
+		restDef := newSequence(restName, Alias, []SequenceItem{whitespace, singleItem})
+		defs = append(defs, restDef)
+
+		// The first occurrence is matched by singleItem, so both limits
+		// of the rest are one less than the limits of the original item.
+		restItems := SequenceItem{Name: restName, Min: 0, Max: -1}
+		if item.Min > 0 {
+			restItems.Min = item.Min - 1
+		}
+
+		if item.Max > 0 {
+			restItems.Max = item.Max - 1
+		}
+
+		if item.Min > 0 {
+			items = append(items, singleItem, restItems)
+			continue
+		}
+
+		optName := helperName + ":wsopt"
+		optDef := newSequence(optName, Alias, []SequenceItem{singleItem, restItems})
+		defs = append(defs, optDef)
+		items = append(items, SequenceItem{Name: optName, Min: 0, Max: 1})
+	}
+
+	s = newSequence(s.nodeName(), s.commitType(), items)
+	defs = append(defs, s)
+	return defs
+}
+
+// applyWhitespace replaces every sequence definition that is not flagged as
+// NoWhitespace with its whitespace tolerant variant. All other definitions
+// are passed through unchanged.
+func applyWhitespace(defs []definition) []definition {
+	var defsWS []definition
+	for _, def := range defs {
+		if def.commitType()&NoWhitespace != 0 {
+			defsWS = append(defsWS, def)
+			continue
+		}
+
+		seq, ok := def.(*sequenceDefinition)
+		if !ok {
+			defsWS = append(defsWS, def)
+			continue
+		}
+
+		defsWS = append(defsWS, applyWhitespaceToSeq(seq)...)
+	}
+
+	return defsWS
+}
+
+// applyWhitespaceRoot wraps the root definition so that whitespace is accepted
+// before and after it. The original root is renamed, loses the Root flag
+// and becomes an alias, while a new root sequence takes over its name. It
+// returns the modified original and the new root, in this order.
+func applyWhitespaceRoot(root definition) (definition, definition) {
+	original, name := root, root.nodeName()
+	wsName := ":wsroot:" + name
+
+	original.setNodeName(wsName)
+	original.setCommitType(original.commitType() &^ Root)
+	original.setCommitType(original.commitType() | Alias)
+
+	root = newSequence(name, Root, []SequenceItem{{
+		Name: whitespaceName,
+		Min:  0,
+		Max:  -1,
+	}, {
+		Name: wsName,
+		Min:  1,
+		Max:  1,
+	}, {
+		Name: whitespaceName,
+		Min:  0,
+		Max:  -1,
+	}})
+
+	return original, root
+}
+
+// registerPatched adds the definitions to the registry. It panics when a
+// definition is rejected by the registry.
+func registerPatched(r *registry, defs ...definition) {
+	for _, def := range defs {
+		if err := r.setDefinition(def); err != nil {
+			panic(brokenRegistryError(err))
+		}
+	}
+}
+
+// initWhitespace returns a new registry in which the whitespace definitions
+// are merged into a single alias, and the sequences and the root accept
+// whitespace. The registry is returned unchanged when it contains no
+// definition flagged as Whitespace.
+func initWhitespace(r *registry) *registry {
+	whitespaceDefs, defs := splitWhitespaceDefs(r.definitions)
+	if len(whitespaceDefs) == 0 {
+		return r
+	}
+
+	whitespace := mergeWhitespaceDefs(whitespaceDefs)
+	defs = applyWhitespace(defs)
+
+	root, defs := splitRoot(defs)
+	originalRoot, root := applyWhitespaceRoot(root)
+
+	r = newRegistry()
+	registerPatched(r, whitespace)
+	registerPatched(r, whitespaceDefs...)
+	registerPatched(r, defs...)
+	registerPatched(r, originalRoot, root)
+	return r
+}
diff --git a/whitespace_test.go b/whitespace_test.go
index 6dba6f1..e29ad54 100644
--- a/whitespace_test.go
+++ b/whitespace_test.go
@@ -4,137 +4,280 @@ import "testing"
 
 const (
 	csvWithoutWhitespaceSupport = `
-		ws:alias = [ \t]*;
+		ws:alias = [ \t];
+		word-char:alias = [^\n, \t];
+		cell = (word-char (ws* word-char)*)?;
+		rest-cell:alias = "," ws* cell;
+		line = cell ws* (rest-cell (ws* rest-cell)*)?;
+		rest-line:alias = "\n" ws* line;
+		document = ws* (line ws* (rest-line (ws* rest-line)*)?)?
ws*; + ` + + csvWithWhitespaceSupport = ` + ws:ws = [ \t]; cell = [^\n, \t]*; - line = ws cell (ws "," ws cell)* ws; + line = cell ("," cell)*; document = (line ("\n" line)*)?; ` ) func TestCSVWhitespace(t *testing.T) { - t.Run("wihout whitespace support", func(t *testing.T) { + tests := []testItem{{ + title: "empty", + node: &Node{ + Name: "document", + }, + }, { + title: "only a cell", + text: "abc", + ignorePosition: true, + node: &Node{ + Name: "document", + Nodes: []*Node{{ + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }}, + }}, + }, + }, { + title: "single line", + text: `a, b, c`, + ignorePosition: true, + node: &Node{ + Name: "document", + Nodes: []*Node{{ + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }}, + }}, + }, + }, { + title: "regular csv", + text: `a, b, c + d, e, f`, + ignorePosition: true, + node: &Node{ + Name: "document", + Nodes: []*Node{{ + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }}, + }, { + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }}, + }}, + }, + }, { + title: "irregular csv", + text: `a,, b, c, + d, ,,,,`, + ignorePosition: true, + node: &Node{ + Name: "document", + Nodes: []*Node{{ + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }}, + }, { + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }}, + }}, + }, + }, { + title: "too many commas", + text: `a,,`, + ignorePosition: true, + node: &Node{ + Name: "document", + Nodes: []*Node{{ + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }}, + }}, + }, + }, { + title: "csv with tabs", + text: "a,\tb, c", + ignorePosition: true, + node: &Node{ + Name: "document", + Nodes: 
[]*Node{{ + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }, { + Name: "cell", + }, { + Name: "cell", + }}, + }}, + }, + }, { + title: "whitespace between lines", + text: " a, b, c \n d, e, f ", + node: &Node{ + Name: "document", + To: 19, + Nodes: []*Node{{ + Name: "line", + From: 1, + To: 8, + Nodes: []*Node{{ + Name: "cell", + From: 1, + To: 2, + }, { + Name: "cell", + From: 4, + To: 5, + }, { + Name: "cell", + From: 7, + To: 8, + }}, + }, { + Name: "line", + From: 11, + To: 18, + Nodes: []*Node{{ + Name: "cell", + From: 11, + To: 12, + }, { + Name: "cell", + From: 14, + To: 15, + }, { + Name: "cell", + From: 17, + To: 18, + }}, + }}, + }, + }, { + title: "just a space", + text: " ", + ignorePosition: true, + node: &Node{ + Name: "document", + }, + }, { + title: "cell with spaces in it", + text: "cell content 1/1, cell content 1/2\ncell content 2/1, cell content 2/2", + node: &Node{ + Name: "document", + To: 69, + Nodes: []*Node{{ + Name: "line", + To: 34, + Nodes: []*Node{{ + Name: "cell", + To: 16, + }, { + Name: "cell", + From: 18, + To: 34, + }}, + }, { + Name: "line", + From: 35, + To: 69, + Nodes: []*Node{{ + Name: "cell", + From: 35, + To: 51, + }, { + Name: "cell", + From: 53, + To: 69, + }}, + }}, + }, + }, { + title: "multiple empty lines", + text: "\n\n", + ignorePosition: true, + node: &Node{ + Name: "document", + Nodes: []*Node{{ + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }}, + }, { + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }}, + }, { + Name: "line", + Nodes: []*Node{{ + Name: "cell", + }}, + }}, + }, + }} + + t.Run("without whitespace support", func(t *testing.T) { s, err := openSyntaxString(csvWithoutWhitespaceSupport) if err != nil { t.Error(err) return } - runTestsSyntax(t, s, []testItem{{ - title: "empty", - node: &Node{ - Name: "document", - }, - }, { - title: "only a cell", - text: "abc", - ignorePosition: true, - node: &Node{ - Name: "document", - Nodes: []*Node{{ - Name: "line", - Nodes: []*Node{{ - Name: "cell", - 
}}, - }}, - }, - }, { - title: "single line", - text: `a, b, c`, - ignorePosition: true, - node: &Node{ - Name: "document", - Nodes: []*Node{{ - Name: "line", - Nodes: []*Node{{ - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }}, - }}, - }, - }, { - title: "regular csv", - text: `a, b, c - d, e, f`, - ignorePosition: true, - node: &Node{ - Name: "document", - Nodes: []*Node{{ - Name: "line", - Nodes: []*Node{{ - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }}, - }, { - Name: "line", - Nodes: []*Node{{ - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }}, - }}, - }, - }, { - title: "irregular csv", - text: `a,, b, c, - d, ,,,,`, - ignorePosition: true, - node: &Node{ - Name: "document", - Nodes: []*Node{{ - Name: "line", - Nodes: []*Node{{ - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }}, - }, { - Name: "line", - Nodes: []*Node{{ - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }}, - }}, - }, - }, { - title: "too many commas", - text: `a,,`, - ignorePosition: true, - node: &Node{ - Name: "document", - Nodes: []*Node{{ - Name: "line", - Nodes: []*Node{{ - Name: "cell", - }, { - Name: "cell", - }, { - Name: "cell", - }}, - }}, - }, - }}) + runTestsSyntax(t, s, tests) + }) + + t.Run("with whitespace support", func(t *testing.T) { + s, err := openSyntaxString(csvWithWhitespaceSupport) + if err != nil { + t.Error(err) + return + } + + runTestsSyntax(t, s, tests) }) }