add whitespace support

This commit is contained in:
Arpad Ryszka 2017-10-28 22:54:15 +02:00
parent 99246ff28b
commit 5fd3d6b7ba
10 changed files with 498 additions and 146 deletions

View File

@ -24,9 +24,11 @@ func newChar(
} }
func (p *charParser) nodeName() string { return p.name } func (p *charParser) nodeName() string { return p.name }
func (p *charParser) setNodeName(n string) { p.name = n }
func (p *charParser) nodeID() int { return p.id } func (p *charParser) nodeID() int { return p.id }
func (p *charParser) setID(id int) { p.id = id } func (p *charParser) setID(id int) { p.id = id }
func (p *charParser) commitType() CommitType { return Alias } func (p *charParser) commitType() CommitType { return Alias }
func (p *charParser) setCommitType(ct CommitType) {}
func (p *charParser) validate(*registry, *idSet) error { return nil } func (p *charParser) validate(*registry, *idSet) error { return nil }
func (p *charParser) normalize(*registry, *idSet) error { return nil } func (p *charParser) normalize(*registry, *idSet) error { return nil }

View File

@ -33,10 +33,12 @@ func newChoice(name string, ct CommitType, elements []string) *choiceDefinition
} }
} }
func (d *choiceDefinition) nodeName() string { return d.name } func (d *choiceDefinition) nodeName() string { return d.name }
func (d *choiceDefinition) nodeID() int { return d.id } func (d *choiceDefinition) setNodeName(n string) { d.name = n }
func (d *choiceDefinition) setID(id int) { d.id = id } func (d *choiceDefinition) nodeID() int { return d.id }
func (d *choiceDefinition) commitType() CommitType { return d.commit } func (d *choiceDefinition) setID(id int) { d.id = id }
func (d *choiceDefinition) commitType() CommitType { return d.commit }
func (d *choiceDefinition) setCommitType(ct CommitType) { d.commit = ct }
func (d *choiceDefinition) validate(r *registry, path *idSet) error { func (d *choiceDefinition) validate(r *registry, path *idSet) error {
for i := range d.elements { for i := range d.elements {

View File

@ -83,6 +83,10 @@ func flagsToCommitType(n []*Node) CommitType {
switch ni.Name { switch ni.Name {
case "alias": case "alias":
ct |= Alias ct |= Alias
case "ws":
ct |= Whitespace
case "nows":
ct |= NoWhitespace
case "doc": case "doc":
ct |= Documentation ct |= Documentation
case "root": case "root":

View File

@ -1,14 +1,3 @@
[whitespace]
1. merge whitespaces
2. set ws to alias
3. apply whitespace to expressions
- a a -> a ws* a
- a | b -> a | b
- a? -> a{0, 1} -> a{0, 1}
- a+ -> a{1,} -> a (ws* a){,}
- a* -> a{0,} -> (a (ws* a){,}){,}
- root -> ws* root ws*
error reporting error reporting
- longest parse - longest parse
- count the lines - count the lines
@ -19,9 +8,10 @@ read, with error reporting
what was the bug with the large json from eskip? what was the bug with the large json from eskip?
[next] [next]
optimization missing tests, coverage:
why normalization failed - validation
why normalization was slower? - error cases
- whitespace cases
error reporting error reporting
coverage coverage
custom tokens custom tokens

View File

@ -4,8 +4,10 @@ import "fmt"
type definition interface { type definition interface {
nodeName() string nodeName() string
setNodeName(string)
nodeID() int nodeID() int
commitType() CommitType commitType() CommitType
setCommitType(CommitType)
setID(int) setID(int)
validate(*registry, *idSet) error validate(*registry, *idSet) error
normalize(*registry, *idSet) error normalize(*registry, *idSet) error

View File

@ -39,10 +39,12 @@ func newSequence(name string, ct CommitType, items []SequenceItem) *sequenceDefi
} }
} }
func (d *sequenceDefinition) nodeName() string { return d.name } func (d *sequenceDefinition) nodeName() string { return d.name }
func (d *sequenceDefinition) nodeID() int { return d.id } func (d *sequenceDefinition) setNodeName(n string) { d.name = n }
func (d *sequenceDefinition) setID(id int) { d.id = id } func (d *sequenceDefinition) nodeID() int { return d.id }
func (d *sequenceDefinition) commitType() CommitType { return d.commit } func (d *sequenceDefinition) setID(id int) { d.id = id }
func (d *sequenceDefinition) commitType() CommitType { return d.commit }
func (d *sequenceDefinition) setCommitType(ct CommitType) { d.commit = ct }
func (d *sequenceDefinition) validate(r *registry, path *idSet) error { func (d *sequenceDefinition) validate(r *registry, path *idSet) error {
for i := range d.items { for i := range d.items {
@ -153,7 +155,6 @@ func (d *sequenceDefinition) setIncludedBy(r *registry, includedBy int, parsers
} }
func (d *sequenceDefinition) parser(r *registry, parsers *idSet) (parser, error) { func (d *sequenceDefinition) parser(r *registry, parsers *idSet) (parser, error) {
// TODO: what is this for? test with sequence containing a sequence through a choice
if parsers.has(d.id) { if parsers.has(d.id) {
panic(cannotIncludeParsers(d.name)) panic(cannotIncludeParsers(d.name))
} }

View File

@ -12,6 +12,8 @@ type CommitType int
const ( const (
None CommitType = 0 None CommitType = 0
Alias CommitType = 1 << iota Alias CommitType = 1 << iota
Whitespace
NoWhitespace
Documentation Documentation
Root Root
) )
@ -42,7 +44,9 @@ var (
ErrUnexpectedCharacter = errors.New("unexpected character") ErrUnexpectedCharacter = errors.New("unexpected character")
ErrInvalidSyntax = errors.New("invalid syntax") ErrInvalidSyntax = errors.New("invalid syntax")
ErrRootAlias = errors.New("root node cannot be an alias") ErrRootAlias = errors.New("root node cannot be an alias")
ErrRootWhitespace = errors.New("root node cannot be a whitespace")
ErrNotImplemented = errors.New("not implemented") ErrNotImplemented = errors.New("not implemented")
ErrMultipleRoots = errors.New("multiple roots")
) )
func duplicateDefinition(name string) error { func duplicateDefinition(name string) error {
@ -70,12 +74,29 @@ func (s *Syntax) register(d definition) error {
} }
if d.commitType()&Root != 0 { if d.commitType()&Root != 0 {
if s.explicitRoot {
return ErrMultipleRoots
}
if s.root != nil {
s.root.setCommitType(s.root.commitType() &^ Root)
}
s.root = d s.root = d
s.root.setCommitType(s.root.commitType() | Root)
s.explicitRoot = true s.explicitRoot = true
} else if !s.explicitRoot { } else if !s.explicitRoot {
if s.root != nil {
s.root.setCommitType(s.root.commitType() &^ Root)
}
s.root = d s.root = d
s.root.setCommitType(s.root.commitType() | Root)
} }
// TODO: verify that definition names match the symbol criteria, or figure a better naming for the
// whitespace
return s.registry.setDefinition(d) return s.registry.setDefinition(d)
} }
@ -144,6 +165,19 @@ func (s *Syntax) Init() error {
return ErrRootAlias return ErrRootAlias
} }
if s.root.commitType()&Whitespace != 0 {
return ErrRootWhitespace
}
s.registry = initWhitespace(s.registry)
for _, def := range s.registry.definitions {
if def.commitType()&Root != 0 {
s.root = def
break
}
}
if err := s.root.validate(s.registry, &idSet{}); err != nil { if err := s.root.validate(s.registry, &idSet{}); err != nil {
return err return err
} }

View File

@ -1,5 +1,5 @@
ws:alias = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v"; wschar:alias = " " | "\t" | "\n" | "\b" | "\f" | "\r" | "\v";
wsc:alias = ws | comment; wsc:alias = wschar | comment;
block-comment:alias = "/*" ("*" [^/] | [^*])* "*/"; block-comment:alias = "/*" ("*" [^/] | [^*])* "*/";
line-comment:alias = "//" [^\n]*; line-comment:alias = "//" [^\n]*;
@ -57,9 +57,11 @@ expression:alias = terminal
| choice; | choice;
alias = "alias"; alias = "alias";
ws = "ws";
nows = "nows";
doc = "doc"; doc = "doc";
root = "root"; root = "root";
flag:alias = alias | doc | root; flag:alias = alias | ws | nows | doc | root;
definition = symbol (":" flag)* wsc* "=" wsc* expression; definition = symbol (":" flag)* wsc* "=" wsc* expression;
definitions:alias = definition (wsc* ";" (wsc | ";")* definition)*; definitions:alias = definition (wsc* ";" (wsc | ";")* definition)*;

172
whitespace.go Normal file
View File

@ -0,0 +1,172 @@
package treerack
import "fmt"
// whitespaceName is the name under which the merged whitespace choice is
// defined, and the item name that the patched sequences use to refer to it.
const whitespaceName = ":ws"
// brokenRegistryError returns a new error whose message is the message of err
// prefixed with "broken registry: ".
func brokenRegistryError(err error) error {
	const format = "broken registry: %v"
	return fmt.Errorf(format, err)
}
// splitWhitespaceDefs partitions the definitions in all by the Whitespace flag
// of their commit type. The first result holds the definitions that have the
// flag; as a side effect, the Alias flag is set on each of them. The second
// result holds every other definition, unmodified.
//
// The results are filled while ranging over a map, so the order of the
// definitions within them is not stable between calls.
func splitWhitespaceDefs(all map[string]definition) ([]definition, []definition) {
	var ws, other []definition
	for _, d := range all {
		ct := d.commitType()
		if ct&Whitespace == 0 {
			other = append(other, d)
			continue
		}

		d.setCommitType(ct | Alias)
		ws = append(ws, d)
	}

	return ws, other
}
// splitRoot separates the definition flagged as Root from the rest of defs.
// It returns the root definition and the remaining definitions in their
// original order. When no definition has the Root flag, the returned root is
// nil; when several have it, the last one is returned and the earlier ones
// are dropped from both results.
func splitRoot(defs []definition) (definition, []definition) {
	var (
		rootDef definition
		others  []definition
	)

	for i := range defs {
		if defs[i].commitType()&Root == 0 {
			others = append(others, defs[i])
			continue
		}

		rootDef = defs[i]
	}

	return rootDef, others
}
// mergeWhitespaceDefs builds a single alias choice, named whitespaceName,
// whose elements are the names of the definitions in ws, in the same order.
func mergeWhitespaceDefs(ws []definition) definition {
	var elements []string
	for i := range ws {
		elements = append(elements, ws[i].nodeName())
	}

	return newChoice(whitespaceName, Alias, elements)
}
// applyWhitespaceToSeq rewrites the sequence s so that whitespace is accepted
// between its items and between the repetitions of a repeated item.
//
// It returns the helper definitions generated during the rewrite, followed by
// a new sequence that has the name and commit type of s. The sequence s itself
// is not modified.
//
// Items that occur at most once (Max of 0 or 1) are kept as they are. An item
// that may occur more than once, a{min,max}, is split into a single occurrence
// and an alias sequence "<item>:wsrest" = ws* a, repeated {min-1,max-1}. When
// min is 0, the pair is wrapped in an optional alias sequence "<item>:wsopt",
// so that zero occurrences are still accepted. A negative Max is treated as
// unbounded.
//
// NOTE(review): the helper names are derived from the item name only. If the
// same item is repeated in more than one sequence, or more than once in s, the
// same helper name is generated again — confirm how the registry handles that.
//
// TODO: validate min and max
func applyWhitespaceToSeq(s *sequenceDefinition) []definition {
	var (
		defs  []definition
		items []SequenceItem
	)

	whitespace := SequenceItem{Name: whitespaceName, Min: 0, Max: -1}
	for i, item := range s.items {
		if i > 0 {
			items = append(items, whitespace)
		}

		// at most a single occurrence, there are no repetitions to separate
		if item.Max >= 0 && item.Max <= 1 {
			items = append(items, item)
			continue
		}

		singleItem := SequenceItem{Name: item.Name, Min: 1, Max: 1}

		restName := item.Name + ":wsrest"
		restDef := newSequence(restName, Alias, []SequenceItem{whitespace, singleItem})
		defs = append(defs, restDef)

		// the first occurrence is matched by singleItem, so the rest is
		// repeated one time less, both for the lower and the upper bound
		restItems := SequenceItem{Name: restName, Min: 0, Max: -1}
		if item.Min > 0 {
			restItems.Min = item.Min - 1
		}

		if item.Max > 0 {
			restItems.Max = item.Max - 1
		}

		if item.Min > 0 {
			items = append(items, singleItem, restItems)
			continue
		}

		// zero occurrences are allowed, the first occurrence and the rest
		// have to be optional together
		optName := item.Name + ":wsopt"
		optDef := newSequence(optName, Alias, []SequenceItem{singleItem, restItems})
		defs = append(defs, optDef)
		items = append(items, SequenceItem{Name: optName, Min: 0, Max: 1})
	}

	return append(defs, newSequence(s.nodeName(), s.commitType(), items))
}
// applyWhitespace returns the definitions in defs with whitespace handling
// applied to the sequences among them. A definition is passed through
// unchanged when it has the NoWhitespace flag or when it is not a sequence;
// every other sequence is replaced, in place, by the definitions returned by
// applyWhitespaceToSeq.
func applyWhitespace(defs []definition) []definition {
	var patched []definition
	for _, d := range defs {
		if d.commitType()&NoWhitespace == 0 {
			if seq, isSeq := d.(*sequenceDefinition); isSeq {
				patched = append(patched, applyWhitespaceToSeq(seq)...)
				continue
			}
		}

		patched = append(patched, d)
	}

	return patched
}
// applyWhitespaceRoot wraps the root definition so that whitespace is accepted
// before and after it.
//
// The passed in definition is modified in place: it is renamed to
// ":wsroot:<name>", its Root flag is cleared and its Alias flag is set. A new
// sequence takes over the original name with the Root commit type, consisting
// of any number of whitespace items, the renamed original exactly once, and
// any number of whitespace items again.
//
// It returns the modified original and the new root, in this order. The root
// argument must not be nil.
func applyWhitespaceRoot(root definition) (definition, definition) {
	rootName := root.nodeName()
	innerName := ":wsroot:" + rootName

	root.setNodeName(innerName)
	root.setCommitType(root.commitType() &^ Root)
	root.setCommitType(root.commitType() | Alias)

	anyWhitespace := SequenceItem{Name: whitespaceName, Min: 0, Max: -1}
	wrapper := newSequence(rootName, Root, []SequenceItem{
		anyWhitespace,
		{Name: innerName, Min: 1, Max: 1},
		anyWhitespace,
	})

	return root, wrapper
}
// registerPatched stores the definitions in r, in the order they are passed
// in. It panics with a "broken registry" error on the first definition that
// the registry rejects; the definitions stored before that one stay in r.
func registerPatched(r *registry, defs ...definition) {
	for i := range defs {
		err := r.setDefinition(defs[i])
		if err == nil {
			continue
		}

		panic(brokenRegistryError(err))
	}
}
// initWhitespace applies the whitespace definitions of the registry r to the
// rest of its definitions.
//
// When r contains no definition flagged as Whitespace, r is returned as it is.
// Otherwise a new registry is built and returned, containing, in the order of
// registration:
//
//   - the choice that merges all the whitespace definitions,
//   - the whitespace definitions themselves,
//   - the non-root definitions, with whitespace applied to the sequences,
//   - the renamed original root and the new root wrapping it in whitespace.
//
// The definitions taken from r are modified in place during the process, and
// the function panics if any definition is rejected by the new registry.
func initWhitespace(r *registry) *registry {
	wsDefs, rest := splitWhitespaceDefs(r.definitions)
	if len(wsDefs) == 0 {
		return r
	}

	merged := mergeWhitespaceDefs(wsDefs)
	root, nonRoot := splitRoot(applyWhitespace(rest))
	innerRoot, outerRoot := applyWhitespaceRoot(root)

	patched := newRegistry()
	registerPatched(patched, merged)
	registerPatched(patched, wsDefs...)
	registerPatched(patched, nonRoot...)
	registerPatched(patched, innerRoot, outerRoot)
	return patched
}

View File

@ -4,137 +4,280 @@ import "testing"
const ( const (
csvWithoutWhitespaceSupport = ` csvWithoutWhitespaceSupport = `
ws:alias = [ \t]*; ws:alias = [ \t];
word-char:alias = [^\n, \t];
cell = (word-char (ws* word-char)*)?;
rest-cell:alias = "," ws* cell;
line = cell ws* (rest-cell (ws* rest-cell)*)?;
rest-line:alias = "\n" ws* line;
document = ws* (line ws* (rest-line (ws* rest-line)*)?)? ws*;
`
csvWithWhitespaceSupport = `
ws:ws = [ \t];
cell = [^\n, \t]*; cell = [^\n, \t]*;
line = ws cell (ws "," ws cell)* ws; line = cell ("," cell)*;
document = (line ("\n" line)*)?; document = (line ("\n" line)*)?;
` `
) )
func TestCSVWhitespace(t *testing.T) { func TestCSVWhitespace(t *testing.T) {
t.Run("wihout whitespace support", func(t *testing.T) { tests := []testItem{{
title: "empty",
node: &Node{
Name: "document",
},
}, {
title: "only a cell",
text: "abc",
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}},
}},
},
}, {
title: "single line",
text: `a, b, c`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "regular csv",
text: `a, b, c
d, e, f`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}, {
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "irregular csv",
text: `a,, b, c,
d, ,,,,`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}, {
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "too many commas",
text: `a,,`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "csv with tabs",
text: "a,\tb, c",
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "whitespace between lines",
text: " a, b, c \n d, e, f ",
node: &Node{
Name: "document",
To: 19,
Nodes: []*Node{{
Name: "line",
From: 1,
To: 8,
Nodes: []*Node{{
Name: "cell",
From: 1,
To: 2,
}, {
Name: "cell",
From: 4,
To: 5,
}, {
Name: "cell",
From: 7,
To: 8,
}},
}, {
Name: "line",
From: 11,
To: 18,
Nodes: []*Node{{
Name: "cell",
From: 11,
To: 12,
}, {
Name: "cell",
From: 14,
To: 15,
}, {
Name: "cell",
From: 17,
To: 18,
}},
}},
},
}, {
title: "just a space",
text: " ",
ignorePosition: true,
node: &Node{
Name: "document",
},
}, {
title: "cell with spaces in it",
text: "cell content 1/1, cell content 1/2\ncell content 2/1, cell content 2/2",
node: &Node{
Name: "document",
To: 69,
Nodes: []*Node{{
Name: "line",
To: 34,
Nodes: []*Node{{
Name: "cell",
To: 16,
}, {
Name: "cell",
From: 18,
To: 34,
}},
}, {
Name: "line",
From: 35,
To: 69,
Nodes: []*Node{{
Name: "cell",
From: 35,
To: 51,
}, {
Name: "cell",
From: 53,
To: 69,
}},
}},
},
}, {
title: "multiple empty lines",
text: "\n\n",
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}},
}, {
Name: "line",
Nodes: []*Node{{
Name: "cell",
}},
}, {
Name: "line",
Nodes: []*Node{{
Name: "cell",
}},
}},
},
}}
t.Run("without whitespace support", func(t *testing.T) {
s, err := openSyntaxString(csvWithoutWhitespaceSupport) s, err := openSyntaxString(csvWithoutWhitespaceSupport)
if err != nil { if err != nil {
t.Error(err) t.Error(err)
return return
} }
runTestsSyntax(t, s, []testItem{{ runTestsSyntax(t, s, tests)
title: "empty", })
node: &Node{
Name: "document", t.Run("with whitespace support", func(t *testing.T) {
}, s, err := openSyntaxString(csvWithWhitespaceSupport)
}, { if err != nil {
title: "only a cell", t.Error(err)
text: "abc", return
ignorePosition: true, }
node: &Node{
Name: "document", runTestsSyntax(t, s, tests)
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}},
}},
},
}, {
title: "single line",
text: `a, b, c`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "regular csv",
text: `a, b, c
d, e, f`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}, {
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "irregular csv",
text: `a,, b, c,
d, ,,,,`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}, {
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}, {
title: "too many commas",
text: `a,,`,
ignorePosition: true,
node: &Node{
Name: "document",
Nodes: []*Node{{
Name: "line",
Nodes: []*Node{{
Name: "cell",
}, {
Name: "cell",
}, {
Name: "cell",
}},
}},
},
}})
}) })
} }