500 lines
12 KiB
Python
500 lines
12 KiB
Python
#!/usr/bin/python2.7
|
|
|
|
import collections
|
|
|
|
class MatchResult(collections.namedtuple('MatchResult', ['context', 'nodes'])):
|
|
pass
|
|
|
|
|
|
class TextNode(object):
|
|
def __init__(self, textContent):
|
|
self.nodeName = '#text'
|
|
self.parentNode = None
|
|
self.textContent = textContent
|
|
self.previousSibling = None
|
|
self.nextSibling = None
|
|
self.childNodes = []
|
|
|
|
def cloneNode(self, deep):
|
|
return TextNode(self.textContent)
|
|
|
|
def getTextContent(self):
|
|
return self.textContent
|
|
|
|
def __str__(self):
|
|
return (self.textContent
|
|
.replace('&', '&')
|
|
.replace('<', '<')
|
|
.replace('>', '>'))
|
|
|
|
|
|
class Element(object):
|
|
def __init__(self, nodeName):
|
|
self.nodeName = nodeName
|
|
self.parentNode = None
|
|
self.previousSibling = None
|
|
self.nextSibling = None
|
|
self.childNodes = []
|
|
self.attributes = {}
|
|
|
|
def appendChild(self, child):
|
|
if child.parentNode:
|
|
child.parentNode.removeChild(child)
|
|
child.parentNode = self
|
|
self.childNodes.append(child)
|
|
if len(self.childNodes) == 1:
|
|
child.previousSibling = None
|
|
else:
|
|
beforeChild = self.childNodes[len(self.childNodes) - 2]
|
|
child.previousSibling = beforeChild
|
|
beforeChild.nextSibling = child
|
|
|
|
def removeChild(self, child):
|
|
self.childNodes.remove(child)
|
|
if child.previousSibling:
|
|
child.previousSibling.nextSibling = child.nextSibling
|
|
if child.nextSibling:
|
|
child.nextSibling.previousSibling = child.previousSibling
|
|
child.parentNode = None
|
|
child.previousSibling = None
|
|
child.nextSibling = None
|
|
|
|
def replaceChild(self, newNode, oldNode):
|
|
index = self.childNodes.index(oldNode)
|
|
self.childNodes[index] = newNode
|
|
newNode.parentNode = self
|
|
newNode.previousSibling = oldNode.previousSibling
|
|
newNode.nextSibling = oldNode.nextSibling
|
|
if newNode.previousSibling:
|
|
newNode.previousSibling.nextSibling = newNode
|
|
if newNode.nextSibling:
|
|
newNode.nextSibling.previousSibling = newNode
|
|
oldNode.parentNode = None
|
|
oldNode.previousSibling = None
|
|
oldNode.nextSibling = None
|
|
|
|
def insertBefore(self, newNode, oldNode):
|
|
index = self.childNodes.index(oldNode)
|
|
self.childNodes.insert(index, newNode)
|
|
if oldNode.previousSibling:
|
|
oldNode.previousSibling.nextSibling = newNode
|
|
newNode.previousSibling = oldNode.previousSibling
|
|
oldNode.previousSibling = newNode
|
|
newNode.nextSibling = oldNode
|
|
|
|
def normalize(self):
|
|
lastTextNode = None
|
|
for childNode in list(self.childNodes):
|
|
if isinstance(childNode, TextNode):
|
|
if lastTextNode:
|
|
lastTextNode.textContent += childNode.textContent
|
|
self.removeChild(childNode)
|
|
else:
|
|
lastTextNode = childNode
|
|
else:
|
|
lastTextNode = None
|
|
|
|
def renameNode(self, nodeName):
|
|
self.nodeName = nodeName
|
|
|
|
def setAttribute(self, key, value):
|
|
self.attributes[key] = value
|
|
|
|
def getAttribute(self, key):
|
|
return self.attributes[key]
|
|
|
|
def cloneNode(self, deep):
|
|
element = Element(self.nodeName)
|
|
if not deep:
|
|
return element
|
|
for childNode in self.childNodes:
|
|
element.appendChild(childNode.cloneNode(True))
|
|
for key, value in self.attributes.iteritems():
|
|
element.setAttribute(key, value)
|
|
return element
|
|
|
|
def getTextContent(self):
|
|
return ''.join(x.getTextContent() for x in self.childNodes)
|
|
|
|
def __str__(self):
|
|
# TODO: attributes
|
|
values = map(str, self.childNodes)
|
|
return '<%s%s>%s</%s>' % (
|
|
self.nodeName,
|
|
''.join(' %s="%s"' % (k, v.replace('"', '"'))
|
|
for k, v in self.attributes.iteritems()),
|
|
''.join(values),
|
|
self.nodeName)
|
|
|
|
|
|
# ============ Matchers ============
|
|
|
|
|
|
class Matcher(object):
|
|
pass
|
|
|
|
|
|
class CharExcept(Matcher):
|
|
def __init__(self, chars):
|
|
self._chars = chars
|
|
|
|
def match(self, context):
|
|
c = context.stringAfter(1)
|
|
if c and c not in self._chars:
|
|
yield MatchResult(
|
|
context.advance(1),
|
|
[TextNode(c)])
|
|
|
|
|
|
class EndOfLine(Matcher):
|
|
def match(self, context):
|
|
if context.atEnd():
|
|
yield MatchResult(
|
|
context,
|
|
[])
|
|
if context.stringAfter(1) == '\n':
|
|
yield MatchResult(
|
|
context.advance(1),
|
|
[])
|
|
if context.stringBefore(1) == '\n':
|
|
yield MatchResult(
|
|
context,
|
|
[])
|
|
|
|
|
|
class EndOfText(Matcher):
|
|
def match(self, context):
|
|
if context.atEnd():
|
|
yield MatchResult(
|
|
context,
|
|
[])
|
|
|
|
|
|
class Hidden(Matcher):
|
|
def __init__(self, child):
|
|
self._child = child
|
|
|
|
def match(self, context):
|
|
for result in self._child.match(context):
|
|
yield MatchResult(
|
|
result.context,
|
|
[])
|
|
|
|
|
|
class Insert(Matcher):
|
|
def __init__(self, value):
|
|
self._value = value
|
|
|
|
def match(self, context):
|
|
yield MatchResult(
|
|
context,
|
|
[TextNode(self._value)])
|
|
|
|
|
|
class Literal(Matcher):
|
|
def __init__(self, value):
|
|
self._value = value
|
|
|
|
def match(self, context):
|
|
if context.stringAfter(len(self._value)) == self._value:
|
|
yield MatchResult(
|
|
context.advance(len(self._value)),
|
|
[])
|
|
|
|
|
|
class Node(Matcher):
|
|
def __init__(self, name, child):
|
|
self._name = name
|
|
self._child = child
|
|
|
|
def match(self, context):
|
|
for result in self._child.match(context):
|
|
element = Element(self._name)
|
|
for node in result.nodes:
|
|
element.appendChild(node.cloneNode(True))
|
|
element.normalize()
|
|
yield MatchResult(
|
|
result.context,
|
|
[element])
|
|
|
|
|
|
class Or(Matcher):
|
|
def __init__(self, *options):
|
|
self._options = options
|
|
|
|
def match(self, context):
|
|
for option in self._options:
|
|
for result in option.match(context):
|
|
yield result
|
|
|
|
|
|
class Ref(Matcher):
|
|
def __init__(self, key):
|
|
self._key = key
|
|
|
|
def match(self, context):
|
|
return context.rules[self._key].match(context)
|
|
|
|
|
|
class SequentialPair(Matcher):
|
|
def __init__(self, child1, child2):
|
|
self._child1 = child1
|
|
self._child2 = child2
|
|
|
|
def match(self, context):
|
|
for result1 in self._child1.match(context):
|
|
for result2 in self._child2.match(result1.context):
|
|
yield MatchResult(
|
|
result2.context,
|
|
result1.nodes + result2.nodes)
|
|
|
|
|
|
class StartOfLine(Matcher):
|
|
def match(self, context):
|
|
if context.atStart():
|
|
yield MatchResult(
|
|
context,
|
|
[])
|
|
if context.stringAfter(1) == '\n':
|
|
yield MatchResult(
|
|
context.advance(1),
|
|
[])
|
|
if context.stringBefore(1) == '\n':
|
|
yield MatchResult(
|
|
context,
|
|
[])
|
|
|
|
|
|
class ZeroOrMore(Matcher):
|
|
def __init__(self, child):
|
|
self._pair = SequentialPair(child, self)
|
|
|
|
def match(self, context):
|
|
yield MatchResult(
|
|
context,
|
|
[])
|
|
for result in self._pair.match(context):
|
|
if result.context.remaining() == context.remaining():
|
|
raise Exception(
|
|
"Child or ZeroOrMore didn't consume input; grammar bug?")
|
|
yield result
|
|
|
|
|
|
# ============ Convenience factories ============
|
|
|
|
|
|
def Char():
|
|
return CharExcept('')
|
|
|
|
|
|
def MultiLineText():
|
|
return OneOrMore(Char())
|
|
|
|
|
|
def OneOrMore(child):
|
|
return SequentialPair(child, ZeroOrMore(child))
|
|
|
|
|
|
def Sequence(*children):
|
|
if len(children) == 1:
|
|
return children[0]
|
|
return SequentialPair(
|
|
children[0],
|
|
Sequence(*children[1:]))
|
|
|
|
|
|
def SingleLineText():
|
|
return OneOrMore(CharExcept('\n'))
|
|
|
|
|
|
|
|
# ============ Filter factories ============
|
|
|
|
|
|
def ChildToAttribute(parentName, childName):
|
|
def Filter(node):
|
|
if node.nodeName != parentName:
|
|
return
|
|
for childNode in node.childNodes:
|
|
if childNode.nodeName == childName:
|
|
node.setAttribute(childName, childNode.getTextContent())
|
|
node.removeChild(childNode)
|
|
break
|
|
return Filter
|
|
|
|
|
|
def ExtractElement(nodeName):
|
|
def Filter(node):
|
|
if node.nodeName != nodeName:
|
|
return
|
|
parentNode = node.parentNode
|
|
for childNode in node.childNodes:
|
|
parentNode.insertBefore(childNode, node)
|
|
parentNode.removeChild(node)
|
|
parentNode.normalize()
|
|
return Filter
|
|
|
|
|
|
def GroupSiblings(parentName, childNames):
|
|
def Filter(node):
|
|
if node.nodeName not in childNames:
|
|
return
|
|
nodes = []
|
|
iterNode = node
|
|
while iterNode and iterNode.nodeName in childNames:
|
|
nodes.append(iterNode)
|
|
iterNode = iterNode.nextSibling
|
|
newNode = Element(parentName)
|
|
node.parentNode.replaceChild(newNode, node)
|
|
for childNode in nodes:
|
|
newNode.appendChild(childNode)
|
|
return Filter
|
|
|
|
|
|
def RenameElement(oldName, newName):
|
|
def Filter(node):
|
|
if node.nodeName != oldName:
|
|
return
|
|
node.renameNode(newName)
|
|
return Filter
|
|
|
|
|
|
def SplitElementAndNest(originalName, newNames):
|
|
def Filter(node):
|
|
if node.nodeName != originalName:
|
|
return
|
|
outerNode = innerNode = None
|
|
for newName in newNames:
|
|
newNode = Element(newName)
|
|
if not outerNode:
|
|
outerNode = innerNode = newNode
|
|
else:
|
|
innerNode.appendChild(newNode)
|
|
innerNode = newNode
|
|
for childNode in node.childNodes:
|
|
innerNode.appendChild(childNode)
|
|
node.parentNode.replaceChild(outerNode, node)
|
|
return Filter
|
|
|
|
|
|
|
|
# ============ Scaffolding ============
|
|
|
|
|
|
def ApplyFilter(node, callback):
|
|
callback(node)
|
|
i = 0
|
|
while i < len(node.childNodes):
|
|
ApplyFilter(node.childNodes[i], callback)
|
|
i += 1
|
|
|
|
|
|
def ApplyFilters(node, filters):
|
|
for callback in filters:
|
|
ApplyFilter(node, callback)
|
|
|
|
|
|
class Context(object):
|
|
def __init__(self, rules, string, inputIndex=0):
|
|
self.rules = rules
|
|
self.string = string
|
|
self.inputIndex = inputIndex
|
|
|
|
def copy(self):
|
|
return Context(self.rules, self.string, self.inputIndex)
|
|
|
|
def stringAfter(self, numChars=None):
|
|
if numChars is None:
|
|
numChars = self.remaining()
|
|
return self.string[self.inputIndex:self.inputIndex + numChars]
|
|
|
|
def stringBefore(self, numChars):
|
|
start = self.inputIndex - numChars
|
|
if start < 0:
|
|
numChars += start
|
|
start = 0
|
|
return self.string[start:start + numChars]
|
|
|
|
def atStart(self):
|
|
return self.inputIndex == 0
|
|
|
|
def atEnd(self):
|
|
return self.remaining() == 0
|
|
|
|
def remaining(self):
|
|
return len(self.string) - self.inputIndex
|
|
|
|
def advance(self, numChars):
|
|
if not numChars:
|
|
raise Exception('Context.advance(0) called')
|
|
context = self.copy()
|
|
context.inputIndex += numChars
|
|
return context
|
|
|
|
|
|
class Parser(object):
|
|
@classmethod
|
|
def fromFile(cls, filename):
|
|
fh = open(filename, 'r')
|
|
grammar = fh.read()
|
|
compiled = compile(grammar, filename, 'exec')
|
|
glbls = {
|
|
'rr': rr(),
|
|
}
|
|
eval(compiled, glbls)
|
|
newKeys = (set(glbls) - {'__builtins__', 'rr'})
|
|
assert len(newKeys) == 1, newKeys
|
|
value = glbls[newKeys.pop()]
|
|
assert isinstance(value, cls), value
|
|
return value
|
|
|
|
def __init__(self, rules, filters):
|
|
self.rules = rules
|
|
self.filters = filters
|
|
|
|
def parseFromString(self, string):
|
|
context = Context(self.rules, string)
|
|
for result in context.rules['main'].match(context):
|
|
rootNode = result.nodes[0]
|
|
ApplyFilters(rootNode, self.filters)
|
|
return rootNode
|
|
return None
|
|
|
|
|
|
class rr(object):
|
|
_SYMBOLS = {
|
|
# Matchers
|
|
'CharExcept': CharExcept,
|
|
'EndOfLine': EndOfLine,
|
|
'EndOfText': EndOfText,
|
|
'Hidden': Hidden,
|
|
'Insert': Insert,
|
|
'Literal': Literal,
|
|
'Node': Node,
|
|
'Or': Or,
|
|
'Ref': Ref,
|
|
'SequentialPair': SequentialPair,
|
|
'StartOfLine': StartOfLine,
|
|
'ZeroOrMore': ZeroOrMore,
|
|
|
|
# Convenience factories
|
|
'Char': Char,
|
|
'MultiLineText': MultiLineText,
|
|
'OneOrMore': OneOrMore,
|
|
'Sequence': Sequence,
|
|
'SingleLineText': SingleLineText,
|
|
|
|
# Filter factories
|
|
'ChildToAttribute': ChildToAttribute,
|
|
'ExtractElement': ExtractElement,
|
|
'GroupSiblings': GroupSiblings,
|
|
'RenameElement': RenameElement,
|
|
'SplitElementAndNest': SplitElementAndNest,
|
|
|
|
# Scaffolding
|
|
'Parser': Parser,
|
|
}
|
|
|
|
def __getattr__(self, key):
|
|
return self._SYMBOLS[key]
|