From 835979f95e75c6806b248321271843278b216768 Mon Sep 17 00:00:00 2001 From: Ian Gulliver Date: Tue, 20 May 2014 16:40:35 -0700 Subject: [PATCH] Start of parsing of mediawiki text. --- grammars/mediawiki.js | 5 +- recentrunes.js | 192 +++++++++++++++++++++++++++++++++++++----- test.js | 15 ++-- 3 files changed, 183 insertions(+), 29 deletions(-) diff --git a/grammars/mediawiki.js b/grammars/mediawiki.js index e75e5ae..c574979 100644 --- a/grammars/mediawiki.js +++ b/grammars/mediawiki.js @@ -9,7 +9,7 @@ var mediawiki = { 'hr': [rr.StartOfLine(), rr.Literal('----'), rr.EndOfLine()], 'i': [rr.Literal("''"), rr.Ref('wikichunk'), rr.Literal("''")], 'nowiki': [rr.Literal(''), rr.MultiLineText(), rr.Literal('')], - 'nowikiword': [rr.WordText(), rr.Literal('')], + 'text': [rr.MultiLineText()], 'wikichunk': [rr.Or( 'b', 'del', @@ -21,8 +21,7 @@ var mediawiki = { 'hr', 'i', 'nowiki', - 'nowikiword', - rr.MultiLineText() + 'text' )], 'wikidoc': [rr.ZeroOrMore('wikichunk')], }; diff --git a/recentrunes.js b/recentrunes.js index 6d78900..1208d9f 100644 --- a/recentrunes.js +++ b/recentrunes.js @@ -1,5 +1,7 @@ var rr = {}; + + rr.Literal_ = function(value) { this.value_ = value; }; @@ -28,10 +30,19 @@ rr.Literal = function(value) { rr.Literal.cache = {}; + rr.Ref_ = function(key) { this.key_ = key; }; +rr.Ref_.prototype.minimize = function(parser) { + return parser.minimize(this.key_); +}; + +rr.Ref_.prototype.match = function(input, fullInput, inputIndex, parser) { + return parser.parse(this.key_, input); +}; + rr.Ref = function(key) { return (rr.Ref.cache[key] || (rr.Ref.cache[key] = new rr.Ref_(key))); @@ -39,19 +50,74 @@ rr.Ref = function(key) { rr.Ref.cache = {}; + rr.EndOfLine_ = function() { }; +rr.EndOfLine_.prototype.match = function(input, fullInput, inputIndex) { + if (input.length == 0) { + return [0, null]; + } + if (input[0] == '\n') { + return [1, null]; + } + if (inputIndex > 0 && fullInput[inputIndex - 1] == '\n') { + return [0, null]; + } + return null; +}; + +rr.EndOfLine_.prototype.search = function(input, fullInput, inputIndex) { + if (input.length == 0) { + return 0; + } + var loc = input.indexOf('\n'); + if (loc == -1) { + return input.length; + } else { + return loc; + } +}; + rr.EndOfLine = function() { return rr.EndOfLine.cache; -} +}; rr.EndOfLine.cache = new rr.EndOfLine_(); + +rr.EndOfText_ = function() { +}; + +rr.EndOfText_.prototype.match = function(input) { + if (input.length) { + return null; + } else { + return [0, null]; + } +}; + +rr.EndOfText_.prototype.search = function(input) { + return input.length; +}; + +rr.EndOfText = function() { + return rr.EndOfText.cache; +}; +rr.EndOfText.cache = new rr.EndOfText_(); + + + rr.MultiLineText_ = function() { }; -rr.MultiLineText_.prototype.minimize = true; +rr.MultiLineText_.prototype.minimize = function() { + return true; +}; + +rr.MultiLineText_.prototype.match = function(input) { + return [input.length, document.createTextNode(input)]; +}; rr.MultiLineText = function() { return rr.MultiLineText.cache; @@ -59,19 +125,44 @@ rr.MultiLineText = function() { rr.MultiLineText.cache = new rr.MultiLineText_(); + rr.Or_ = function(options) { this.options_ = options; }; +rr.Or_.prototype.minimize = function(parser) { + for (var i = 0; i < this.options_.length; i++) { + var option = this.options_[i]; + if (parser.minimize(option)) { + return true; + }; + } + return false; +}; + +rr.Or_.prototype.match = function(input, fullInput, inputIndex, parser) { + for (var i = 0; i < this.options_.length; i++) { + var option = this.options_[i]; + var result = parser.parse(option, input); + if (result) { + return result; + } + } + return null; +}; + rr.Or = function() { return new rr.Or_(arguments); }; + rr.SingleLineText_ = function() { }; -rr.SingleLineText_.prototype.minimize = true; +rr.SingleLineText_.prototype.minimize = function() { + return true; +} rr.SingleLineText_.prototype.match = function(input) { var newLine = input.indexOf('\n'); @@ -88,30 +179,65 @@ rr.SingleLineText = function() { rr.SingleLineText.cache = new rr.SingleLineText_(); + rr.StartOfLine_ = function() { }; +rr.StartOfLine_.prototype.match = function(input, fullInput, inputIndex) { + if (inputIndex == 0) { + return [0, null]; + } + if (input[0] == '\n') { + return [1, null]; + } + if (fullInput[inputIndex - 1] == '\n') { + return [0, null]; + } + return null; +}; + +rr.StartOfLine_.prototype.search = function(input, fullInput, inputIndex) { + if (inputIndex == 0) { + return 0; + } + var loc = input.indexOf('\n'); + if (loc == -1) { + return null; + } + return loc + 1; +}; + rr.StartOfLine = function() { return rr.StartOfLine.cache; }; rr.StartOfLine.cache = new rr.StartOfLine_(); -rr.WordText_ = function() { -}; - -rr.WordText_.prototype.minimize = true; - -rr.WordText = function() { - return rr.WordText.cache; -}; -rr.WordText.cache = new rr.WordText_(); - rr.ZeroOrMore_ = function(key) { this.key_ = key; }; +rr.ZeroOrMore_.prototype.minimize = function(parser) { + return parser.minimize(this.key_); +}; + +rr.ZeroOrMore_.prototype.match = function(input, fullInput, inputIndex, parser) { + var ret = document.createElement('group'); + var parseIndex = 0; + while (parseIndex < input.length - 1) { + var result = parser.parse(this.key_, input.slice(parseIndex)); + if (!result) { + break; + } + parseIndex += result[0]; + if (result[1]) { + ret.appendChild(result[1]); + } + }; + return [parseIndex, ret]; +}; + rr.ZeroOrMore = function(key) { return (rr.ZeroOrMore.cache[key] || (rr.ZeroOrMore.cache[key] = new rr.ZeroOrMore_(key))); @@ -119,10 +245,21 @@ rr.ZeroOrMore = function(key) { rr.ZeroOrMore.cache = {}; + var RecentRunes = function(dictionary) { this.dictionary_ = dictionary; }; +RecentRunes.prototype.minimize = function(nodeType) { + var rules = this.dictionary_[nodeType]; + for (var i = 0; i < rules.length; i++) { + if (rules.minimize && rules.minimize(this)) { + return true; + } + } + return false; +}; + RecentRunes.prototype.parse = function(nodeType, input) { var ret = document.createElement(nodeType); var rules = this.dictionary_[nodeType]; @@ -130,12 +267,12 @@ RecentRunes.prototype.parse = function(nodeType, input) { var lastRuleMinimize = false; for (var i = 0; i < rules.length; i++) { var rule = rules[i]; - if (rule.minimize) { + if (rule.minimize && rule.minimize(this)) { if (lastRuleMinimize) { // Two minimize rules in a row is ambiguous return null; } - lastRuleMinimize = true; + lastRuleMinimize = rule; continue; } if (lastRuleMinimize) { @@ -146,17 +283,21 @@ RecentRunes.prototype.parse = function(nodeType, input) { } // Check if the previous rule will match the interim data - var prevMatch = rules[i - 1].match( - input.slice(inputIndex, inputIndex + loc)); - if (!prevMatch) { + var prevMatch = lastRuleMinimize.match( + input.slice(inputIndex, inputIndex + loc), + input, inputIndex, this); + if (!prevMatch || prevMatch[0] != loc) { return null; }; inputIndex += prevMatch[0]; if (prevMatch[1]) { ret.appendChild(prevMatch[1]); } + + lastRuleMinimize = false; } - var match = rule.match(input.slice(inputIndex)); + var match = rule.match( + input.slice(inputIndex), input, inputIndex, this); if (!match) { return null; } @@ -165,5 +306,18 @@ RecentRunes.prototype.parse = function(nodeType, input) { ret.appendChild(match[1]); } }; + + if (lastRuleMinimize) { + var lastMatch = lastRuleMinimize.match( + input.slice(inputIndex), input, inputIndex, this); + if (!lastMatch || lastMatch[0] != input.length - inputIndex) { + return null; + } + inputIndex += lastMatch[0]; + if (lastMatch[1]) { + ret.appendChild(lastMatch[1]); + } + } + return [inputIndex, ret]; }; diff --git a/test.js b/test.js index 6b664d2..4bf5b60 100644 --- a/test.js +++ b/test.js @@ -1,10 +1,11 @@ test('Simple', function() { expect(0); - var grammar = { - 'rule1': [rr.Literal('=== '), rr.SingleLineText(), rr.Literal(' ===')], - 'rule2': [rr.SingleLineText(), rr.Literal('=')], - }; - var parser = new RecentRunes(grammar); - console.log(parser.parse('rule1', '=== bar ===')); - console.log(parser.parse('rule2', 'foo=\nbar=')); + var parser = new RecentRunes(mediawiki); + var result = parser.parse('wikidoc', +'=== Heading ===\n\ +This is a wiki doc.\n\ +How about some bold and bold italic.\n\ +I would also love some nowiki foo'); + console.log(result); + document.body.appendChild(result[1]); });