1
0
www/src/project/sl3.html

554 lines
25 KiB
HTML

<html>
<head>
<title>Comparative SL3 Parsing</title>
<script type="text/javascript">
// Shared helper for the operator buttons: appends one symbol to the end of
// whatever is currently typed in the sentence box.
function appendToSentence(symbol) {
  var box = document.getElementById("sentence");
  box.value = box.value + symbol;
}
// Button handlers, one per connective that is awkward to type on a keyboard.
function band() { appendToSentence("\u2227"); } // conjunction sign
function bor() { appendToSentence("\u2228"); } // disjunction sign
function barr() { appendToSentence("\u2192"); } // conditional arrow
function bbic() { appendToSentence("\u2194"); } // biconditional arrow
// Token identifiers. The tokenizer emits these integers and the parsing
// functions compare against them by name rather than by raw number.
var ERROR = 0; // a character that is not part of the grammar
var ATOM = 1; // an atomic sentence letter
var NEGATE = 2; // "~"
var LPAR = 3; // "("
var RPAR = 4; // ")"
var ARROW = 5; // conditional, \u2192
var BICOND = 6; // biconditional, \u2194
var AND = 7; // conjunction, \u2227
var OR = 8; // disjunction, \u2228
/**
 * Converts a sentence into a list of token identifiers, one token per character.
 *
 * @param {string} s - The sentence text to tokenize.
 * @returns {number[]} The tokens in input order, or [ ERROR ] as soon as a
 *     character outside the grammar is encountered. An empty string yields [].
 */
function tokenize(s) {
  var tokens = [];
  for (var i = 0; i < s.length; i++) {
    switch (s[i]) {
      case "A":
      case "B":
      case "C":
        tokens.push(ATOM);
        break;
      case "~":
        tokens.push(NEGATE);
        break;
      case "(":
        tokens.push(LPAR);
        break;
      case ")":
        tokens.push(RPAR);
        break;
      case "\u2192":
        tokens.push(ARROW);
        break;
      case "\u2194":
        tokens.push(BICOND);
        break;
      case "\u2227":
        tokens.push(AND);
        break;
      case "\u2228":
        tokens.push(OR);
        break;
      default:
        // Log before returning: the message used to sit after the return
        // statement, where it could never execute.
        console.log("Parsing error");
        // Discard everything collected so far; a lone ERROR token cannot be
        // matched by any parsing function, so the sentence is rejected.
        return [ ERROR ];
    }
  }
  return tokens;
}
/**
 * Reads the sentence box, checks the sentence against both grammars (binary
 * junctions only, and extended junctions), logs the outcome to the console and
 * updates the two status headings on the page.
 */
function parseInputBox() {
  // Writes the verdict text and colour class into one status heading.
  function showVerdict(elementId, isValid) {
    var heading = document.getElementById(elementId);
    heading.innerHTML = isValid ? "\u2714 Valid" : "\u2716 Invalid";
    heading.className = isValid ? "green" : "red";
  }

  // Leading and trailing whitespace is not part of the sentence.
  var sentenceBox = document.getElementById("sentence");
  var input = sentenceBox.value.replace(/^\s+|\s+$/g, '');

  // Work on integer tokens rather than raw characters.
  var tokens = tokenize(input);

  // A sentence is valid only if it parses AND nothing is left over afterwards.
  var binaryResult = parseSentence(tokens, false);
  var validBinary = binaryResult[0] && binaryResult[1].length == 0;
  var extendedResult = parseSentence(tokens, true);
  var validExtended = extendedResult[0] && extendedResult[1].length == 0;

  // Trace: input -> tokens -> binary verdict / extended verdict
  console.log(input + " -> [" + tokens.join(",") + "] -> " + validBinary + "/" + validExtended);

  showVerdict("status-left", validBinary);
  showVerdict("status-right", validExtended);
}
/**
 * Tries to parse an atomic sentence from the head of the token list.
 *
 * @param {number[]} tokens - The tokens still to be parsed.
 * @param {boolean} flag - Unused here; accepted so every parser shares a signature.
 * @returns {Array} [ true, remaining tokens ] if the list starts with an ATOM,
 *     otherwise [ false, tokens ] with the list untouched.
 */
function parseAtom(tokens, flag) {
  var startsWithAtom = tokens[0] == ATOM;
  return startsWithAtom
    ? [ true, tokens.slice(1) ] // consume the single ATOM token
    : [ false, tokens ];
}
/**
 * Tries to parse a negation: a NEGATE token followed by a complete sentence.
 *
 * @param {number[]} tokens - The tokens still to be parsed.
 * @param {boolean} flag - Passed through unchanged to parseSentence().
 * @returns {Array} The successful result of parsing the negated sentence (so
 *     both the NEGATE and the sentence are consumed), or [ false, tokens ]
 *     with the original list if there was no NEGATE or no sentence after it.
 */
function parseNegation(tokens, flag) {
  if (tokens[0] == NEGATE) {
    // Drop the NEGATE and see whether a sentence follows it.
    var operand = parseSentence(tokens.slice(1), flag);
    if (operand[0]) {
      return operand;
    }
  }
  // Hand back the untouched list so the NEGATE is not consumed on failure.
  return [ false, tokens ];
}
/**
 * Tries to parse a two-place compound: LPAR, sentence, operator, sentence, RPAR.
 *
 * @param {number[]} tokens - The tokens still to be parsed.
 * @param {number} operator - The operator token expected between the two sentences.
 * @param {boolean} flag - Passed through unchanged to parseSentence().
 * @returns {Array} [ true, remaining tokens ] once the closing RPAR has been
 *     consumed, or [ false, tokens ] with the original list if any expected
 *     piece is missing.
 */
function parseBinaryOperator(tokens, operator, flag) {
  // Every failure path reports the list exactly as it was received.
  var failure = [ false, tokens ];

  if (tokens[0] != LPAR) {
    return failure;
  }

  // Left-hand sentence, which must be followed by the requested operator.
  var left = parseSentence(tokens.slice(1), flag);
  if (!left[0] || left[1][0] != operator) {
    return failure;
  }

  // Right-hand sentence, which must be followed by the closing paren.
  var right = parseSentence(left[1].slice(1), flag);
  if (!right[0] || right[1][0] != RPAR) {
    return failure;
  }

  return [ true, right[1].slice(1) ]; // drop the RPAR as well
}
/**
 * Tries to parse an extended junction: LPAR, then two or more sentences
 * separated by the same operator token, then RPAR.
 *
 * @param {number[]} tokens - The tokens still to be parsed.
 * @param {number} operator - The operator token expected between the sentences.
 * @param {boolean} flag - Passed through unchanged to parseSentence().
 * @returns {Array} [ true, remaining tokens ] once the closing RPAR has been
 *     consumed, or [ false, tokens ] with the original list if the junction is
 *     malformed or the tokens run out before it is closed.
 */
function parseNAryOperator(tokens, operator, flag) {
  if (tokens[0] != LPAR) {
    return [ false, tokens ];
  }

  var cursor = tokens.slice(1); // everything after the LPAR
  var operandCount = 0;
  for (;;) {
    // Each pass must start with a sentence.
    var operand = parseSentence(cursor, flag);
    if (!operand[0]) {
      return [ false, tokens ];
    }
    cursor = operand[1];
    operandCount++;

    // A junction may only close after at least two sentences.
    if (operandCount >= 2) {
      if (cursor.length === 0) {
        return [ false, tokens ]; // ran out of tokens before the RPAR
      }
      if (cursor[0] == RPAR) {
        return [ true, cursor.slice(1) ]; // consume the RPAR and finish
      }
    }

    // Otherwise the junction continues, so the same operator must come next.
    if (cursor[0] != operator) {
      return [ false, tokens ];
    }
    cursor = cursor.slice(1); // consume the operator
  }
}
/**
 * Tries every grammar rule in turn and returns the result of the first one
 * that succeeds.
 *
 * @param {number[]} tokens - The tokens still to be parsed.
 * @param {boolean} flag - When true, conjunctions and disjunctions are parsed
 *     with parseNAryOperator(); when false, with parseBinaryOperator().
 * @returns {Array} The successful rule's [ true, remaining tokens ] result, or
 *     [ false, tokens ] with the original list if no rule matches.
 */
function parseSentence(tokens, flag) {
  // Rules without an operator argument: atomic sentence first, then negation.
  var atom = parseAtom(tokens, flag);
  if (atom[0]) {
    return atom;
  }
  var negation = parseNegation(tokens, flag);
  if (negation[0]) {
    return negation;
  }

  // Parenthesised rules, tried in order: conditional, biconditional,
  // conjunction, disjunction. Only the two junctions depend on the flag.
  var parseJunction = flag ? parseNAryOperator : parseBinaryOperator;
  var rules = [
    [ parseBinaryOperator, ARROW ],
    [ parseBinaryOperator, BICOND ],
    [ parseJunction, AND ],
    [ parseJunction, OR ]
  ];
  for (var r = 0; r < rules.length; r++) {
    var parser = rules[r][0];
    var attempt = parser(tokens, rules[r][1], flag);
    if (attempt[0]) {
      return attempt; // stop at the first rule that matches
    }
  }

  // Closure clause: whatever matches none of the rules is not a sentence.
  return [ false, tokens ];
}
</script>
<style>
#sentence {
text-align: center;
}
#status-box-left {
display: inline-block;
border: 1px solid black;
width: 400px;
}
#status-box-right {
display: inline-block;
border: 1px solid black;
width: 400px;
}
.green {
color: #228b22;
}
.red {
color: #c00000;
}
.text1 {
text-align: left;
margin: 0 200px;
}
.text2 {
vertical-align:top;
display: inline-block;
width: 500px;
text-align: left;
}
</style>
</head>
<body>
<center>
<h1>Comparative SL3 Parsing</h1>
<p>
<input id="sentence" type="text" value="A" onchange="javascript:parseInputBox()">
<button onclick="javascript:parseInputBox()">&#8634;</button>
</p>
<p>
<button onclick="javascript:band()">&#8743;</button>
<button onclick="javascript:bor()">&#8744;</button>
<button onclick="javascript:barr()">&#8594;</button>
<button onclick="javascript:bbic()">&#8596;</button>
</p>
<br>
<div id="status-box-left">
<p>Input sentence validity in SL3 without extended junctions:</p>
<h2 id="status-left" class="green">&#10004; Valid</h2>
</div>
<div id="status-box-right">
<p>Input sentence validity in SL3 with extended junctions:</p>
<h2 id="status-right" class="green">&#10004; Valid</h2>
</div>
<br>
<div class="text1">
<center><h2>Recursive descent parsing</h2></center>
<p>Languages that are defined by recursive grammars can be parsed by means of a recursive descent parser.
A recursive descent parser will consist of two main pieces: the tokenizer and the parser. The tokenizer
converts the input, which is a string of characters where one element may consist of multiple characters,
into a sequence of tokens, which abstractly represent the structural elements of the sequence. For example,
a natural language parser might tokenize a string of characters by splitting on whitespace and converting
each group of letters into a WORD token, and a JSON parser might convert a string of numerals and notation
characters into a JNUMBER token.</p>
<p>The parser takes in a sequence of tokens and decides whether it can be parsed according to a set grammar.
To do this, the parser defines recursive functions that correspond to each rule in the recursive grammar.
A particular method call to one of these functions can then call whatever other recursive functions are
required to validate that grammar rule. For example, a function that parses math expressions might call an
expression parser on each side of a + token. At each step in the parsing procedure, recursive calls will
consume some number of tokens off of the token list. If a recursive call fails, the original token list
will be returned instead of a list with some tokens consumed, so the parent function call can try another
recursive call or fail.</p>
<center><h2>Writing an SL3 RDP: Tokenizer</h2></center>
<p>SL3 is defined by the following recursive grammar rule, where &#934; is a sentence:</p>
<pre>
&#934; := A | B | C | ~&#934; | (&#934;&#8743;&#934;) | (&#934;&#8744;&#934;) | (&#934;&#8594;&#934;) | (&#934;&#8596;&#934;)</pre>
<p>The closure clause is implicit. Thus the characters we should expect to see are A, B, C, ~, (, ), &#8743;,
&#8744;, &#8594;, and &#8596;. Fortunately for the tokenizer, we don't have any sequence elements that are
more than one character long, so we can simply convert the sequence of characters into tokens. We represent
the tokens as integers under the hood, but in the code for our parser we'll use aliases so it's clear what
kind of token we're talking about.</p>
<pre>
1 // Declare aliases for token identifiers
2 var ERROR = 0, ATOM = 1, NEGATE = 2, LPAR = 3, RPAR = 4, ARROW = 5,
3 BICOND = 6, AND = 7, OR = 8;
4
5 // Converts each symbol to a token and returns a list of tokens
6 function tokenize(s) {
7 var tokens = [];
8 for (var i = 0; i < s.length; i++) {
9 if (/[ABC]/.test(s[i])) {
10 tokens.push(ATOM);
11 } else if (s[i] == ("~")) {
12 tokens.push(NEGATE);
13 } else if (s[i] == ("(")) {
14 tokens.push(LPAR);
15 } else if (s[i] == (")")) {
16 tokens.push(RPAR);
17 } else if (s[i] == ("\u2192")) {
18 tokens.push(ARROW);
19 } else if (s[i] == ("\u2194")) {
20 tokens.push(BICOND);
21 } else if (s[i] == ("\u2227")) {
22 tokens.push(AND);
23 } else if (s[i] == ("\u2228")) {
24 tokens.push(OR);
25 } else {
26 return [ ERROR ];
27 console.log("Parsing error")
28 }
29 }
30 return tokens;
31 }</pre>
<p>Some things to note: We define an ERROR token that gets returned when we encounter a character that
isn't part of the grammar (25-28). If we find an illegal character, we know immediately that the sequence
won't parse. The parser won't try to interpret an ERROR token; it will simply fail immediately. Also note that we can
match all the atomic sentences with a single regular expression match (9).</p>
<center><h2>Writing an SL3 RDP: Parser</h2></center>
<p>For the parser, we can simplify the number of functions required by matching the rules by their structural
similarity:</p>
<pre>
&#934; := A | B | C
&#934; := ~&#934;
&#934; := (&#934;&#8743;&#934;) | (&#934;&#8744;&#934;) | (&#934;&#8594;&#934;) | (&#934;&#8596;&#934;)</pre>
<p>The atomic sentences consist of a single ATOM token, negative sentences of a NEGATE token followed by a
sentence, and the other complicated sentences by two sentences around an operator token, flanked by an LPAR
token and an RPAR token. Using the computer scientist's first resort, another layer of indirection, we can
get away with writing three functions: one to try and parse an atomic sentence from the token list, one to
try to parse a negative sentence, and one to parse a sentence given some binary operator. Let's look at these
in order. Ignore <i>flag</i> for now; we'll make use of that later.</p>
<pre>
1 function parseAtom(tokens, flag) {
2 if (tokens[0] != ATOM) {
3 return [ false, tokens ];
4 }
5 return [ true, tokens.slice(1) ];
6 }</pre>
<p>If all we need to successfully parse an atomic sentence is a single ATOM token, then it suffices to find
one of them at the head of the token list. If we don't find one, we report that we failed to parse an
atomic sentence and return an untouched token list (2-4). If we do find one, we consume the ATOM token and return
the rest of the tokens as a success (5). The <i>.slice()</i> function in Javascript returns a subsequence of a list,
which we use to remove tokens from the head of the list.</p>
<pre>
1 function parseNegation(tokens, flag) {
2 if (tokens[0] != NEGATE) {
3 return [ false, tokens ];
4 }
5 var iter = tokens.slice(1);
6 var parseTry = parseSentence(iter, flag);
7 if (!parseTry[0]) {
8 return [ false, tokens ];
9 }
10 return parseTry;
11 }</pre>
<p>Here we see our first instance of recursion. A negative sentence <i>in toto</i> must consist in a NEGATE
token, followed by a sequence of tokens that make up a valid sentence. Thus, we first check to see if the
first condition is met, returning a failure if it is not (2-4). If it is, then we consume it (5) and pass the
rest of the tokens recursively to another parsing function (6). We'll see the internals of parseSentence()
later. For now, we'll note that it returns the same sort of data structure that the other parsing functions
return, which is a tuple of whether the parsing succeeded and a list of unconsumed tokens. If the parsing was a
failure, we return a failure as in line 3 (7-9). Note that in line 8, we return <i>tokens</i>, not
<i>parseTry[1]</i>. We don't want to consume the NEGATE token if we couldn't parse a negative sentence after
all. If the parsing did succeed, then the return value is a tuple with a parsing success and the tokens
left unconsumed by the sentence, which is just what <i>parseNegation()</i> wants to return.</p>
<pre>
1 function parseBinaryOperator(tokens, operator, flag) {
2 if (tokens[0] != LPAR) {
3 return [ false, tokens ];
4 }
5 var iterA = tokens.slice(1);
6 var parseTryOne = parseSentence(iterA, flag);
7 if (!parseTryOne[0]) {
8 return [ false, tokens ];
9 }
10 if (parseTryOne[1][0] != operator) {
11 return [ false, tokens ];
12 }
13 var iterB = parseTryOne[1].slice(1);
14 var parseTryTwo = parseSentence(iterB, flag);
15 if (!parseTryTwo[0]) {
16 return [ false, tokens ];
17 }
18 if (parseTryTwo[1][0] != RPAR) {
19 return [ false, tokens ];
20 }
21 return [ true, parseTryTwo[1].slice(1) ];
22 }</pre>
<p>Finally, we come to the binary operator function. All four of the binary operator sentence types share a common
structure, differing only in which operator token is in the middle, so if we pass that in as the <i>operator</i>
argument to the function, we can represent all four sentences by passing different tokens to the function. In this
function we also see two instances of recursion on lines 6 and 14. The overall effect is to try and consume an
LPAR token (2-5), then however many tokens are required to parse a valid sentence (6-9), then whichever token is
the operator for this function call (10-13), then however many tokens are required to parse another valid sentence
(14-17), then finally an RPAR token (18-21). If at any point the expected tokens aren't found, then a binary
operator sentence can't be parsed, and the token list is returned unchanged as a failure.</p>
<p><i>parseNegation()</i> and <i>parseBinaryOperator()</i> accomplished their recursive descent by calling a
generalized <i>parseSentence()</i> function. What does this function do? <i>parseSentence()</i> is what ties all of
the recursive descent parsing functions together by trying each of them in turn and returning the results of
whichever of them worked:</p>
<pre>
1 function parseSentence(tokens, flag) {
2 var tryAtom = parseAtom(tokens, flag);
3 if (tryAtom[0])
4 return tryAtom;
5 var tryNegation = parseNegation(tokens, flag);
6 if (tryNegation[0])
7 return tryNegation;
8 var tryConditional = parseBinaryOperator(tokens, ARROW, flag);
9 if (tryConditional[0])
10 return tryConditional;
11 var tryBicondition = parseBinaryOperator(tokens, BICOND, flag);
12 if (tryBicondition[0])
13 return tryBicondition;
14 var tryConjunction = parseBinaryOperator(tokens, AND, flag);
15 if (tryConjunction[0])
16 return tryConjunction;
17 var tryDisjunction = parseBinaryOperator(tokens, OR, flag);
18 if (tryDisjunction[0])
19 return tryDisjunction;
20 return [ false, tokens ];
21 }</pre>
<p>Note that line 20 perfectly encapsulates what the closure clause does in the definition of SL3: if something
does not follow by one of the given rules i.e. does not parse according to any of the defined parsing functions,
then it is not a sentence, and the parser returns a failure to the calling context.</p>
<p>All that's left is to put the two pieces together. The function to parse a given input will look something like
this:</p>
<pre>
1 function parseSL3(input) {
2 var tokens = tokenize(input);
3 var parsed = parseSentence(tokens, false);
4 var valid = parsed[0] && parsed[1].length == 0;
5
6 // Do something with the result
7 }</pre>
<p>On line 2 we pass the input to the tokenizer to convert it from a character sequence to a token sequence. On
line 3 we pass this token sequence to the general parsing function, which will then try all the parsing functions
until it finds one that works, which will itself do some parsing, possibly including more recursive calls to
parsing functions, and so on. After all of that is done, we receive back a tuple of whether it worked and a list
of all the leftover tokens. On line 4 we establish that an input sentence is valid not only if it parsed validly,
but also if there were no tokens left over. This must be checked because otherwise superfluous characters could
be added to the end of a valid sentence.</p>
<center><h2>Writing an SL3 RDP: Extended junctions</h2></center>
<p>The above code will parse any sentence according to the given rules of SL3 sentences:</p>
<pre>
&#934; := A | B | C
&#934; := ~&#934;
&#934; := (&#934;&#8743;&#934;) | (&#934;&#8744;&#934;) | (&#934;&#8594;&#934;) | (&#934;&#8596;&#934;)</pre>
<p>However, we might want to extend these rules so that long junctions don't have to involve nesting each
subsentence within another pair of parentheses:</p>
<pre>
&#934; := A | B | C
&#934; := ~&#934;
&#934; := (&#934;&#8594;&#934;) | (&#934;&#8596;&#934;)
&#934; := (&#934;&#8743;...&#8743;&#934;) | (&#934;&#8744;...&#8744;&#934;)</pre>
<p>We can accomplish this by writing a different parsing function for junctions that can handle an arbitrary
finite number of conjuncts or disjuncts. This function will begin much like the binary operator parser, before
diverging at the end. The different steps have been slightly separated to make the parallels clearer.</p>
</div>
<div class="text2"><pre>
1 function parseBinaryOperator(tokens, operator, flag) {
2 if (tokens[0] != LPAR) {
3 return [ false, tokens ];
4 }
5 var iterA = tokens.slice(1);
6 var parseTryOne = parseSentence(iterA, flag);
7 if (!parseTryOne[0]) {
8 return [ false, tokens ];
9 }
10 if (parseTryOne[1][0] != operator) {
11 return [ false, tokens ];
12 }
13 var iterB = parseTryOne[1].slice(1);
14 var parseTryTwo = parseSentence(iterB, flag);
15 if (!parseTryTwo[0]) {
16 return [ false, tokens ];
17 }
18 if (parseTryTwo[1][0] != RPAR) {
19 return [ false, tokens ];
20 }
21 return [ true, parseTryTwo[1].slice(1) ];
22 }</pre>
</div>
<div class="text2"><pre>
1 function parseNAryOperator(tokens, operator, flag) {
2 if (tokens[0] != LPAR) {
3 return [ false, tokens ];
4 }
5 var iterA = tokens.slice(1);
6 var parseTryOne = parseSentence(iterA, flag);
7 if (!parseTryOne[0]) {
8 return [ false, tokens ];
9 }
10 if (parseTryOne[1][0] != operator) {
11 return [ false, tokens ];
12 }
13 var iterB = parseTryOne[1].slice(1);
14 var parseTryTwo = parseSentence(iterB, flag);
15 if (!parseTryTwo[0]) {
16 return [ false, tokens ];
17 }
18 var tokensRemaining = parseTryTwo[1];
19 while (tokensRemaining.length > 0) {
20 if (tokensRemaining[0] == RPAR) {
21 return [ true, tokensRemaining.slice(1) ];
22 }
23 if (tokensRemaining[0] != operator) {
24 return [ false, tokens ];
25 }
26 tokensRemaining = tokensRemaining.slice(1);
27 var parseTryLoop = parseSentence(tokensRemaining, flag);
28 if (!parseTryLoop[0]) {
29 return [ false, tokens ];
30 }
31 tokensRemaining = parseTryLoop[1];
32 }
33 return [ false, tokens ];
34 }</pre>
</div>
<div class="text1">
<p>Both functions begin by attempting to parse an LPAR, a sentence, an operator, and a second sentence. However,
where the binary parser need only look for an RPAR token, the extended parser needs a loop to check for an
arbitrary number of additional conjuncts or disjuncts. The check for an RPAR is done first, so that it can still
validate 2-sentence junctions (20-22). If there isn't one, then the junction must still be continuing, which
means that the sentence must be followed by the same operator (23-26) and another sentence (27-31). If at any
point either a sentence is not followed by the operator or the operator is not followed by a sentence, the
parser will fail (24,29). After consuming the sentence, the function loops back to line 19 to check for an
RPAR token again. If the loop ever runs out of tokens, then it must not have found an RPAR that closed the
current junction, in which case the sentence is invalid (33).</p>
<p>With this alternative parsing function, we can finally make use of <i>flag</i> by letting the value of
<i>flag</i> determine whether we use extended junction rules. To do this, we rewrite</p>
<pre>
14 var tryConjunction = parseBinaryOperator(tokens, AND, flag);
15 if (tryConjunction[0])
16 return tryConjunction;
17 var tryDisjunction = parseBinaryOperator(tokens, OR, flag);
18 if (tryDisjunction[0])
19 return tryDisjunction;</pre>
<p>to use the extended functions when flagged, as so:</p>
<pre>
14 var tryConjunction = flag ? parseNAryOperator(tokens, AND, flag) : parseBinaryOperator(tokens, AND, flag);
15 if (tryConjunction[0])
16 return tryConjunction;
17 var tryDisjunction = flag ? parseNAryOperator(tokens, OR, flag) : parseBinaryOperator(tokens, OR, flag);
18 if (tryDisjunction[0])
19 return tryDisjunction;</pre>
<p>The ?: expression above is called a ternary operator, and is just shorthand for an if/else statement. Thus,
if <i>flag</i> is true, then conjunctions and disjunctions will be parsed using the extended junction rules; and
if false, using the binary rules. Then we can call <i>parseSentence()</i> on the same list of tokens with
<i>flag</i> set to true or false in order to see which rules a sentence is valid under.</p>
<p>The real implementation under the hood has to deal with updating the UI at the top of the page, so the highest
parsing function concerns itself with some other details. To see what the full implementation does, right click
on this page and select "View page source" (or press Ctrl+U or Cmd+U) to see the code for the embedded
Javascript. The functions have comments in them that outline what is going on at each point in the function.</p>
</div>
</center>
<hr>
Tim Van Baak<br>
Rice University '18<br>
PHIL 357 - Incompleteness, Undecidability, and Computability<br>
31 January 2017<br>
</body>
</html>