Skip to content

Commit

Permalink
alas, SeqValiant works! the dream becomes reality...
Browse files Browse the repository at this point in the history
  • Loading branch information
breandan committed Sep 24, 2023
1 parent df725b7 commit 14dc51d
Show file tree
Hide file tree
Showing 3 changed files with 91 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package ai.hypergraph.kaliningraph.parsing

import ai.hypergraph.kaliningraph.tensor.UTMatrix
import ai.hypergraph.kaliningraph.types.*

/** A parse forest: a set of [PTree]s. */
typealias PForest = Set<PTree>

/** Tests whether this forest contains a tree equal to `PTree(v)`. */
operator fun PForest.contains(v: Σᐩ) = this.contains(PTree(v))

/** Builds a one-element branch list pairing a tree rooted at [v] with a default-constructed [PTree]. */
fun PSingleton(v: Σᐩ): List<Π2A<PTree>> = listOf(Pair(PTree(v), PTree()))

/**
 * Algebraic data type / polynomial functor for parse forests.
 *
 * [root] labels the node and every entry of [children] is one (left, right) pair of subtrees.
 * A node with no children is treated as a leaf by [choose].
 *
 * Equality and hashing consider [root] only; [children] is ignored.
 */
data class PTree(val root: Σᐩ = "ε", val children: List<Π2A<PTree>> = listOf()) {
  /**
   * Lazily enumerates the strings derivable from this tree.
   *
   * A leaf yields its root, or the empty string when the root contains "ε". An inner node
   * yields, for every child pair, each combination of a left and a right derivation joined
   * by a single space; an empty side is dropped instead of being joined.
   */
  fun choose(): Sequence<Σᐩ> = when {
    children.isEmpty() -> sequenceOf(if ("ε" in root) "" else root)
    else -> children.asSequence().flatMap { (left, right) ->
      // TODO: Use weighted choice mechanism
      (left.choose() * right.choose()).map { (prefix, suffix) ->
        when {
          prefix == "" -> suffix
          suffix == "" -> prefix
          else -> "$prefix $suffix"
        }
      }
    }
  }

  // Identity is the root label alone, so two trees with the same root are interchangeable in a set
  override fun equals(other: Any?) = other is PTree && root == other.root
  override fun hashCode(): Int = root.hashCode()
}

// Lazily computes all syntactically valid strings compatible with the given template,
// after splitting the template into whitespace-separated tokens
fun CFG.solveSeq(s: Σᐩ): Sequence<Σᐩ> {
  val tokens = s.tokenizeByWhitespace()
  return solveSeq(tokens)
}

// Token-list variant of solveSeq. Best effort: if solvePTreeFPSeq throws an Exception,
// the stack trace is printed and an empty sequence is returned instead.
fun CFG.solveSeq(s: List<Σᐩ>): Sequence<Σᐩ> {
  return try {
    solvePTreeFPSeq(s)
  } catch (e: Exception) {
    e.printStackTrace()
    emptySequence()
  }
}

// Runs the matrix to its fixpoint, takes the last entry of the first row of the full matrix,
// and enumerates the strings of the tree rooted at START_SYMBOL found there.
// Yields an empty sequence when that entry has no such tree.
fun CFG.solvePTreeFPSeq(
  tokens: List<Σᐩ>,
  utMatrix: UTMatrix<PForest> = initPForestMatrix(tokens, pforestAlgebra()),
) =
  utMatrix.seekFixpoint().toFullMatrix()[0].last()
    .find { tree -> tree.root == START_SYMBOL }
    ?.choose()
    .orEmpty()

// Builds the initial matrix with one forest per token.
//  - Ordinary token: one tree per entry of bimap[listOf(token)], each holding the token as
//    its single branch.
//  - HOLE_MARKER: one tree per entry of unitNonterminals, whose branches are the
//    de-duplicated singletons of bimap.UNITS for that nonterminal (none if it has no entry).
fun CFG.initPForestMatrix(
  tokens: List<Σᐩ>,
  algebra: Ring<PForest>
): UTMatrix<PForest> {
  val cells = tokens.map { token ->
    val isHole = token == HOLE_MARKER
    val roots = if (isHole) unitNonterminals else bimap[listOf(token)]
    roots.associateWith { nt ->
      if (isHole) bimap.UNITS[nt]?.flatMap { PSingleton(it) }?.toSet()?.toList() ?: listOf()
      else PSingleton(token)
    }.map { (nt, branches) -> PTree(nt, branches) }.toSet()
  }
  return UTMatrix(ts = cells.toTypedArray(), algebra = algebra)
}

// Ring over parse forests: the empty set is the zero element, addition is set union,
// and multiplication combines two forests through joinSeq
fun CFG.pforestAlgebra(): Ring<PForest> =
  Ring.of(
    nil = emptySet(),
    plus = { lhs, rhs -> lhs.union(rhs) },
    times = { lhs, rhs -> joinSeq(lhs, rhs) },
  )

// X ⊗ Z := { w | <x, z> ∈ X × Z, (w -> xz) ∈ P }
//
// For every triple (w, x, z) in bimap.TRIPL whose x is a root in X and whose z is a root in Z,
// records the pair (tree of x, tree of z) as one branch of w. The result holds one PTree per
// distinct w, carrying all of its branches in the order the triples were encountered.
fun CFG.joinSeq(X: PForest, Z: PForest): PForest {
  // Index each forest by root once, so every triple costs two map lookups instead of
  // a linear scan of both forests (PTree equality is by root, so roots are unique per forest)
  val leftByRoot = X.associateBy { it.root }
  val rightByRoot = Z.associateBy { it.root }

  return bimap.TRIPL
    .filter { (_, x, z) -> x in leftByRoot && z in rightByRoot }
    .groupBy(
      keySelector = { it.first },
      // getValue cannot fail here: the filter above guarantees both keys are present
      valueTransform = { (_, x, z) -> leftByRoot.getValue(x) to rightByRoot.getValue(z) },
    )
    .map { (w, branches) -> PTree(w, branches) }
    .toSet()
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ import ai.hypergraph.kaliningraph.levenshtein
import ai.hypergraph.kaliningraph.tensor.UTMatrix
import ai.hypergraph.kaliningraph.types.*

// The main issue with SortValiant is we eagerly compute the Cartesian product
// and this blows up very quickly, so we need to sort and prune aggressively.
// We can instead use a lazy Cartesian product, which is what SeqValiant does.
// The downside is that we lose the ability to sort the results while parsing,
// but we can still use a metric to sort the results after the fact.

// Returns the syntactically valid strings for the whitespace-tokenized template,
// delegating to the token-list overload with the given metric
fun CFG.solve(s: Σᐩ, metric: ChoiceMetric): Set<Σᐩ> {
  val tokens = s.tokenizeByWhitespace()
  return solve(tokens, metric)
}
Expand Down Expand Up @@ -49,6 +55,9 @@ const val MAX_CAPACITY = 100
fun CFG.join(X: Sort, Z: Sort, metric: ChoiceMetric = { it.weight }): Sort =
bimap.TRIPL.filter { (_, x, z) -> x in X && z in Z }
.map { (w, x, z) ->
// This Cartesian product becomes expensive quickly so MAX_CAPACITY is used
// to limit the number of elements in the product. This is a greedy approach
// and we always take the top MAX_CAPACITY-elements by the provided metric.
((X[x] ?: setOf()) * (Z[z] ?: setOf()))
.map { (q, r) -> w to (q + r) }
}.flatten().groupingBy { it.first }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,16 @@ class SetValiantTest {
}.also { println("Finished in ${it.inWholeMilliseconds}ms.") }
}

/*
./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.parsing.SetValiantTest.testSeqValiant"
*/
@Test
fun testSeqValiant() {
  // Fill a five-hole template, then list the completions from shortest to longest
  val allSols = seq2parsePythonCFG.solveSeq("_ _ _ _ _").toList().sortedBy { it.length }
  for (sol in allSols) {
    println(sol)
    // Every completion must be a member of the grammar's language
    assertTrue("\"$sol\" was invalid!") { sol in seq2parsePythonCFG.language }
  }
  println("Found ${allSols.size} solutions, all were valid!")
}

val seq2parsePythonCFG: CFG =
"""
START -> Stmts_Or_Newlines Endmarker
Expand Down

0 comments on commit 14dc51d

Please sign in to comment.