diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/SeqValiant.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/SeqValiant.kt
new file mode 100644
index 00000000..b94a9519
--- /dev/null
+++ b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/SeqValiant.kt
@@ -0,0 +1,102 @@
+package ai.hypergraph.kaliningraph.parsing
+
+import ai.hypergraph.kaliningraph.tensor.UTMatrix
+import ai.hypergraph.kaliningraph.types.*
+
+// A parse forest is a set of parse trees. PTree equality only compares roots,
+// so a forest holds at most one tree per root symbol.
+typealias PForest = Set<PTree>
+
+// True iff the forest contains a tree rooted at the symbol v
+operator fun PForest.contains(v: Σᐩ) = PTree(v) in this
+
+// Child list with a single branch: the leaf v paired with a default PTree()
+fun PSingleton(v: Σᐩ): List<Π2A<PTree>> = listOf(PTree(v) to PTree())
+
+// Algebraic data type / polynomial functor for parse forests. Each entry of
+// children is one (left, right) pair of subtrees; an empty list marks a leaf.
+data class PTree(val root: Σᐩ = "ε", val children: List<Π2A<PTree>> = listOf()) {
+  // Lazily enumerates the strings derivable from this PTree. No deduplication
+  // is performed, so the same string may be produced more than once.
+  fun choose(): Sequence<Σᐩ> =
+    // A leaf yields its root, or the empty string when the root contains "ε"
+    if (children.isEmpty()) sequenceOf(if ("ε" in root) "" else root)
+    else children.asSequence().flatMap { (l, r) ->
+      // TODO: Use weighted choice mechanism
+      (l.choose() * r.choose()).map { (a, b) ->
+        // Join both halves with one space, omitting whichever half is empty
+        if (a == "") b else if (b == "") a else "$a $b"
+      }
+    }
+
+  // Equality and hashing use the root alone (children are ignored); this is
+  // what lets `PTree(v) in forest` act as a lookup by root symbol.
+  override fun hashCode(): Int = root.hashCode()
+  override fun equals(other: Any?) = other is PTree && root == other.root
+}
+
+// Lazily computes all syntactically valid strings compatible with the given template
+fun CFG.solveSeq(s: Σᐩ): Sequence<Σᐩ> = solveSeq(s.tokenizeByWhitespace())
+
+// Best-effort variant for pre-tokenized input: an exception thrown by
+// solvePTreeFPSeq is printed and an empty sequence is returned instead.
+// Exceptions raised later, while the lazy result is consumed, are not caught.
+fun CFG.solveSeq(s: List<Σᐩ>): Sequence<Σᐩ> =
+  try { solvePTreeFPSeq(s) }
+  catch (e: Exception) { e.printStackTrace(); emptySequence() }
+
+// Runs the matrix to its fixpoint, then enumerates the strings of the tree
+// rooted at START_SYMBOL in the last entry of row 0 (empty if there is none).
+fun CFG.solvePTreeFPSeq(
+  tokens: List<Σᐩ>,
+  utMatrix: UTMatrix<PForest> = initPForestMatrix(tokens, pforestAlgebra()),
+): Sequence<Σᐩ> =
+  utMatrix.seekFixpoint().toFullMatrix()[0].last()
+    .firstOrNull { it.root == START_SYMBOL }?.choose() ?: emptySequence()
+
+// Builds the initial matrix from one forest per token. A concrete token gets a
+// tree for each nonterminal in bimap[listOf(token)], with that token as its
+// only branch; a HOLE_MARKER gets a tree for each unit nonterminal, branching
+// over the distinct entries of bimap.UNITS for it (none if there is no entry).
+fun CFG.initPForestMatrix(
+  tokens: List<Σᐩ>,
+  algebra: Ring<PForest>
+): UTMatrix<PForest> =
+  UTMatrix(
+    ts = tokens.map { token ->
+      if (token != HOLE_MARKER)
+        bimap[listOf(token)].map { nt -> PTree(nt, PSingleton(token)) }.toSet()
+      else
+        unitNonterminals.map { nt ->
+          val branches = bimap.UNITS[nt]?.flatMap { PSingleton(it) }?.toSet()?.toList()
+          PTree(nt, branches ?: listOf())
+        }.toSet()
+    }.toTypedArray(),
+    algebra = algebra
+  )
+
+// Ring over parse forests: the empty forest, union as plus and joinSeq as times.
+// NOTE(review): PTree equality ignores children, so when both operands hold a
+// tree with the same root, `union` keeps the left operand's tree and the right
+// one's children are dropped -- confirm that this is the intended behavior.
+fun CFG.pforestAlgebra(): Ring<PForest> =
+  Ring.of(
+    nil = emptySet(),
+    plus = { x, y -> x union y },
+    times = { x, y -> joinSeq(x, y) },
+  )
+
+// X ⊗ Z := { w | <x, z> ∈ X × Z, (w -> xz) ∈ P }
+// Each matching production adds one (x-tree, z-tree) branch to the tree rooted at w.
+fun CFG.joinSeq(X: PForest, Z: PForest): PForest {
+  // Nothing can be joined with an empty forest, so skip scanning the productions
+  if (X.isEmpty() || Z.isEmpty()) return emptySet()
+  // Index both forests by root once instead of searching them per production
+  val left = X.associateBy { it.root }
+  val right = Z.associateBy { it.root }
+  return bimap.TRIPL
+    .filter { (_, x, z) -> x in left && z in right }
+    .groupBy({ it.first }, { (_, x, z) -> left.getValue(x) to right.getValue(z) })
+    .map { (w, branches) -> PTree(w, branches) }
+    .toSet()
+}
\ No newline at end of file
diff --git a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/SortValiant.kt b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/SortValiant.kt
index 33bf5652..9eba9322 100644
--- a/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/SortValiant.kt
+++ 
b/src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/SortValiant.kt @@ -6,6 +6,12 @@ import ai.hypergraph.kaliningraph.levenshtein import ai.hypergraph.kaliningraph.tensor.UTMatrix import ai.hypergraph.kaliningraph.types.* +// The main issue with SortValiant is we eagerly compute the Cartesian product +// and this blows up very quickly, so we need to sort and prune aggressively. +// We can instead use a lazy Cartesian product, which is what SeqValiant does. +// The downside is that we lose the ability to sort the results while parsing, +// but we can still use a metric to sort the results after the fact. + // Returns all syntactically strings ordered by distance to withRespect fun CFG.solve(s: Σᐩ, metric: ChoiceMetric): Set<Σᐩ> = solve(s.tokenizeByWhitespace(), metric) @@ -49,6 +55,9 @@ const val MAX_CAPACITY = 100 fun CFG.join(X: Sort, Z: Sort, metric: ChoiceMetric = { it.weight }): Sort = bimap.TRIPL.filter { (_, x, z) -> x in X && z in Z } .map { (w, x, z) -> + // This Cartesian product becomes expensive quickly so MAX_CAPACITY is used + // to limit the number of elements in the product. This is a greedy approach + // and we always take the top MAX_CAPACITY-elements by the provided metric. 
((X[x] ?: setOf()) * (Z[z] ?: setOf())) .map { (q, r) -> w to (q + r) } }.flatten().groupingBy { it.first } diff --git a/src/commonTest/kotlin/ai/hypergraph/kaliningraph/parsing/SetValiantTest.kt b/src/commonTest/kotlin/ai/hypergraph/kaliningraph/parsing/SetValiantTest.kt index 4232dfff..0f611f83 100644 --- a/src/commonTest/kotlin/ai/hypergraph/kaliningraph/parsing/SetValiantTest.kt +++ b/src/commonTest/kotlin/ai/hypergraph/kaliningraph/parsing/SetValiantTest.kt @@ -369,6 +369,18 @@ class SetValiantTest { }.also { println("Finished in ${it.inWholeMilliseconds}ms.") } } +/* +./gradlew jvmTest --tests "ai.hypergraph.kaliningraph.parsing.SetValiantTest.testSeqValiant" +*/ + @Test + fun testSeqValiant() { + val allSols = seq2parsePythonCFG.solveSeq("_ _ _ _ _").sortedBy { it.length }.toList() + // solveSeq returns an empty sequence when solving throws, so fail loudly instead of passing vacuously + assertTrue("No solutions were found!") { allSols.isNotEmpty() } + allSols.forEach { println(it); assertTrue("\"$it\" was invalid!") { it in seq2parsePythonCFG.language } } + println("Found ${allSols.size} solutions, all were valid!") + } + val seq2parsePythonCFG: CFG = """ START -> Stmts_Or_Newlines Endmarker