From 480e79bfa764c6218ebfaeb6c93e00184e7fefbd Mon Sep 17 00:00:00 2001 From: David Baker Effendi Date: Tue, 25 Jul 2023 15:47:36 +0200 Subject: [PATCH 1/3] Fine-Tune Type Prop Queries & Added Iterations CLI Opt * Optimized queries and data structures here and there, was able to reduce the memory consumption but it only defers a huge allocation later. * Added CLI opt for JS and Python `--type-prop-iterations` where one can set how many type propagation iterations run. 2 onwards typically involves interprocedural propagation. e.g. `./joern-parse .../../Workspace/test-repos/python/open-mmlab --frontend-args --type-prop-iterations 1` finishes for me but only performs intraprocedural type propagation. @yzn12345 The type propagation is pretty rough around the edges and likely can be rewritten to be a bit more precise and performant but I hope this helps generate python CPGs with some type recovery! Somewhat resolves #3211 --- .../cpgcreation/PythonSrcCpgGenerator.scala | 8 +- .../passes/JavaTypeRecoveryPass.scala | 3 - .../scala/io/joern/jssrc2cpg/JsSrc2Cpg.scala | 7 +- .../main/scala/io/joern/jssrc2cpg/Main.scala | 17 +++- .../main/scala/io/joern/pysrc2cpg/Main.scala | 6 +- .../joern/pysrc2cpg/Py2CpgOnFileSystem.scala | 7 +- .../joern/pysrc2cpg/PythonTypeRecovery.scala | 6 +- .../x2cpg/passes/frontend/XTypeRecovery.scala | 82 +++++++++---------- 8 files changed, 78 insertions(+), 58 deletions(-) diff --git a/console/src/main/scala/io/joern/console/cpgcreation/PythonSrcCpgGenerator.scala b/console/src/main/scala/io/joern/console/cpgcreation/PythonSrcCpgGenerator.scala index 438b0ecaf701..919ce9e51eb2 100644 --- a/console/src/main/scala/io/joern/console/cpgcreation/PythonSrcCpgGenerator.scala +++ b/console/src/main/scala/io/joern/console/cpgcreation/PythonSrcCpgGenerator.scala @@ -1,7 +1,7 @@ package io.joern.console.cpgcreation import io.joern.console.FrontendConfig -import io.joern.pysrc2cpg._ +import io.joern.pysrc2cpg.* import io.joern.x2cpg.X2Cpg import 
io.joern.x2cpg.passes.base.AstLinkerPass import io.joern.x2cpg.passes.callgraph.NaiveCallLinker @@ -31,8 +31,10 @@ case class PythonSrcCpgGenerator(config: FrontendConfig, rootPath: Path) extends new ImportResolverPass(cpg).createAndApply() new DynamicTypeHintFullNamePass(cpg).createAndApply() new PythonInheritanceNamePass(cpg).createAndApply() - new PythonTypeRecoveryPass(cpg, XTypeRecoveryConfig(enabledDummyTypes = !pyConfig.forall(_.disableDummyTypes))) - .createAndApply() + val typeRecoveryConfig = pyConfig match + case Some(config) => XTypeRecoveryConfig(config.typePropagationIterations, !config.disableDummyTypes) + case None => XTypeRecoveryConfig() + new PythonTypeRecoveryPass(cpg, typeRecoveryConfig).createAndApply() new PythonTypeHintCallLinker(cpg).createAndApply() new NaiveCallLinker(cpg).createAndApply() diff --git a/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/passes/JavaTypeRecoveryPass.scala b/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/passes/JavaTypeRecoveryPass.scala index 2e95e3aa8be4..d51e25d510ff 100644 --- a/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/passes/JavaTypeRecoveryPass.scala +++ b/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/passes/JavaTypeRecoveryPass.scala @@ -51,9 +51,6 @@ private class RecoverForJavaFile(cpg: Cpg, cu: Method, builder: DiffGraphBuilder } } - override protected def nodeExistingTypes(storedNode: StoredNode): Seq[String] = - super.nodeExistingTypes(storedNode).filterNot(_.startsWith(Defines.UnresolvedNamespace)) - // There seems to be issues with inferring these, often due to situations where super and this are confused on name // and code properties. 
override protected def storeIdentifierTypeInfo(i: Identifier, types: Seq[String]): Unit = if (i.name != "this") { diff --git a/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/JsSrc2Cpg.scala b/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/JsSrc2Cpg.scala index 8cd85e594609..b0e10b6d08c1 100644 --- a/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/JsSrc2Cpg.scala +++ b/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/JsSrc2Cpg.scala @@ -3,7 +3,7 @@ package io.joern.jssrc2cpg import better.files.File import io.joern.dataflowengineoss.layers.dataflows.{OssDataFlow, OssDataFlowOptions} import io.joern.jssrc2cpg.JsSrc2Cpg.postProcessingPasses -import io.joern.jssrc2cpg.passes._ +import io.joern.jssrc2cpg.passes.* import io.joern.jssrc2cpg.utils.AstGenRunner import io.joern.x2cpg.X2Cpg.withNewEmptyCpg import io.joern.x2cpg.X2CpgFrontend @@ -57,11 +57,14 @@ class JsSrc2Cpg extends X2CpgFrontend[Config] { object JsSrc2Cpg { def postProcessingPasses(cpg: Cpg, config: Option[Config] = None): List[CpgPassBase] = { + val typeRecoveryConfig = config match + case Some(config) => XTypeRecoveryConfig(config.typePropagationIterations, !config.disableDummyTypes) + case None => XTypeRecoveryConfig() List( new JavaScriptInheritanceNamePass(cpg), new ConstClosurePass(cpg), new ImportResolverPass(cpg), - new JavaScriptTypeRecoveryPass(cpg, XTypeRecoveryConfig(enabledDummyTypes = !config.exists(_.disableDummyTypes))), + new JavaScriptTypeRecoveryPass(cpg, typeRecoveryConfig), new JavaScriptTypeHintCallLinker(cpg), new NaiveCallLinker(cpg) ) diff --git a/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala b/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala index 2cee202f6be7..32eeb8fc0d78 100644 --- a/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala +++ b/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala @@ -1,13 +1,14 @@ 
package io.joern.jssrc2cpg -import io.joern.jssrc2cpg.Frontend._ +import io.joern.jssrc2cpg.Frontend.* import io.joern.x2cpg.utils.Environment import io.joern.x2cpg.{X2CpgConfig, X2CpgMain} import scopt.OParser import java.nio.file.Paths -final case class Config(tsTypes: Boolean = true, disableDummyTypes: Boolean = false) extends X2CpgConfig[Config] { +final case class Config(tsTypes: Boolean = true, disableDummyTypes: Boolean = false, typePropagationIterations: Int = 2) + extends X2CpgConfig[Config] { def withTsTypes(value: Boolean): Config = { copy(tsTypes = value).withInheritedFields(this) } @@ -15,6 +16,10 @@ final case class Config(tsTypes: Boolean = true, disableDummyTypes: Boolean = fa def withDisableDummyTypes(value: Boolean): Config = { copy(disableDummyTypes = value).withInheritedFields(this) } + + def withTypePropagationIterations(value: Int): Config = { + copy(typePropagationIterations = value).withInheritedFields(this) + } } object Frontend { @@ -22,7 +27,7 @@ object Frontend { val cmdLineParser: OParser[Unit, Config] = { val builder = OParser.builder[Config] - import builder._ + import builder.* OParser.sequence( programName("jssrc2cpg"), opt[Unit]("no-tsTypes") @@ -32,7 +37,11 @@ object Frontend { opt[Unit]("no-dummyTypes") .hidden() .action((_, c) => c.withDisableDummyTypes(true)) - .text("disable generation of dummy types during type recovery") + .text("disable generation of dummy types during type propagation"), + opt[Int]("type-prop-iterations") + .hidden() + .action((x, c) => c.withTypePropagationIterations(x)) + .text("maximum iterations of type propagation") ) } diff --git a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala index 528e420904df..b1c1b2efa32c 100644 --- a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala +++ b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala @@ -23,7 +23,11 @@ 
private object Frontend { opt[Unit]("no-dummyTypes") .hidden() .action((_, c) => c.withDisableDummyTypes(true)) - .text("disable generation of dummy types during type recovery") + .text("disable generation of dummy types during type propagation"), + opt[Int]("type-prop-iterations") + .hidden() + .action((x, c) => c.withTypePropagationIterations(x)) + .text("maximum iterations of type propagation") ) } } diff --git a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala index 2ed99239bd68..78f8577536e9 100644 --- a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala +++ b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala @@ -5,13 +5,14 @@ import io.shiftleft.codepropertygraph.Cpg import io.shiftleft.utils.IOUtils import org.slf4j.LoggerFactory -import java.nio.file._ +import java.nio.file.* import scala.util.Try case class Py2CpgOnFileSystemConfig( venvDir: Path = Paths.get(".venv"), ignoreVenvDir: Boolean = true, disableDummyTypes: Boolean = false, + typePropagationIterations: Int = 2, requirementsTxt: String = "requirements.txt" ) extends X2CpgConfig[Py2CpgOnFileSystemConfig] { def withVenvDir(venvDir: Path): Py2CpgOnFileSystemConfig = { @@ -26,6 +27,10 @@ case class Py2CpgOnFileSystemConfig( copy(disableDummyTypes = value).withInheritedFields(this) } + def withTypePropagationIterations(value: Int): Py2CpgOnFileSystemConfig = { + copy(typePropagationIterations = value).withInheritedFields(this) + } + def withRequirementsTxt(text: String): Py2CpgOnFileSystemConfig = { copy(requirementsTxt = text).withInheritedFields(this) } diff --git a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/PythonTypeRecovery.scala b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/PythonTypeRecovery.scala index 3e8708bd8148..a84a423baafa 100644 --- 
a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/PythonTypeRecovery.scala +++ b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/PythonTypeRecovery.scala @@ -148,9 +148,9 @@ private class RecoverForPythonFile(cpg: Cpg, cu: File, builder: DiffGraphBuilder if (fa.method.name == "") { Set(fa.method.fullName) } else if (fa.method.typeDecl.nonEmpty) { - val parentTypes = fa.method.typeDecl.fullName.toSeq - val baseTypeFullNames = cpg.typeDecl.fullNameExact(parentTypes: _*).inheritsFromTypeFullName.toSeq - (parentTypes ++ baseTypeFullNames).filterNot(_.toLowerCase.matches("(any|object)")).toSet + val parentTypes = fa.method.typeDecl.fullName.toSet + val baseTypeFullNames = cpg.typeDecl.fullNameExact(parentTypes.toSeq: _*).inheritsFromTypeFullName.toSet + (parentTypes ++ baseTypeFullNames).filterNot(_.matches("(?i)(any|object)")) } else { super.getFieldParents(fa) } diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala index 62ffb404963c..005b9cde529e 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala @@ -2,10 +2,10 @@ package io.joern.x2cpg.passes.frontend import io.joern.x2cpg.Defines import io.shiftleft.codepropertygraph.Cpg -import io.shiftleft.codepropertygraph.generated.nodes._ -import io.shiftleft.codepropertygraph.generated.{EdgeTypes, Operators, PropertyNames} +import io.shiftleft.codepropertygraph.generated.nodes.* +import io.shiftleft.codepropertygraph.generated.{EdgeTypes, NodeTypes, Operators, PropertyNames} import io.shiftleft.passes.CpgPass -import io.shiftleft.semanticcpg.language._ +import io.shiftleft.semanticcpg.language.* import io.shiftleft.semanticcpg.language.operatorextension.OpNodes import 
io.shiftleft.semanticcpg.language.operatorextension.OpNodes.{Assignment, FieldAccess} import org.slf4j.{Logger, LoggerFactory} @@ -190,7 +190,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( /** New node tracking set. */ - protected val addedNodes = mutable.HashSet.empty[(Long, String)] + protected val addedNodes = mutable.HashSet.empty[String] /** For tracking members and the type operations that need to be performed. Since these are mostly out of scope * locally it helps to track these separately. @@ -261,7 +261,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( resolvedImport <- i.call.tag alias <- i.importedAs } { - import io.joern.x2cpg.passes.frontend.ImportsPass._ + import io.joern.x2cpg.passes.frontend.ImportsPass.* ResolvedImport.tagToResolvedImport(resolvedImport).foreach { case ResolvedMethod(fullName, alias, receiver, _) => @@ -418,7 +418,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( */ protected def getFieldParents(fa: FieldAccess): Set[String] = { val fieldName = getFieldName(fa).split(pathSep).last - cpg.typeDecl.where(_.member.nameExact(fieldName)).fullName.filterNot(_.contains("ANY")).toSet + cpg.member.nameExact(fieldName).typeDecl.fullName.filterNot(_.contains("ANY")).toSet } /** Associates the types with the identifier. 
This may sometimes be an identifier that should be considered a field @@ -479,7 +479,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( operation match { case Operators.alloc => visitIdentifierAssignedToConstructor(i, c) case Operators.fieldAccess => visitIdentifierAssignedToFieldLoad(i, new FieldAccess(c)) - case Operators.indexAccess => visitIdentifierAssignedToIndexAcess(i, c) + case Operators.indexAccess => visitIdentifierAssignedToIndexAccess(i, c) case Operators.cast => visitIdentifierAssignedToCast(i, c) case x => logger.debug(s"Unhandled operation $x (${c.code}) @ ${debugLocation(c)}"); Set.empty } @@ -592,7 +592,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( case Some(cVar) if symbolTable.contains(cVar) => symbolTable.get(cVar) case Some(cVar) if symbolTable.contains(LocalVar(cVar.identifier)) => - symbolTable.get(LocalVar(cVar.identifier)).map(_.concat(s"$pathSep${XTypeRecovery.DummyIndexAccess}")) + symbolTable.get(LocalVar(cVar.identifier)).map(x => s"$x$pathSep${XTypeRecovery.DummyIndexAccess}") case _ => Set.empty } @@ -629,14 +629,18 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( sb.toString() } + lazy val typesFromBaseCall = fa.argumentOut.headOption match + case Some(call: Call) => getTypesFromCall(call) + case _ => Set.empty[String] + fa.argumentOut.l match { case ::(i: Identifier, ::(f: FieldIdentifier, _)) if i.name.matches("(self|this)") => wrapName(f.canonicalName) case ::(i: Identifier, ::(f: FieldIdentifier, _)) => wrapName(s"${i.name}$pathSep${f.canonicalName}") case ::(c: Call, ::(f: FieldIdentifier, _)) if c.name.equals(Operators.fieldAccess) => wrapName(getFieldName(new FieldAccess(c), suffix = f.canonicalName)) - case ::(c: Call, ::(f: FieldIdentifier, _)) if getTypesFromCall(c).nonEmpty => + case ::(_: Call, ::(f: FieldIdentifier, _)) if typesFromBaseCall.nonEmpty => // TODO: Handle this case better - 
wrapName(s"${getTypesFromCall(c).head}$pathSep${f.canonicalName}") + wrapName(s"${typesFromBaseCall.head}$pathSep${f.canonicalName}") case ::(f: FieldIdentifier, ::(c: Call, _)) if c.name.equals(Operators.fieldAccess) => wrapName(getFieldName(new FieldAccess(c), prefix = f.canonicalName)) case ::(c: Call, ::(f: FieldIdentifier, _)) => @@ -758,7 +762,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( /** Visits an identifier being assigned to the result of an index access operation. */ - protected def visitIdentifierAssignedToIndexAcess(i: Identifier, c: Call): Set[String] = + protected def visitIdentifierAssignedToIndexAccess(i: Identifier, c: Call): Set[String] = associateTypes(i, getTypesFromCall(c)) /** Visits an identifier that is the target of a cast operation. @@ -929,10 +933,8 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( } protected def setTypeFromTypeHints(n: StoredNode): Unit = { - val nodeType = n.property(PropertyNames.TYPE_FULL_NAME, "ANY") - val dynamicTypeHints = n.property(PropertyNames.DYNAMIC_TYPE_HINT_FULL_NAME, Seq.empty[String]) - val types = (nodeType +: dynamicTypeHints).filterNot(x => x == "ANY" || XTypeRecovery.isDummyType(x)) - if (dynamicTypeHints.nonEmpty) setTypes(n, types) + val types = n.getKnownTypes.filterNot(XTypeRecovery.isDummyType) + if (types.nonEmpty) setTypes(n, types.toSeq) } /** In the case this field access is a function pointer, we would want to make sure this has a method ref. @@ -945,23 +947,21 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( ): Unit = { // Sometimes the function identifier is an argument to the call itself as a "base". In this case we don't need // a method ref. 
This happens in jssrc2cpg - if (funcPtr.astParent.iterator.collectAll[Call].exists(_.name == funcName)) return - - baseTypes - .map(t => if (t.endsWith(funcName)) t else s"$t$pathSep$funcName") - .flatMap(p => cpg.method.fullNameExact(p)) - .map(m => m -> createMethodRef(baseName, funcName, m.fullName, funcPtr.lineNumber, funcPtr.columnNumber)) - .filterNot { case (_, mRef) => - addedNodes.contains((funcPtr.id(), s"${mRef.label()}$pathSep${mRef.methodFullName}")) - } - .foreach { case (m, mRef) => - funcPtr.astParent - .filterNot(_.astChildren.isMethodRef.methodFullNameExact(mRef.methodFullName).nonEmpty) - .foreach { inCall => - state.changesWereMade.compareAndSet(false, true) - integrateMethodRef(funcPtr, m, mRef, inCall) - } - } + if (!funcPtr.astParent.iterator.collectAll[Call].exists(_.name == funcName)) { + baseTypes + .map(t => if (t.endsWith(funcName)) t else s"$t$pathSep$funcName") + .flatMap(cpg.method.fullNameExact) + .filterNot(m => addedNodes.contains(s"${funcPtr.id()}${NodeTypes.METHOD_REF}$pathSep${m.fullName}")) + .map(m => m -> createMethodRef(baseName, funcName, m.fullName, funcPtr.lineNumber, funcPtr.columnNumber)) + .foreach { case (m, mRef) => + funcPtr.astParent + .filterNot(_.astChildren.isMethodRef.exists(_.methodFullName == mRef.methodFullName)) + .foreach { inCall => + state.changesWereMade.compareAndSet(false, true) + integrateMethodRef(funcPtr, m, mRef, inCall) + } + } + } } private def createMethodRef( @@ -992,7 +992,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( case x => mRef.argumentIndex(x.astChildren.size + 1) } - addedNodes.add((funcPtr.id(), s"${mRef.label()}$pathSep${mRef.methodFullName}")) + addedNodes.add(s"${funcPtr.id()}${NodeTypes.METHOD_REF}$pathSep${mRef.methodFullName}") } protected def persistType(x: StoredNode, types: Set[String]): Unit = { @@ -1050,9 +1050,9 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( .headOption private def 
storeNodeTypeInfo(storedNode: StoredNode, types: Seq[String]): Unit = { - lazy val existingTypes = nodeExistingTypes(storedNode) + lazy val existingTypes = storedNode.getKnownTypes - if (types.nonEmpty && types != existingTypes) { + if (types.nonEmpty && types.toSet != existingTypes) { storedNode match { case m: Member => // To avoid overwriting member updates, we store them elsewhere until the end @@ -1081,8 +1081,6 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( ) } - protected def nodeExistingTypes(storedNode: StoredNode): Seq[String] = storedNode.allTypes.filterNot(_ == "ANY").toSeq - /** Allows one to modify the types assigned to identifiers. */ protected def storeIdentifierTypeInfo(i: Identifier, types: Seq[String]): Unit = @@ -1091,7 +1089,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( /** Allows one to modify the types assigned to nodes otherwise. */ protected def storeDefaultTypeInfo(n: StoredNode, types: Seq[String]): Unit = - if (types != nodeExistingTypes(n)) { + if (types.toSet != n.getKnownTypes) { state.changesWereMade.compareAndSet(false, true) setTypes(n, (n.property(PropertyNames.DYNAMIC_TYPE_HINT_FULL_NAME, Seq.empty) ++ types).distinct) } @@ -1101,7 +1099,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( */ protected def setTypes(n: StoredNode, types: Seq[String]): Unit = if (types.size == 1) builder.setNodeProperty(n, PropertyNames.TYPE_FULL_NAME, types.head) - else builder.setNodeProperty(n, PropertyNames.DYNAMIC_TYPE_HINT_FULL_NAME, types.distinct) + else builder.setNodeProperty(n, PropertyNames.DYNAMIC_TYPE_HINT_FULL_NAME, types) /** Allows one to modify the types assigned to locals. 
*/ @@ -1113,6 +1111,8 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( */ protected def postSetTypeInformation(): Unit = {} + private val unknownTypePattern = s"(?i)(UNKNOWN|ANY|${Defines.UnresolvedNamespace}).*".r + // The below are convenience calls for accessing type properties, one day when this pass uses `Tag` nodes instead of // the symbol table then perhaps this would work out better implicit class AllNodeTypesFromNodeExt(x: StoredNode) { @@ -1122,7 +1122,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( )).iterator def getKnownTypes: Set[String] = { - x.allTypes.filterNot(_.toUpperCase.matches("(UNKNOWN|ANY)")).toSet + x.allTypes.toSet.filterNot(unknownTypePattern.matches) } } @@ -1130,7 +1130,7 @@ abstract class RecoverForXCompilationUnit[CompilationUnitType <: AstNode]( def allTypes: Iterator[String] = x.flatMap(_.allTypes) def getKnownTypes: Set[String] = - x.allTypes.filterNot(_.toUpperCase.matches("(UNKNOWN|ANY)")).toSet + x.allTypes.toSet.filterNot(unknownTypePattern.matches) } } From 798bb66821caa8db884c678b99edfedb1baf41d5 Mon Sep 17 00:00:00 2001 From: David Baker Effendi Date: Thu, 27 Jul 2023 11:46:57 +0200 Subject: [PATCH 2/3] Added `TypeRecoveryParserConfig` trait for re-usable OParser options * Added `TypeRecoveryParserConfig` and `XTypeRecovery.parserOptions` to reduce code duplication across frontends that use type propagation * Added logger info/warnings for type propagation iteration count, but no failures --- .../scala/io/joern/javasrc2cpg/Main.scala | 16 ++-- .../main/scala/io/joern/jssrc2cpg/Main.scala | 21 +---- .../main/scala/io/joern/pysrc2cpg/Main.scala | 10 +-- .../joern/pysrc2cpg/Py2CpgOnFileSystem.scala | 14 +--- .../scala/io/joern/rubysrc2cpg/Main.scala | 10 ++- .../x2cpg/passes/frontend/XTypeRecovery.scala | 76 +++++++++++++++---- .../joern/joerncli/AbstractJoernCliTest.scala | 4 +- 7 files changed, 85 insertions(+), 66 deletions(-) diff --git
a/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/Main.scala b/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/Main.scala index d18be7285655..6ff46b7f5f09 100644 --- a/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/Main.scala +++ b/joern-cli/frontends/javasrc2cpg/src/main/scala/io/joern/javasrc2cpg/Main.scala @@ -1,6 +1,7 @@ package io.joern.javasrc2cpg -import io.joern.javasrc2cpg.Frontend._ +import io.joern.javasrc2cpg.Frontend.* +import io.joern.x2cpg.passes.frontend.{TypeRecoveryParserConfig, XTypeRecovery} import io.joern.x2cpg.{X2CpgConfig, X2CpgMain} import scopt.OParser @@ -13,10 +14,10 @@ final case class Config( delombokJavaHome: Option[String] = None, delombokMode: Option[String] = None, enableTypeRecovery: Boolean = false, - disableDummyTypes: Boolean = false, jdkPath: Option[String] = None, showEnv: Boolean = false -) extends X2CpgConfig[Config] { +) extends X2CpgConfig[Config] + with TypeRecoveryParserConfig[Config] { def withInferenceJarPaths(paths: Set[String]): Config = { copy(inferenceJarPaths = paths).withInheritedFields(this) } @@ -41,10 +42,6 @@ final case class Config( copy(enableTypeRecovery = value).withInheritedFields(this) } - def withDisableDummyTypes(value: Boolean): Config = { - copy(disableDummyTypes = value).withInheritedFields(this) - } - def withJdkPath(path: String): Config = { copy(jdkPath = Some(path)).withInheritedFields(this) } @@ -83,10 +80,7 @@ private object Frontend { .hidden() .action((_, c) => c.withEnableTypeRecovery(true)) .text("enable generic type recovery"), - opt[Unit]("no-dummyTypes") - .hidden() - .action((_, c) => c.withDisableDummyTypes(true)) - .text("disable generation of dummy types during type recovery"), + XTypeRecovery.parserOptions, opt[String]("jdk-path") .action((path, c) => c.withJdkPath(path)) .text("JDK used for resolving builtin Java types. 
If not set, current classpath will be used"), diff --git a/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala b/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala index 32eeb8fc0d78..d277dfe2fee0 100644 --- a/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala +++ b/joern-cli/frontends/jssrc2cpg/src/main/scala/io/joern/jssrc2cpg/Main.scala @@ -1,25 +1,19 @@ package io.joern.jssrc2cpg import io.joern.jssrc2cpg.Frontend.* +import io.joern.x2cpg.passes.frontend.{TypeRecoveryParserConfig, XTypeRecovery} import io.joern.x2cpg.utils.Environment import io.joern.x2cpg.{X2CpgConfig, X2CpgMain} import scopt.OParser import java.nio.file.Paths -final case class Config(tsTypes: Boolean = true, disableDummyTypes: Boolean = false, typePropagationIterations: Int = 2) - extends X2CpgConfig[Config] { +final case class Config(tsTypes: Boolean = true) extends X2CpgConfig[Config] with TypeRecoveryParserConfig[Config] { + def withTsTypes(value: Boolean): Config = { copy(tsTypes = value).withInheritedFields(this) } - def withDisableDummyTypes(value: Boolean): Config = { - copy(disableDummyTypes = value).withInheritedFields(this) - } - - def withTypePropagationIterations(value: Int): Config = { - copy(typePropagationIterations = value).withInheritedFields(this) - } } object Frontend { @@ -34,14 +28,7 @@ object Frontend { .hidden() .action((_, c) => c.withTsTypes(false)) .text("disable generation of types via Typescript"), - opt[Unit]("no-dummyTypes") - .hidden() - .action((_, c) => c.withDisableDummyTypes(true)) - .text("disable generation of dummy types during type propagation"), - opt[Int]("type-prop-iterations") - .hidden() - .action((x, c) => c.withTypePropagationIterations(x)) - .text("maximum iterations of type propagation") + XTypeRecovery.parserOptions ) } diff --git a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala 
b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala index b1c1b2efa32c..1ca69f8691cb 100644 --- a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala +++ b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Main.scala @@ -2,6 +2,7 @@ package io.joern.pysrc2cpg import io.joern.pysrc2cpg.Frontend.cmdLineParser import io.joern.x2cpg.X2CpgMain +import io.joern.x2cpg.passes.frontend.XTypeRecovery import scopt.OParser import java.nio.file.Paths @@ -20,14 +21,7 @@ private object Frontend { // Default is specified in Py2CpgOFileSystemConfig because Scopt is a shit library. .text("Specifies whether venv-dir is ignored. Default to true.") .action(((value, config) => config.withIgnoreVenvDir(value))), - opt[Unit]("no-dummyTypes") - .hidden() - .action((_, c) => c.withDisableDummyTypes(true)) - .text("disable generation of dummy types during type propagation"), - opt[Int]("type-prop-iterations") - .hidden() - .action((x, c) => c.withTypePropagationIterations(x)) - .text("maximum iterations of type propagation") + XTypeRecovery.parserOptions ) } } diff --git a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala index 78f8577536e9..54e80b727744 100644 --- a/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala +++ b/joern-cli/frontends/pysrc2cpg/src/main/scala/io/joern/pysrc2cpg/Py2CpgOnFileSystem.scala @@ -1,5 +1,6 @@ package io.joern.pysrc2cpg +import io.joern.x2cpg.passes.frontend.TypeRecoveryParserConfig import io.joern.x2cpg.{SourceFiles, X2Cpg, X2CpgConfig, X2CpgFrontend} import io.shiftleft.codepropertygraph.Cpg import io.shiftleft.utils.IOUtils @@ -11,10 +12,9 @@ import scala.util.Try case class Py2CpgOnFileSystemConfig( venvDir: Path = Paths.get(".venv"), ignoreVenvDir: Boolean = true, - disableDummyTypes: Boolean = false, - 
typePropagationIterations: Int = 2, requirementsTxt: String = "requirements.txt" -) extends X2CpgConfig[Py2CpgOnFileSystemConfig] { +) extends X2CpgConfig[Py2CpgOnFileSystemConfig] + with TypeRecoveryParserConfig[Py2CpgOnFileSystemConfig] { def withVenvDir(venvDir: Path): Py2CpgOnFileSystemConfig = { copy(venvDir = venvDir).withInheritedFields(this) } @@ -23,14 +23,6 @@ case class Py2CpgOnFileSystemConfig( copy(ignoreVenvDir = value).withInheritedFields(this) } - def withDisableDummyTypes(value: Boolean): Py2CpgOnFileSystemConfig = { - copy(disableDummyTypes = value).withInheritedFields(this) - } - - def withTypePropagationIterations(value: Int): Py2CpgOnFileSystemConfig = { - copy(typePropagationIterations = value).withInheritedFields(this) - } - def withRequirementsTxt(text: String): Py2CpgOnFileSystemConfig = { copy(requirementsTxt = text).withInheritedFields(this) } diff --git a/joern-cli/frontends/rubysrc2cpg/src/main/scala/io/joern/rubysrc2cpg/Main.scala b/joern-cli/frontends/rubysrc2cpg/src/main/scala/io/joern/rubysrc2cpg/Main.scala index 70354765f93c..ad774e5a5832 100644 --- a/joern-cli/frontends/rubysrc2cpg/src/main/scala/io/joern/rubysrc2cpg/Main.scala +++ b/joern-cli/frontends/rubysrc2cpg/src/main/scala/io/joern/rubysrc2cpg/Main.scala @@ -1,10 +1,13 @@ package io.joern.rubysrc2cpg -import io.joern.rubysrc2cpg.Frontend._ +import io.joern.rubysrc2cpg.Frontend.* +import io.joern.x2cpg.passes.frontend.{XTypeRecovery, TypeRecoveryParserConfig} import io.joern.x2cpg.{X2CpgConfig, X2CpgMain} import scopt.OParser -final case class Config(enableDependencyDownload: Boolean = false) extends X2CpgConfig[Config] { +final case class Config(enableDependencyDownload: Boolean = false) + extends X2CpgConfig[Config] + with TypeRecoveryParserConfig[Config] { def withEnableDependencyDownload(value: Boolean): Config = { copy(enableDependencyDownload = value).withInheritedFields(this) @@ -23,7 +26,8 @@ private object Frontend { opt[Unit]("enableDependencyDownload") .hidden() 
.action((_, c) => c.withEnableDependencyDownload(false)) - .text("enable dependency download for Unix System only") + .text("enable dependency download for Unix System only"), + XTypeRecovery.parserOptions ) } } diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala index 005b9cde529e..f74f2c2dc9a9 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala @@ -1,6 +1,6 @@ package io.joern.x2cpg.passes.frontend -import io.joern.x2cpg.Defines +import io.joern.x2cpg.{Defines, X2CpgConfig} import io.shiftleft.codepropertygraph.Cpg import io.shiftleft.codepropertygraph.generated.nodes.* import io.shiftleft.codepropertygraph.generated.{EdgeTypes, NodeTypes, Operators, PropertyNames} @@ -12,6 +12,7 @@ import org.slf4j.{Logger, LoggerFactory} import overflowdb.BatchedUpdate import overflowdb.BatchedUpdate.DiffGraphBuilder import overflowdb.traversal.Traversal +import scopt.OParser import java.util.concurrent.RecursiveTask import java.util.concurrent.atomic.AtomicBoolean @@ -66,26 +67,44 @@ abstract class XTypeRecoveryPass[CompilationUnitType <: AstNode]( config: XTypeRecoveryConfig = XTypeRecoveryConfig() ) extends CpgPass(cpg) { - override def run(builder: BatchedUpdate.DiffGraphBuilder): Unit = { - val stopEarly = new AtomicBoolean(false) - val state = XTypeRecoveryState(config, stopEarly = stopEarly) - try { - Iterator.from(0).takeWhile(_ < config.iterations).foreach { i => - val newState = state.copy(currentIteration = i) - generateRecoveryPass(newState).createAndApply() + override def run(builder: BatchedUpdate.DiffGraphBuilder): Unit = + if (config.iterations > 0) { + val stopEarly = new AtomicBoolean(false) + val state = XTypeRecoveryState(config, stopEarly = stopEarly) + try { + 
Iterator.from(0).takeWhile(_ < config.iterations).foreach { i => + val newState = state.copy(currentIteration = i) + generateRecoveryPass(newState).createAndApply() + } + // If dummy values are enabled and we are stopping early, we need one more round to propagate these dummy values + if (stopEarly.get() && config.enabledDummyTypes) + generateRecoveryPass(state.copy(currentIteration = config.iterations - 1)).createAndApply() + } finally { + state.clear() } - // If dummy values are enabled and we are stopping early, we need one more round to propagate these dummy values - if (stopEarly.get() && config.enabledDummyTypes) - generateRecoveryPass(state.copy(currentIteration = config.iterations - 1)).createAndApply() - } finally { - state.clear() } - } protected def generateRecoveryPass(state: XTypeRecoveryState): XTypeRecovery[CompilationUnitType] } +trait TypeRecoveryParserConfig[R <: X2CpgConfig[R]] { this: R => + + var disableDummyTypes: Boolean = false + var typePropagationIterations: Int = 2 + + def withDisableDummyTypes(value: Boolean): R = { + this.disableDummyTypes = value + this + } + + def withTypePropagationIterations(value: Int): R = { + typePropagationIterations = value + this + } + +} + /** Based on a flow-insensitive static single-assignment symbol-table-style approach. This pass aims to be fast and * deterministic and does not try to converge to some fixed point but rather iterates a fixed number of times. This * will help recover:
  1. Imported call signatures from external dependencies
  2. Dynamic type hints for @@ -142,6 +161,8 @@ abstract class XTypeRecovery[CompilationUnitType <: AstNode](cpg: Cpg, state: XT object XTypeRecovery { + private val logger = LoggerFactory.getLogger(getClass) + val DummyReturnType = "" val DummyMemberLoad = "" val DummyIndexAccess = "" @@ -154,6 +175,33 @@ object XTypeRecovery { */ def isDummyType(typ: String): Boolean = DummyTokens.exists(typ.contains) + /** Parser options for languages implementing this pass. + */ + def parserOptions[R <: X2CpgConfig[R] with TypeRecoveryParserConfig[R]]: OParser[_, R] = { + val builder = OParser.builder[R] + import builder.* + OParser.sequence( + opt[Unit]("no-dummyTypes") + .hidden() + .action((_, c) => c.withDisableDummyTypes(true)) + .text("disable generation of dummy types during type propagation"), + opt[Int]("type-prop-iterations") + .hidden() + .action((x, c) => c.withTypePropagationIterations(x)) + .text("maximum iterations of type propagation") + .validate { x => + if (x <= 0) { + logger.info("Disabling type propagation as the given iteration count is <= 0") + } else if (x == 1) { + logger.info("Intra-procedural type propagation enabled") + } else if (x > 5) { + logger.warn(s"Large iteration count of $x will take a while to terminate") + } + success + } + ) + } + } /** Performs type recovery from the root of a compilation unit level diff --git a/joern-cli/src/test/scala/io/joern/joerncli/AbstractJoernCliTest.scala b/joern-cli/src/test/scala/io/joern/joerncli/AbstractJoernCliTest.scala index 07dbf37605ca..d676c372348d 100644 --- a/joern-cli/src/test/scala/io/joern/joerncli/AbstractJoernCliTest.scala +++ b/joern-cli/src/test/scala/io/joern/joerncli/AbstractJoernCliTest.scala @@ -3,7 +3,7 @@ package io.joern.joerncli import better.files.File import io.joern.console.FrontendConfig import io.joern.console.cpgcreation.{CCpgGenerator, JsSrcCpgGenerator} -import io.joern.jssrc2cpg.{JsSrc2Cpg, Config => JsConfig} +import io.joern.jssrc2cpg.{JsSrc2Cpg, Config as JsConfig} import 
io.shiftleft.codepropertygraph.Cpg import io.shiftleft.codepropertygraph.generated.Languages import io.shiftleft.utils.ProjectRoot @@ -35,7 +35,7 @@ trait AbstractJoernCliTest { val cpg = DefaultOverlays.create(cpgOutFileName) language match { case Languages.JSSRC | Languages.JAVASCRIPT => - JsSrc2Cpg.postProcessingPasses(cpg, Option(JsConfig(disableDummyTypes = true))).foreach(_.createAndApply()) + JsSrc2Cpg.postProcessingPasses(cpg, Option(JsConfig().withDisableDummyTypes(true))).foreach(_.createAndApply()) case _ => } (cpg, cpgOutFileName) From f24c82406f56c14439130f42bf399b8c23e933e7 Mon Sep 17 00:00:00 2001 From: David Baker Effendi Date: Tue, 1 Aug 2023 12:03:39 +0200 Subject: [PATCH 3/3] Pushing some code to rebuild pipeline --- .../scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala index f74f2c2dc9a9..8f5efc961173 100644 --- a/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala +++ b/joern-cli/frontends/x2cpg/src/main/scala/io/joern/x2cpg/passes/frontend/XTypeRecovery.scala @@ -132,7 +132,7 @@ abstract class XTypeRecovery[CompilationUnitType <: AstNode](cpg: Cpg, state: XT override def run(builder: DiffGraphBuilder): Unit = { val changesWereMade = compilationUnit .map(unit => generateRecoveryForCompilationUnitTask(unit, builder).fork()) - .map(_.get()) + .map(_.get) .reduceOption((a, b) => a || b) .getOrElse(false) if (!changesWereMade) state.stopEarly.set(true)