Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CORE-207: handle quoted TSV values that look like non-strings #1508

Draft
wants to merge 5 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.broadinstitute.dsde.firecloud.utils

import org.broadinstitute.dsde.firecloud.utils.TsvFormatterBenchmark.Inputs
import org.broadinstitute.dsde.firecloud.model.{FlexibleModelSchema, ModelSchema, SchemaTypes}
import org.broadinstitute.dsde.firecloud.utils.TsvFormatterBenchmark.{EntityData, Inputs}
import org.broadinstitute.dsde.rawls.model.{AttributeBoolean, AttributeName, AttributeNumber, AttributeString, Entity}
import org.openjdk.jmh.annotations.{Benchmark, Scope, State}
import org.openjdk.jmh.infra.Blackhole

Expand All @@ -12,22 +14,72 @@ object TsvFormatterBenchmark {
val inputWithTab = "foo\tbar"
}

}
@State(Scope.Thread)
class EntityData {
val entityType: String = "sample"

class TsvFormatterBenchmark {
val model: ModelSchema = FlexibleModelSchema

@Benchmark
def tsvSafeStringNoTab(blackHole: Blackhole, inputs: Inputs): String = {
val result = TSVFormatter.tsvSafeString(inputs.inputNoTab)
blackHole.consume(result)
result
val headers: IndexedSeq[String] = IndexedSeq("sample_id", "col1", "col2", "fourth", "last")

val entities: Seq[Entity] = Seq(
Entity(
"1",
entityType,
Map(
AttributeName.withDefaultNS("col1") -> AttributeString("foo"),
AttributeName.withDefaultNS("col2") -> AttributeBoolean(true),
AttributeName.withDefaultNS("fourth") -> AttributeNumber(42),
AttributeName.withDefaultNS("last") -> AttributeString("gs://some-bucket/somefile.ext")
)
),
Entity(
"0005",
entityType,
Map(
AttributeName.withDefaultNS("col1") -> AttributeString("bar"),
AttributeName.withDefaultNS("col2") -> AttributeBoolean(false),
AttributeName.withDefaultNS("fourth") -> AttributeNumber(99),
AttributeName.withDefaultNS("last") -> AttributeString("gs://some-bucket/somefile2.ext")
)
),
Entity(
"789",
entityType,
Map(
AttributeName.withDefaultNS("col1") -> AttributeString("baz"),
AttributeName.withDefaultNS("col2") -> AttributeBoolean(true),
AttributeName.withDefaultNS("fourth") -> AttributeNumber(-123),
AttributeName.withDefaultNS("last") -> AttributeString("gs://some-bucket/somefile3.ext")
)
)
)
}

}

class TsvFormatterBenchmark {

@Benchmark
def tsvSafeStringWithTab(blackHole: Blackhole, inputs: Inputs): String = {
val result = TSVFormatter.tsvSafeString(inputs.inputWithTab)
def makeEntityRows(blackHole: Blackhole, entityData: EntityData): IndexedSeq[IndexedSeq[String]] = {
val result =
TSVFormatter.makeEntityRows(entityData.entityType, entityData.entities, entityData.headers)(entityData.model)
blackHole.consume(result)
result
}
//
// @Benchmark
// def tsvSafeStringNoTab(blackHole: Blackhole, inputs: Inputs): String = {
// val result = TSVFormatter.tsvSafeString(inputs.inputNoTab)
// blackHole.consume(result)
// result
// }
//
// @Benchmark
// def tsvSafeStringWithTab(blackHole: Blackhole, inputs: Inputs): String = {
// val result = TSVFormatter.tsvSafeString(inputs.inputWithTab)
// blackHole.consume(result)
// result
// }

}
Original file line number Diff line number Diff line change
Expand Up @@ -179,31 +179,78 @@ trait TSVFileSupport {
Creates an AttributeValue whose implementation is more closely tied to the value of the input.
*/
def stringToTypedAttribute(value: String): Attribute =
Try(java.lang.Integer.parseInt(value)) match {
case Success(intValue) => AttributeNumber(intValue)
case Failure(_) =>
Try(java.lang.Double.parseDouble(value)) match {
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
case Success(doubleValue)
if !Double.NegativeInfinity.equals(doubleValue)
&& !Double.PositiveInfinity.equals(doubleValue)
&& !Double.NaN.equals(doubleValue)
&& !matchesLiteral(value) =>
AttributeNumber(doubleValue)
case _ =>
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match {
case Success(booleanValue) => AttributeBoolean(booleanValue)
case Failure(_) =>
Try(value.parseJson.convertTo[AttributeEntityReference]) match {
case Success(ref) => ref
case Failure(_) => AttributeString(value)
}
}
}
// if this value starts and ends with a quote, it should always be treated as a string
if (value.startsWith("\"") && value.endsWith("\"")) {
AttributeString(value.substring(1, value.length - 1))
} else {
// else, inspect the value to find an appropriate datatype
toAttribute(value)
}

private val possibleNumbers = "0123456789+-"
private val possibleBooleans = "tfTF"
private val possibleReferences = "{"
private val possibleNonStrings = possibleNumbers + possibleBooleans + possibleReferences

def toAttribute(value: String): Attribute = {
val trimmed = value.trim
// empty string
if (trimmed.isEmpty) {
return AttributeString(value)
}

val firstChar = trimmed.charAt(0)

// first char doesn't match any known starters; this is a string
if (!possibleNonStrings.contains(firstChar)) {
return AttributeString(value)
}

// could this be a number?
if (possibleNumbers.contains(firstChar)) {
// Try(java.lang.Integer.parseInt(value)) match {
// case Success(intValue) => return AttributeNumber(intValue)
// case _ =>
Try(java.lang.Double.parseDouble(value)) match {
// because we represent AttributeNumber as a BigDecimal, and BigDecimal has no concept of infinity or NaN,
// if we find infinite/NaN numbers here, don't save them as AttributeNumber; instead let them fall through
// to AttributeString.
case Success(doubleValue)
if !Double.NegativeInfinity.equals(doubleValue)
&& !Double.PositiveInfinity.equals(doubleValue)
&& !Double.NaN.equals(doubleValue)
&& !matchesLiteral(value) =>
// this is a number. Now check if it is an int.
val floor = Math.floor(doubleValue)
if (doubleValue == floor) {
return AttributeNumber(doubleValue.intValue)
}
return AttributeNumber(doubleValue)
case _ => // noop
}
// }
}

// could this be a boolean?
if (possibleBooleans.contains(firstChar)) {
Try(BooleanUtils.toBoolean(value.toLowerCase, "true", "false")) match {
case Success(booleanValue) => return AttributeBoolean(booleanValue)
case _ => // noop
}
}

// could this be a reference?
if (possibleReferences.contains(firstChar)) {
Try(value.parseJson.convertTo[AttributeEntityReference]) match {
case Success(ref) => return ref
case _ => // noop
}
}

// it's a string.
AttributeString(value)
}

def checkForJson(value: String): Attribute =
Try(value.parseJson) match {
case Success(_: JsObject) => AttributeValueRawJson(value)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ package org.broadinstitute.dsde.firecloud.utils

import org.broadinstitute.dsde.rawls.model._
import org.broadinstitute.dsde.firecloud.model._
import org.broadinstitute.dsde.firecloud.service.TsvTypes
import org.broadinstitute.dsde.firecloud.service.{TSVFileSupport, TsvTypes}

object TSVFormatter {
object TSVFormatter extends TSVFileSupport {

// for serializing entity references
val attributeFormat = new AttributeFormat with PlainArrayAttributeListSerializer
Expand Down Expand Up @@ -36,23 +36,31 @@ object TSVFormatter {
entity.copy(attributes = attributes)
}

private def makeRow(entity: Entity, headerAttributes: IndexedSeq[AttributeName]): IndexedSeq[String] =
headerAttributes.map { colname =>
entity.attributes.get(colname) match {
case Some(attrValue) => tsvSafeAttribute(attrValue)
case None => ""
}
}

/**
* Generate a row of values in the same order as the headers.
*
* @param entity The Entity object to extract data from
* @param headerValues List of ordered header values to determine order of values
* @return IndexedSeq of ordered data fields
*/
private def makeRow(entity: Entity, headerValues: IndexedSeq[String]): IndexedSeq[String] = {
private def makeRow(entity: Entity, headerIndexes: Map[AttributeName, Int]): IndexedSeq[String] = {
val rowMap: Map[Int, String] = entity.attributes map { case (attributeName, attribute) =>
val columnPosition = headerValues.indexOf(AttributeName.toDelimitedName(attributeName))
val columnPosition = headerIndexes(attributeName)
val cellValue = tsvSafeAttribute(attribute)
columnPosition -> cellValue
}
// If there are entities that don't have a value for which there is a known header, that will
// be missing in the row. Fill up those positions with empty strings in that case.
val completedRowMap: IndexedSeq[(Int, String)] =
IndexedSeq.range(1, headerValues.size).map { i =>
IndexedSeq.range(1, headerIndexes.size).map { i =>
(i, rowMap.getOrElse(i, ""))
}

Expand All @@ -71,15 +79,19 @@ object TSVFormatter {
* @param value The input attribute to make safe
* @return the safe value
*/
def tsvSafeAttribute(attribute: Attribute): String = {
def tsvSafeAttribute(attribute: Attribute): String =
// AttributeStringifier works for everything except single entity references;
// it even works for AttributeEntityReferenceList
val intermediateString = attribute match {
attribute match {
case ref: AttributeEntityReference => attributeFormat.write(ref).compactPrint
case _ => AttributeStringifier(attribute)
case str: AttributeString =>
// if this string looks like a non-string datatype, such as '0005', surround it with quotes so it remains a string.
toAttribute(str.value) match {
case x: AttributeString if !str.value.contains(TSVParser.DELIMITER) => str.value
case _ => s"\"${str.value}\""
}
case _ => tsvSafeString(AttributeStringifier(attribute))
}
tsvSafeString(intermediateString)
}

/**
* Creates a string that is safe to output into a TSV as a cell value.
Expand Down Expand Up @@ -201,10 +213,18 @@ object TSVFormatter {
} else {
entities
}
// map of header->columnIndex for faster lookup inside makeRow
val headerIndexes: Map[AttributeName, Int] = headers.zipWithIndex.map { case (hdr, idx) =>
AttributeName.fromDelimitedName(hdr) -> idx
}.toMap

// headers as AttributeNames
val headerAttributes: IndexedSeq[AttributeName] = headers.map(AttributeName.fromDelimitedName)

// Turn them into rows
filteredEntities
.filter(_.entityType == entityType)
.map(entity => makeRow(entity, headers))
.map(entity => makeRow(entity, headerAttributes))
.toIndexedSeq
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ object TSVParser {
// empty string. These replace all nulls with the empty string.
settings.setNullValue("")
settings.setEmptyValue("")
// Configures the parser to keep enclosing quote characters in the values parsed from the input.
settings.setKeepQuotes(true)
new CsvParser(settings)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,8 @@ object MockTSVStrings {
).newlineSeparated

val windowsNewline = List(
List("foo", "bar").tabbed,
List("baz", "biz").tabbed
List("foo".quoted, "bar".quoted).tabbed,
List("baz".quoted, "biz".quoted).tabbed
).windowsNewlineSeparated

val missingFields1 = List(
Expand Down Expand Up @@ -310,8 +310,9 @@ object MockTSVLoadFiles {
val validEmptyStrWSAttribute = TSVLoadFile("workspace", Seq("a1"), Seq(Seq("")))
val validRemoveWSAttribute = TSVLoadFile("workspace", Seq("a1"), Seq(Seq("__DELETE__")))
val validRemoveAddAttribute = TSVLoadFile("workspace", Seq("a1", "a2"), Seq(Seq("__DELETE__", "v2")))
val validQuotedValues = TSVLoadFile("foo", Seq("foo", "bar"), Seq(Seq("baz", "biz")))
val validQuotedValuesWithTabs = TSVLoadFile("foo", Seq("foo", "bar"), Seq(Seq("baz", "this\thas\ttabs")))
val validQuotedValues = TSVLoadFile("\"foo\"", Seq("\"foo\"", "\"bar\""), Seq(Seq("\"baz\"", "\"biz\"")))
val validQuotedValuesWithTabs =
TSVLoadFile("\"foo\"", Seq("\"foo\"", "\"bar\""), Seq(Seq("\"baz\"", "\"this\thas\ttabs\"")))
val validNamespacedAttributes =
TSVLoadFile("foo", Seq("foo", "tag:foo", "bar", "tag:bar"), Seq(Seq("1", "2", "3", "4"), Seq("5", "6", "7", "8")))
val missingFields1 = TSVLoadFile("foo", Seq("foo", "bar", "baz"), Seq(Seq("biz", "", "buz")))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class ExportEntitiesByTypeServiceSpec
override val executionContext: ExecutionContext = scala.concurrent.ExecutionContext.Implicits.global

// On travis, slow processing causes the route to timeout and complete too quickly for the large content checks.
implicit override val routeTestTimeout: RouteTestTimeout = RouteTestTimeout(30.seconds)
implicit override val routeTestTimeout: RouteTestTimeout = RouteTestTimeout(120.seconds)

def actorRefFactory: ActorSystem = system

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,11 @@ class TSVFileSupportSpec extends AnyFreeSpec with TSVFileSupport {
Double.MaxValue.toString -> AttributeNumber(Double.MaxValue)
)
val stringTestCases = List("", "string", "true525600", ",")
val quotedStringTestCases = Map(
"\"0005\"" -> AttributeString("0005"),
"\"true\"" -> AttributeString("true"),
"\"embedded\ttabs\"" -> AttributeString("embedded\ttabs")
)
val referenceTestCases = Map(
"""{"entityType":"targetType","entityName":"targetName"}""" -> AttributeEntityReference("targetType",
"targetName"
Expand Down Expand Up @@ -155,6 +160,14 @@ class TSVFileSupportSpec extends AnyFreeSpec with TSVFileSupport {
}
}

"should teat any quoted values in the TSV as strings" in {
quotedStringTestCases foreach { case (input, expected) =>
withClue(s"should handle quoted string: $input") {
stringToTypedAttribute(input) shouldBe expected
}
}
}

"should detect string values when applicable" in {
stringTestCases foreach { str =>
withClue(s"should handle string: $str") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,8 @@ class TSVFormatterSpec extends AnyFreeSpec with ScalaFutures with Matchers with
val tsvSafeAttributeTestData = Map(
AttributeString("foo") -> "foo",
AttributeString(""""quoted string"""") -> """"quoted string"""",
AttributeString("0005") -> """"0005"""", // string that looks like a number should be quoted
AttributeString("true") -> """"true"""", // string that looks like a boolean should be quoted
AttributeNumber(123.45) -> "123.45",
AttributeBoolean(true) -> "true",
AttributeBoolean(false) -> "false",
Expand Down
Loading