-
Notifications
You must be signed in to change notification settings - Fork 37
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use known interface when parsing xml #621
Conversation
This is currently failing unit test:
|
We should also hook into |
I fixed a couple of issues:
|
Currently this PR seems to be breaking paws with S3 redirects and failing to return response from AWS. svc = paws.storage::s3()
resp = svc$list_objects_v2(Bucket = "voltrondata-labs-datasets")
#> Error in rawToChar(value): argument 'x' must be a raw vector Created on 2023-08-09 with reprex v2.0.2 svc = paws.storage::s3(config = list(credentials = list(anonymous = T), region = "us-east-1"))
resp = svc$list_objects_v2(Bucket = "voltrondata-labs-datasets")
resp
#> $IsTruncated
#> logical(0)
#>
#> $Contents
#> list()
#>
#> $Name
#> character(0)
#>
#> $Prefix
#> character(0)
#>
#> $Delimiter
#> character(0)
#>
#> $MaxKeys
#> numeric(0)
#>
#> $CommonPrefixes
#> list()
#>
#> $EncodingType
#> character(0)
#>
#> $KeyCount
#> numeric(0)
#>
#> $ContinuationToken
#> character(0)
#>
#> $NextContinuationToken
#> character(0)
#>
#> $StartAfter
#> character(0) Created on 2023-08-09 with reprex v2.0.2 |
I believe I have a working solution developed from initially proposal. This method I use the xml2:xml_contents similar to xml2::as_list and then map the interface onto it. restxml_unmarshal_2 <- function(request) {
t <- paws.common:::rest_payload_type(request$data)
if (t == "structure" || t == "") {
data <- request$http_response$body
interface <- request$data
request$data <- xml_unmarshal_2(data, interface)
} else {
request <- paws.common:::rest_unmarshal(request)
}
return(request)
}
# Decode raw bytes XML into an R list object.
xml_unmarshal_2 <- function(raw_data, interface = NULL) {
if (paws.common::is_empty(raw_data)) {
return(interface)
}
data <- xml2::read_xml(raw_data, encoding = "utf8")
out <- xml_parse_2(data, interface)
return(out)
}
xml_parse_2 <- function(data, interface) {
nms <- names(interface)
contents <- xml2::xml_contents(data)
contents_nms <- xml2::xml_name(contents)
result <- list()
for (nm in nms) {
interface_i <- interface[[nm]]
tags_i <- attr(interface_i, "tags")
if (!is.null(tags_i$locationName)) {
key <- tags_i$locationName
} else {
key <- nm
}
found <- (key == contents_nms)
xml_elts <- contents[found]
result[[nm]] <- (
if (any(found)) {
parse_xml_elt_2(xml_elts, interface_i, tags_i)
} else {
default_parse_xml(interface_i, tags_i)
}
)
}
return(result)
}
parse_xml_elt_2 <- function(xml_elts, interface_i, tags_i) {
n <- length(xml_elts)
type <- tags_i[["type"]]
flattened <- tags_i[["flattened"]]
t <- paws.common:::type(interface_i)
parse_fn <- switch(t,
structure = xml_parse_structure_2,
map = xml_parse_map_2,
list = xml_parse_list_2,
xml_parse_scalar_2
)
result <- parse_fn(xml_elts, interface_i, type)
# the `is.list()` check is necessary because e.g. `CheckSumAlgorithm` has
# a list interface though it isn't a list?!
if (isTRUE(flattened) && is.list(result)) {
result <- transpose(result)
}
return(result)
}
xml_parse_structure_2 <- function(xml_elts, interface, tags_i, type = NULL) {
result <- xml_parse_2(xml_elts, interface)
attr(result, "tags") <- tags_i
return(result)
}
xml_parse_map_2 <- function(xml_elts, interface, tags_i, type = NULL) {
result <- xml_parse_2(xml_elts, interface)
attr(result, "tags") <- tags_i
return(result)
}
xml_parse_list_2 <- function(xml_elts, interface, type = NULL) {
xml_parse_2(xml_elts, interface[[1]])
}
xml_parse_scalar_2 <- function(xml_elts, interface, type = NULL) {
results <- vapply(xml_elts, xml2::xml_text, FUN.VALUE = character(1))
convert <- switch(type,
blob = paws.common:::base64_to_raw,
boolean = as.logical,
double = as.numeric,
float = as.numeric,
integer = as.numeric,
long = as.numeric,
timestamp = function(x) paws.common:::as_timestamp(x, format = "iso8601"),
as.character
)
result <- convert(results)
names(result) <- names(interface)
return(result)
}
default_parse_xml <- function(interface_i, tags_i) {
type <- tags_i[["type"]]
t <- paws.common:::type(interface_i)
parse_fn <- switch(t,
structure = default_parse_structure,
map = default_parse_map,
list = default_parse_list,
default_parse_scalar
)
return(parse_fn(interface_i, type))
}
default_parse_structure <- function(interface_i, type = NULL) {
nms <- names(interface_i)
result <- list()
for (nm in nms) {
tags_i <- attr(interface_i[[nm]], "tags")
result[[nm]] <- default_parse_xml(interface_i[[nm]], tags_i)
}
return(list(result))
}
default_parse_map <- function(interface_i, type = NULL) {
nms <- names(interface_i)
result <- list()
for (nm in nms) {
tags_i <- attr(interface_i[[nm]], "tags")
result[[nm]] <- default_parse_xml(interface_i[[nm]], tags_i)
}
return(list(result))
}
default_parse_list <- function(interface_i, type = NULL) {
interface_i <- interface_i[[1]]
nms <- names(interface_i)
result <- list()
for (nm in nms) {
tags_i <- attr(interface_i[[nm]], "tags")
result[[nm]] <- default_parse_xml(interface_i[[nm]], tags_i)
}
return(list(result))
}
default_parse_scalar <- function(interface_i, type = NULL) {
result <- switch(type,
integer = numeric(),
double = numeric(),
long = numeric(),
float = numeric(),
timestamp = as.POSIXct(NULL),
boolean = logical(),
character()
)
return(result)
}
transpose <- function(x) {
if (!is.list(x)) {
return(x)
}
n_col <- length(x)
if (n_col == 0) {
return(list())
}
n_row <- length(x[[1]])
if (n_row == 0) {
return(list())
}
out <- vector("list", length = n_row)
col_seq <- seq.int(n_col, 1)
vals <- vector("list", length = n_col)
names(vals) <- names(x)
for (row in seq.int(1, n_row)) {
for (col in col_seq) {
vals[col] <- list(rep_len(x[[col]], n_row)[[row]])
}
out[[row]] <- vals
}
names(out) <- names(x[[1]])
return(out)
} request <- readRDS("request.RDS")
raw_data <- request$http_response$body
interface <- request$data
data <- xml2::read_xml(raw_data, encoding = "utf8")
bm <- bench::mark(
resp1 = paws.common:::restxml_unmarshal(request)$data,
resp2 = restxml_unmarshal_2(request)$data,
check = F,
iterations = 1000
) Note: request.RDS is a list_objects_v2 request from my personal AWS s3 bucket. I will try and get example from a public aws s3 bucket for further benchmarking. TODO:
|
It was way faster until I got an error 😉 The response from my bucket seems to be different to the response from the bucket
|
Just a quick update. I have nearly got all unit tests working. I am struggling to get the code working for the final unit tests. I believe it should be working for most cases. @mgirlich do you mind doing an another tempt as profvis::profvis({
library(paws.storage)
svc <- s3()
resp <- svc$list_objects_v2(Bucket = "voltrondata-labs-datasets")
}) We are so close in getting this working as intended. So close but so far at the same time 😆 |
So much better! 🎉 CRAN PR I noticed two (minor) things:
This is basically irrelevant to me but it might indicate there are still some bugs in parsing (or maybe even fixes?) |
You are quite right. I am still squashing the final bugs. I just need to make sure the all the data types are returned correctly. I believe it is working fine for all units test bar from 4. Nearly there 😆 |
Unfortunately, the small fixes cost quite a lot of performance overall (though still much better than before this PR!):
|
Ah nuts, lets see if we can claw any of the performance back :) |
@mgirlich is transpose the bottle neck here? 🤔 If you replace the transpose with the |
Ok I have made some improvements. It is looking promising however I don't know how it scales up. @mgirlich do you mind doing another benchmark for me? (sorry for keep pestering you for the benchmarks) |
This looks much better now CRAN: 19710 ms / 17250 ms for |
That is good news, can you put the profvis break down? I just want to see what is bottle necking us now. And see if we can get any more improvements out of it. |
I attached a Rprof file, so you can also easily analyse this 😄 |
@mgirlich I noticed xml_name was taking alot of time roughly 880ms. I believe it was down to result <- lapply(contents, function(x) {
xml_elt <- xml2::xml_contents(x)
xml_parse(xml_elt, interface_i[[1]], xml2::xml_name(xml_elt))
}) I have now opted to get xml names from the xml_nms <- xml2::xml_name(xml2::xml_child(contents[[1]]))
result <- lapply(contents, function(x) {
xml_elt <- xml2::xml_contents(x)
xml_parse(xml_elt, interface_i[[1]], xml_nms)
}) @mgirlich do you mind doing a benchmark to see if we have got any performance improvements at all :P |
Don't try to hard for now. I am now also looking into improving the performance of {xml2}. There are a couple of easy performance improvements that should give nice boost here as well. |
If that is the case then I will merge these improvements and start preparing paws.common to be released to the cran. |
This improves the performance noted in #619. Still the most time is spend in parsing but I got a 3 times performance improvement.
I'm not sure there is an easy way to improve the performance further. It might be doable by using smart xpaths but I don't have much experience with that. Maybe there is a chance to improve the performance of
xml_find_all()
itself.I didn't manage to load the paws package properly and run the tests. So, I only tested this for
list_objects_v2()
EDIT: we might do a bit better by using
xml_find_first()
instead ofxml_find_all()
in some situations.