From 6e09da34243c559577c7bde60d74494e64526eb8 Mon Sep 17 00:00:00 2001 From: JBGruber Date: Mon, 8 Jan 2024 08:55:12 +0100 Subject: [PATCH] overhaul time parsing (#31) --- R/rwhatsapp.R | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/R/rwhatsapp.R b/R/rwhatsapp.R index 7272b8c..bbf6d48 100755 --- a/R/rwhatsapp.R +++ b/R/rwhatsapp.R @@ -46,28 +46,20 @@ rwa_read <- function(x, chat_raw <- rwa_read_lines(x, verbose, start_time, encoding, ...) chat_raw <- chat_raw[!chat_raw == ""] - time <- stri_extract_first_regex( - str = chat_raw, - pattern = "^\\d{2,4}.\\d{2}.\\d{2,4} - \\d{2}:\\d{2}[^;]+;|^\\d{2,4}-\\d{2}-\\d{2,4}[^-]+ -" + formats <- c( + "^\\d{2,4}.\\d{2}.\\d{2,4} - \\d{2}:\\d{2}[^;]+;|^\\d{2,4}-\\d{2}-\\d{2,4}[^-]+ -", + "[^-]+ - ", + "[^]]+] ", + "^[^A-z]*\\d{1,2}:\\d{1,2}(\\sAM|\\sPM){0,1}" ) - if (sum(is.na(time)) > (length(time) * 0.9)) { - time <- stri_extract_first_regex( + time <- lapply(formats, function(f) { + stri_extract_first_regex( str = chat_raw, - pattern = "[^-]+ - " + pattern = f ) - } - if (sum(is.na(time)) > (length(time) * 0.9)) { - time <- stri_extract_first_regex( - str = chat_raw, - pattern = "[^]]+] " - ) - } - if (sum(is.na(time)) > (length(time) * 0.9)) { - time <- stri_extract_first_regex( - str = chat_raw, - pattern = "^[^A-z]*\\d{1,2}:\\d{1,2}(\\sAM|\\sPM){0,1}" - ) - } + }) + nnas <- vapply(time, function(t) sum(is.na(t)), FUN.VALUE = integer(1)) + time <- time[[which.min(nnas)]] proper_time <- stri_detect_regex( str = time,