Merge pull request #5 from ahgroup/strain-name-fun

Strain name fun
ahgroup · Aug 12, 2024 · f921aa0 · f921aa0
2 parents c6c858b + 2d64ce9
commit f921aa0
Show file tree

Hide file tree

Showing 15 changed files with 1,038 additions and 53 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -3,4 +3,5 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 ^LICENSE\.md$
-^README\.Rmd$
+^README\.qmd$
+^data-raw$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,16 +1,24 @@
 Package: hgp
 Type: Package
 Title: HandelGroupPackage
-Version: 0.0.1
+Version: 0.0.2
 Authors@R: 
     person("Zane", "Billings", , "wz.billings@gmail.com", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-0184-6134"))
-Description: Miscellaneous functions used for Handelgroup research
+Description: Miscellaneous functions used for Handelgroup research.
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Date: 2023-11-03
-Suggests: 
-    ggplot2
 URL: https://github.com/ahgroup/hgp
 BugReports: https://github.com/ahgroup/hgp/issues
+Depends: 
+    R (>= 2.10)
+Suggests: 
+    dplyr,
+    forcats,
+    ggplot2,
+    here,
+    readr,
+    tibble
+License: AGPL (>= 3)
diff --git a/LICENSE.md b/LICENSE.md
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,3 +1,4 @@
 # Generated by roxygen2: do not edit by hand
 
+export(replace_strain_names)
 export(theme_ms)
diff --git a/R/handelgroup_strain_names.R b/R/handelgroup_strain_names.R
@@ -0,0 +1,16 @@
+#' Handelgroup Standardized Strain Names Dataset
+#'
+#' A dataset containing various formats of the names for the influenza strains
+#' we use in our research.
+#'
+#' @format ## `handelgroup_strain_names`
+#' A data frame with 47 rows and 5 columns:
+#' \describe{
+#'   \item{subtype}{Whether the strain is H1N1 or H3N2.}
+#'   \item{analysis_name}{Strain name format used in clean_data.Rds.}
+#'   \item{genbank_strain_name}{The accepted full strain name, as found in
+#'   genbank.}
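+#'   \item{short_name}{The abbreviated name, usually 2-4 letters and the
+#'   last two digits of the year, useful for saving space in displays.}
+#'   \item{factor_order}{Integer used to order the levels of the name
+#'   factors; the "Overall" row uses 9999 so that it sorts last.}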
+#' }
+"handelgroup_strain_names"
diff --git a/R/replace_strain_names.R b/R/replace_strain_names.R
@@ -0,0 +1,89 @@
+#' Replace strain names with a different format
+#'
+#' Using the standardized list of handelgroup strain names from
+#' \code{\link{handelgroup_strain_names}}, pass in a vector of strain names of
+#' arbitrary length and return the same sequence of names in a different
+#' format.
+#'
+#' @param x A vector of strain names. Every element must appear in the column
+#' of \code{\link{handelgroup_strain_names}} selected by `from`, otherwise an
+#' error is raised.
+#' @param from Format of the names in the vector `x`. Should be one of
+#' "analysis", "full", or "short". See
+#' \code{\link{handelgroup_strain_names}} for the allowed names in each of the
+#' formats. If you want to transform a strain that is not currently in the
+#' strain list, you will need to add it and submit a PR to `hgp`!
+#' @param to Format of the returned names. Should be one of "short", "full",
+#' "analysis", or "subtype".
+#' @param drop If TRUE, levels of the returned factor variable are dropped
+#' (this needs the suggested package `forcats` to be installed). If FALSE, the
+#' level set of the factor will still contain every strain in
+#' \code{\link{handelgroup_strain_names}}, which is typically not desirable.
+#'
+#' @return A factor vector of the same length as `x`. If `from` and `to` are
+#' the same format, `x` itself is returned unchanged, with a warning.
+#' @export
+#'
+#' @examples
+#' replace_strain_names("CA/09", from = "short", to = "analysis")
+#'
+#' dat <- data.frame(s = c("CA/09", "MI/15"), x = c(1, 2))
+#' transform(
+#'     dat,
+#'     s_long = replace_strain_names(s, from = "short", to = "analysis")
+#' )
+replace_strain_names <- function(
+		x,
+		from = "analysis",
+		to = "short",
+		drop = TRUE
+) {
+	# Load the strain names data
+	strain_table <- hgp::handelgroup_strain_names
+
+	# Lookup tables from the user-facing format label to the column that
+	# stores that format. 'subtype' is only meaningful as an output format.
+	from_cols <- c(
+		analysis = "analysis_name",
+		full = "genbank_strain_name",
+		short = "short_name"
+	)
+	to_cols <- c(from_cols, subtype = "subtype")
+
+	# Validate both format arguments up front, so that a misspelled format is
+	# reported even when 'from' and 'to' happen to be equal.
+	from_ok <- is.character(from) && length(from) == 1L &&
+		from %in% names(from_cols)
+	if (!from_ok) {
+		stop("'from' should be 'analysis', 'full', or 'short'.")
+	}
+	to_ok <- is.character(to) && length(to) == 1L &&
+		to %in% names(to_cols)
+	if (!to_ok) {
+		stop("'to' should be 'analysis', 'full', 'short', or 'subtype'.")
+	}
+
+	# Nothing to translate if the formats are the same
+	if (from == to) {
+		warning("From and to are the same, returning original vector.")
+		return(x)
+	}
+
+	# Make sure all values of x exist in the strain names table
+	from_vec <- strain_table[[from_cols[[from]]]]
+	if (!all(x %in% from_vec)) {
+		stop(paste0(
+			"'x' should be a vector of ", from, " names that exist in the",
+			' virus-info sheet.'
+		))
+	}
+
+	# Row of the strain names table for each element of x, then the
+	# requested format from those rows
+	locs <- match(x, from_vec)
+	vals <- strain_table[[to_cols[[to]]]][locs]
+
+	# If requested, remove unseen factor levels. forcats is only a suggested
+	# dependency, so check for it here rather than failing obscurely.
+	if (isTRUE(drop)) {
+		if (!requireNamespace("forcats", quietly = TRUE)) {
+			stop(
+				"Package 'forcats' is required to drop unused factor levels; ",
+				"install it or use 'drop = FALSE'.",
+				call. = FALSE
+			)
+		}
+		vals <- forcats::fct_drop(vals)
+	}
+
+	vals
+}
diff --git a/README.md b/README.md
@@ -1,17 +1,17 @@
 
-<!-- README.md is generated from README.qmd. Please edit that file -->
 
+<!-- README.md is generated from README.qmd. Please edit that file -->
 <!-- ASCII ART BANNER TITLE -->
-```
-  _    _                 _      _  _____                       _____           _                       ___               __  
- | |  | |               | |    | |/ ____|                     |  __ \         | |                     / / |              \ \ 
- | |__| | __ _ _ __   __| | ___| | |  __ _ __ ___  _   _ _ __ | |__) |_ _  ___| | ____ _  __ _  ___  | || |__   __ _ _ __ | |
- |  __  |/ _` | '_ \ / _` |/ _ \ | | |_ | '__/ _ \| | | | '_ \|  ___/ _` |/ __| |/ / _` |/ _` |/ _ \ | || '_ \ / _` | '_ \| |
- | |  | | (_| | | | | (_| |  __/ | |__| | | | (_) | |_| | |_) | |  | (_| | (__|   < (_| | (_| |  __/ | || | | | (_| | |_) | |
- |_|  |_|\__,_|_| |_|\__,_|\___|_|\_____|_|  \___/ \__,_| .__/|_|   \__,_|\___|_|\_\__,_|\__, |\___| | ||_| |_|\__, | .__/| |
-                                                        | |                               __/ |       \_\       __/ | |  /_/ 
-                                                        |_|                              |___/                 |___/|_|   
-```
+
+      _    _                 _      _  _____                       _____           _                    
+     | |  | |               | |    | |/ ____|                     |  __ \         | |                   
+     | |__| | __ _ _ __   __| | ___| | |  __ _ __ ___  _   _ _ __ | |__) |_ _  ___| | ____ _  __ _  ___ 
+     |  __  |/ _` | '_ \ / _` |/ _ \ | | |_ | '__/ _ \| | | | '_ \|  ___/ _` |/ __| |/ / _` |/ _` |/ _ \
+     | |  | | (_| | | | | (_| |  __/ | |__| | | | (_) | |_| | |_) | |  | (_| | (__|   < (_| | (_| |  __/
+     |_|  |_|\__,_|_| |_|\__,_|\___|_|\_____|_|  \___/ \__,_| .__/|_|   \__,_|\___|_|\_\__,_|\__, |\___|
+                                                            | |                               __/ |     
+                                                            |_|                              |___/      
+
 <!-- END OF TITLE -->
 
 # `hgp: HandelGroupPackage`
@@ -20,8 +20,13 @@
 <!-- badges: end -->
 
 `hgp` is a package for storing utility functions for use across
-handelgroup. Right now, `hgp` only contains a `ggplot2` theme. But we
-will update this description as we add more stuff.
+handelgroup. Right now, `hgp` contains the following utilities:
+
+- A group-standard `ggplot2` theme function;
+- A set of functions for working with HAI data;
+- A function to standardize influenza strain names for the strains used
+  in the datasets we have available to us; along with a corresponding
+  data set of standardized strain names.
 
 **Because this repo is public, absolutely NO sensitive or confidential
 information should be stored here. This repo is strictly for shared
@@ -33,8 +38,8 @@ You can install the development version of hgp from
 [GitHub](https://github.com/) with:
 
 ``` r
-# install.packages("devtools")
-devtools::install_github("ahgroup/hgp")
+# install.packages("remotes")
+remotes::install_github("ahgroup/hgp")
 ```
 
 Or if you are in a repository with `renv` enabled (recommended):
@@ -79,8 +84,6 @@ ggplot2::theme_set(hgp::theme_ms())
 - Please run `devtools::check()` before merging any new functionality to
   the main branch and fix any resulting messages. It is unnecessary to
   `build` the package, we just need to ensure that any checks pass.
-  - Note that as of `2023-11-03`, we have not decided on a package
-    license, so you will get one warning in the check results for that.
 - `renv` is initialized for this repository using the EXPLICIT snapshot
   mode.
   - If you need to import a new package, you MUST update the `NAMESPACE`

diff --git a/README.qmd b/README.qmd
@@ -13,14 +13,32 @@ knitr::opts_chunk$set(
 )
 ```
 
+<!-- ASCII ART BANNER TITLE -->
+```
+  _    _                 _      _  _____                       _____           _                    
+ | |  | |               | |    | |/ ____|                     |  __ \         | |                   
+ | |__| | __ _ _ __   __| | ___| | |  __ _ __ ___  _   _ _ __ | |__) |_ _  ___| | ____ _  __ _  ___ 
+ |  __  |/ _` | '_ \ / _` |/ _ \ | | |_ | '__/ _ \| | | | '_ \|  ___/ _` |/ __| |/ / _` |/ _` |/ _ \
+ | |  | | (_| | | | | (_| |  __/ | |__| | | | (_) | |_| | |_) | |  | (_| | (__|   < (_| | (_| |  __/
+ |_|  |_|\__,_|_| |_|\__,_|\___|_|\_____|_|  \___/ \__,_| .__/|_|   \__,_|\___|_|\_\__,_|\__, |\___|
+                                                        | |                               __/ |     
+                                                        |_|                              |___/      
+```
+<!-- END OF TITLE -->
+
 # `hgp: HandelGroupPackage`
 
 <!-- badges: start -->
 <!-- badges: end -->
 
 `hgp` is a package for storing utility functions for use across handelgroup.
-Right now, `hgp` only contains a `ggplot2` theme. But we will update this
-description as we add more stuff.
+Right now, `hgp` contains the following utilities:
+
+* A group-standard `ggplot2` theme function;
+* A set of functions for working with HAI data;
+* A function to standardize influenza strain names for the strains used in the
+datasets we have available to us; along with a corresponding data set of
+standardized strain names.
 
 **Because this repo is public, absolutely NO sensitive or confidential
 information should be stored here. This repo is strictly for shared utilities.**
@@ -30,8 +48,8 @@ information should be stored here. This repo is strictly for shared utilities.**
 You can install the development version of hgp from [GitHub](https://github.com/) with:
 
 ``` r
-# install.packages("devtools")
-devtools::install_github("ahgroup/hgp")
+# install.packages("remotes")
+remotes::install_github("ahgroup/hgp")
 ```
 
 Or if you are in a repository with `renv` enabled (recommended):
@@ -75,8 +93,6 @@ should use `devtools::document()` to generate the documentation files.
 * Please run `devtools::check()` before merging any new functionality to the
 main branch and fix any resulting messages. It is unnecessary to `build` the
 package, we just need to ensure that any checks pass.
-	* Note that as of `2023-11-03`, we have not decided on a package license,
-	so you will get one warning in the check results for that.
 * `renv` is initialized for this repository using the EXPLICIT snapshot mode.
 	* If you need to import a new package, you MUST update the `NAMESPACE` file
 	appropriately before invoking `renv::snapshot()`.

diff --git a/data-raw/handelgroup-strain-names.csv b/data-raw/handelgroup-strain-names.csv
@@ -0,0 +1,47 @@
+subtype,analysis_name,genbank_strain_name,short_name,factor_order,vaccine_strain
+h1,H1N1-South Carolina-1918,A/H1N1/South Carolina/1/1918,SC/18,1,FALSE
+h1,H1N1-Puerto Rico-1934,A/H1N1/Puerto Rico/8/1934,PR/34,2,FALSE
+h1,H1N1-Weiss-1943,A/H1N1/Weiss/43,Wei/43,3,FALSE
+h1,H1N1-Fort Monmouth-1947,A/H1N1/Fort Monmouth/1/1947,FM/47,4,FALSE
+h1,H1N1-Denver-1957,A/H1N1/Denver/1957,Den/57,5,FALSE
+h1,H1N1-New Jersey-1976,A/H1N1/New Jersey/8/1976,NJ/76,6,FALSE
+h1,H1N1-Ussr-1977,A/H1N1/Ussr/90/1977,USSR/77,7,FALSE
+h1,H1N1-Brazil-1978,A/H1N1/Brazil/11/1978,Bra/78,8,FALSE
+h1,H1N1-California-1978,A/H1N1/California/10/1978,CA/78,9,FALSE
+h1,H1N1-Chile-1983,A/H1N1/Chile/1/1983,Chi/83,10,FALSE
+h1,H1N1-Singapore-1986,A/H1N1/Singapore/6/1986,Sing/86,11,FALSE
+h1,H1N1-Texas-1991,A/H1N1/Texas/36/1991,TX/91,12,FALSE
+h1,H1N1-Beijing-1995,A/H1N1/Beijing/262/1995,Bei/95,13,FALSE
+h1,H1N1-New Caledonia-1999,A/H1N1/New Caledonia/20/1999,NC/99,14,FALSE
+h1,H1N1-Solomon Islands-2006,A/H1N1/Solomon Islands/3/2006,SI/06,15,FALSE
+h1,H1N1-Brisbane-2007,A/H1N1/Brisbane/59/2007,Bris/07,16,FALSE
+h1,H1N1-California-2009,A/H1N1/California/07/2009,CA/09,17,TRUE
+h1,H1N1-Michigan-2015,A/H1N1/Michigan/45/2015,MI/15,18,TRUE
+h1,H1N1-Brisbane-2018,A/H1N1/Brisbane/02/2018,Bris/18,19,TRUE
+h1,H1N1-Guangdong Maonan-2019,A/H1N1/Guangdong-Maonan/SWL1536/2019,GD/19,20,TRUE
+h1,H1N1-Victoria-2019,A/H1N1/Victoria/2570/2019,Vic/19,21,TRUE
+h3,H3N2-Hong Kong-1968,A/H3N2/Hong Kong/8/1968,HK/68,22,FALSE
+h3,H3N2-Port Chalmers-1973,A/H3N2/Port Chalmers/1/1973,PC/73,23,FALSE
+h3,H3N2-Texas-1977,A/H3N2/Texas/1/1977,TX/77,24,FALSE
+h3,H3N2-Mississippi-1985,A/H3N2/Mississippi/1/1985,MI/85,25,FALSE
+h3,H3N2-Sichuan-1987,A/H3N2/Sichuan/2/1987,Sich/87,26,FALSE
+h3,H3N2-Shangdong-1993,A/H3N2/Shangdong/9/1993,Shan/93,27,FALSE
+h3,H3N2-Nanchang-1995,A/H3N2/Nanchang/933/1995,Nan/95,28,FALSE
+h3,H3N2-Sydney-1997,A/H3N2/Sydney/5/1997,Syd/97,29,FALSE
+h3,H3N2-Panama-1999,A/H3N2/Panama/2007/1999,Pan/99,30,FALSE
+h3,H3N2-Fujian-2002,A/H3N2/Fujian/411/2002,Fuj/02,31,FALSE
+h3,H3N2-New York-2004,A/H3N2/New York/55/2004,NY/04,32,FALSE
+h3,H3N2-Brisbane-2007,A/H3N2/Brisbane/10/2007,Br/07,33,TRUE
+h3,H3N2-Wisconsin-2005,A/H3N2/Wisconsin/67/2005,WI/05,34,FALSE
+h3,H3N2-Uruguay-2007,A/H3N2/Uruguay/716/2007,Uru/07,35,FALSE
+h3,H3N2-Perth-2009,A/H3N2/Perth/16/2009,Per/09,36,FALSE
+h3,H3N2-Victoria-2011,A/H3N2/Victoria/361/2011,Vic/11,37,FALSE
+h3,H3N2-Texas-2012,A/H3N2/Texas/50/2012,TX/12,38,TRUE
+h3,H3N2-Switzerland-2013,A/H3N2/Switzerland/9715293/2013,Switz/13,39,TRUE
+h3,H3N2-Hong Kong-2014,A/H3N2/Hong Kong/4801/2014,HK/14,40,TRUE
+h3,H3N2-Singapore-2016,A/H3N2/Singapore/infimh-16-0019/2016,Sing/16,41,TRUE
+h3,H3N2-Kansas-2017,A/H3N2/Kansas/14/2017,KS/17,42,TRUE
+h3,H3N2-Hong Kong-2019,A/H3N2/Hong Kong/2671/2019,HK/19,43,TRUE
+h3,H3N2-South Australia-2019,A/H3N2/South Australia/34/2019,SA/19,44,TRUE
+h3,H3N2-Tasmania-2020,A/H3N2/Tasmania/503/2020,Tas/20,45,TRUE
+h3,H3N2-Darwin-2021,A/H3N2/Darwin/9/2021,Dar/21,46,TRUE
diff --git a/data-raw/strain-names-data-prep.R b/data-raw/strain-names-data-prep.R
@@ -0,0 +1,38 @@
+###
+# Code to prepare the strain names data for package inclusion
+# Zane Billings
+# 2024-08-12
+# The strain names data is a table of strain names, currently the ones that
+# are used in UGAFluVac. We encourage updates to the raw CSV file to add
+# strain names that are used in other handelgroup datasets.
+###
+
+# Read the raw CSV. The col_types string gives one type per column, in file
+# order: subtype as factor, the three name columns as character,
+# factor_order as integer, and vaccine_strain as logical.
+handelgroup_strain_names <- readr::read_csv(
+	here::here("data-raw", "handelgroup-strain-names.csv"),
+	col_types = "fcccil"
+)
+
+# vaccine_strain is not shipped with the packaged data
+handelgroup_strain_names <- dplyr::select(
+	handelgroup_strain_names,
+	-vaccine_strain
+)
+
+# Add an "Overall" pseudo-strain; its large factor_order makes it sort after
+# every real strain, which keeps the overall entry for CATEs easy to place
+handelgroup_strain_names <- tibble::add_row(
+	handelgroup_strain_names,
+	subtype = "",
+	analysis_name = "Overall",
+	short_name = "Overall",
+	genbank_strain_name = "Overall",
+	factor_order = 9999L
+)
+
+# Relabel the subtypes and turn each name column into a factor whose levels
+# follow factor_order
+handelgroup_strain_names <- dplyr::mutate(
+	handelgroup_strain_names,
+	subtype = factor(
+		as.character(subtype),
+		levels = c("h1", "h3", ""),
+		labels = c("H1N1", "H3N2", "")
+	),
+	analysis_name = forcats::fct_reorder(analysis_name, factor_order),
+	genbank_strain_name = forcats::fct_reorder(
+		genbank_strain_name,
+		factor_order
+	),
+	short_name = forcats::fct_reorder(short_name, factor_order)
+)
+
+# Write data/handelgroup_strain_names.rda for the package
+usethis::use_data(handelgroup_strain_names, overwrite = TRUE)
diff --git a/data/handelgroup_strain_names.rda b/data/handelgroup_strain_names.rda
diff --git a/man/handelgroup_strain_names.Rd b/man/handelgroup_strain_names.Rd