updated version

trinker · Jan 1, 2017 · f0ddaa2 · f0ddaa2
1 parent caa3a15
commit f0ddaa2
Show file tree

Hide file tree

Showing 18 changed files with 138 additions and 113 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,28 +1,20 @@
-language:      c
+language: r
+sudo: false
+cache: packages
+r_build_args: "--resave-data=best"
+env:
+   global:
+     - DISPLAY=:99.0
+
+r_github_packages:
+  - jimhester/covr
+  - kbenoit/quanteda
+  - Rdatatable/data.table
 
-sudo: required
 before_install:
-  - curl -OL http://raw.github.com/craigcitro/r-travis/master/scripts/travis-tool.sh
-  - chmod 755 ./travis-tool.sh
-  - ./travis-tool.sh bootstrap
-install:
   - sh -e /etc/init.d/xvfb start
-  - ./travis-tool.sh aptget_install r-cran-xml
-  - ./travis-tool.sh install_github hadley/devtools
-  - ./travis-tool.sh install_github Rdatatable/data.table
-  - ./travis-tool.sh install_github kbenoit/quanteda
-  - ./travis-tool.sh install_deps
-  - ./travis-tool.sh github_package jimhester/covr
-script:      ./travis-tool.sh run_tests
+
 after_success:
-  - Rscript -e 'library(covr);coveralls()'
-notifications:
-  email:
-    on_success:      change
-    on_failure:      change
-env:
-   global:
-     - R_BUILD_ARGS="--resave-data=best"
-     - R_CHECK_ARGS="--as-cran"
-     - DISPLAY=:99.0
-     - BOOTSTRAP_LATEX=1
+  - Rscript -e 'covr::coveralls()'
+
+
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: Harness the power of 'quanteda', 'data.table' & 'stringi'
         TermDocumentMatrix data structures.
 Depends: R (>= 3.2.2)
 Suggests: testthat
-Imports: data.table (>= 1.9.5), quanteda, slam, SnowballC, tm
+Imports: data.table (>= 1.9.5), quanteda, slam, SnowballC, stats, tm
 Date: 2016-12-31
 License: GPL-2
 LazyData: TRUE

diff --git a/NEWS b/NEWS
@@ -17,21 +17,11 @@ And constructed with the following guidelines:
 * Bug fixes and misc changes bumps the patch
 
 
-gofastr 0.1.0
+gofastr 0.1.0 - 0.1.1
 ----------------------------------------------------------------
 
-BUG FIXES
-
-NEW FEATURES
-
 * `partial_republican_debates_2015` data set added.
 
-MINOR FEATURES
-
-IMPROVEMENTS
-
-CHANGES
-
 gofastr 0.0.1
 ----------------------------------------------------------------
 

diff --git a/NEWS.md b/NEWS.md
@@ -20,18 +20,8 @@ And constructed with the following guidelines:
 gofastr 0.1.0
 ----------------------------------------------------------------
 
-**BUG FIXES**
-
-**NEW FEATURES**
-
 * `partial_republican_debates_2015` data set added.
 
-**MINOR FEATURES**
-
-IMPROVEMENTS
-
-**CHANGES**
-
 gofastr 0.0.1
 ----------------------------------------------------------------
 

diff --git a/R/filter_tf_idf.R b/R/filter_tf_idf.R
@@ -38,7 +38,7 @@ filter_tf_idf.DocumentTermMatrix <- function(x, min = NULL, verbose = FALSE){
             log2(tm::nDocs(x)/slam::col_sums(x > 0))
 
     if (is.null(min)) {
-        min <- median(term_tfidf)
+        min <- stats::median(term_tfidf)
     }
     if (isTRUE(verbose)) {
         cat("Summary stats for the tf-idf:\n\n")
@@ -56,7 +56,7 @@ filter_tf_idf.TermDocumentMatrix  <- function(x, min = NULL, verbose = FALSE){
             log2(tm::nDocs(x)/slam::row_sums(x > 0))
 
     if (is.null(min)) {
-        min <- median(term_tfidf)
+        min <- stats::median(term_tfidf)
     }
     if (isTRUE(verbose)) {
         cat("Summary stats for the tf-idf:\n\n")

diff --git a/R/gofastr-package.R b/R/gofastr-package.R
@@ -1,4 +1,4 @@
-#' Fast DocumentTermMatrix and TermDocumentMatric Creation
+#' Fast DocumentTermMatrix and TermDocumentMatrix Creation
 #'
 #' This package does one thing...It harnesses the power of \pkg{quanteda},
 #' \pkg{data.table} & \pkg{stringi} to quickly generate \pkg{tm}

diff --git a/R/remove_stopwords.R b/R/remove_stopwords.R
@@ -5,7 +5,7 @@
 #'
 #' @param x A \code{\link[tm]{TermDocumentMatrix}} or \code{\link[tm]{DocumentTermMatrix}}.
 #' @param stopwords A vector of stopwords to remove.
-#' @param min.char The minial length character for retained words.
+#' @param min.char The minimal length character for retained words.
 #' @param max.char The maximum length character for retained words.
 #' @param stem Logical.  If \code{TRUE} the \code{stopwords} will be stemmed.
 #' @param denumber Logical.  If \code{TRUE} numbers will be excluded.
@@ -76,7 +76,7 @@ regex_pattern <-"(?<=^| )[-.]*\\d+(?:\\.\\d+)?(?= |\\.?$)|\\d+(?:,\\d{3})+(\\.\\
 #' Remove Stopwords from a TermDocumentMatrix/DocumentTermMatrix
 #'
 #' \code{prep_stopwords} - Join multiple vectors of words, convert to lower case,
-#' and return sorted uniue words.
+#' and return sorted unique words.
 #'
 #' @rdname remove_stopwords
 #' @export

diff --git a/R/sub_in_na.R b/R/sub_in_na.R
@@ -1,9 +1,10 @@
 #' Regex Sub to Missing
 #'
-#' USe a regex to identify elements to sub out for missing \code{NA}.  USeful
-#' within a \pkg{magrittr} pipeline.
+#' Use a regex to identify elements to sub out for missing \code{NA}.  Useful
+#' within a \pkg{magrittr} pipeline before producing the
+#' \code{\link[tm]{TermDocumentMatrix}} or \code{\link[tm]{DocumentTermMatrix}}.
 #'
-#' @param x A vector.
+#' @param x A vector of text strings.
 #' @param regex A regex to match strings in a vector.
 #' @param \ldots Other arguments passed to \code{\link[base]{grepl}}
 #' @return Returns a vector with \code{NA}s inserted.
@@ -12,6 +13,18 @@
 #' x <- c("45", "..", "", "   ", "dog")
 #' sub_in_na(x)
 #' sub_in_na(x, "^\\s*$")
+#'
+#' \dontrun{
+#' library(tidyverse)
+#' x %>%
+#'     q_dtm() %>%
+#'     as.matrix()
+#'
+#' x %>%
+#'     sub_in_na() %>%
+#'     q_dtm() %>%
+#'     as.matrix()
+#' }
 sub_in_na <- function(x, regex = "^[^A-Za-z]*$", ...){
   x <- unlist(x)
   x[grepl(regex, x, ...)] <- NA

diff --git a/README.Rmd b/README.Rmd
@@ -34,9 +34,9 @@ knitr::opts_chunk$set(fig.path = "inst/figure/")
 <img src="inst/gofastr_logo/r_gofastr.png" width="150" alt="gofastr Logo">  
 
 
-**gofastr** is designed to do one thing really well...It harnesses the power of **data.table** and **stringi** to quickly generate **tm** `DocumentTermMatrix` and `TermDocumentMatrix` data structures. 
+**gofastr** is designed to do one thing really well...It harnesses the power of **data.table** and **stringi** to quickly generate **tm** `DocumentTermMatrix` and `TermDocumentMatrix` data structures. There are two ways in which time is meaningful to an analyst: (a) coding time, or the time spent writing code and (b) computational run time, or the time the computer takes to run the code.  Ideally, we want to minimize both of these sources of time expenditure.  The **gofastr** package is my attempt to reduce the time an analyst takes to turn raw text into an analysis-ready data format.
 
-In my work I often get data in the form of large .csv files.  Additionally, most of the higher level analysis of text I undertake utilizes a TermDocumentMatrix or DocumentTermMatrix as the input data.  Generally, the `Corpus` generation/structure is an unnecessary step that requires additional run time.  **gofastr** skips this step and uses the power of [**quanteda**](https://github.com/kbenoit/quanteda) (which in turn wraps **data.table**, **stringi**, & **Matrix**) to quickly make the `DocumentTermMatrix` or `TermDocumentMatrix` data structures directly.  
+In my work I often get data in the form of large .csv files.  Additionally, most of the higher-level analysis of text I undertake utilizes a `TermDocumentMatrix` or `DocumentTermMatrix` as the input data.  Generally, the **tm** package's `Corpus` structure is an unnecessary step in building a usable data structure that requires additional coding and run time.  **gofastr** skips this step and uses the power of [**quanteda**](https://github.com/kbenoit/quanteda) (which in turn wraps **data.table**, **stringi**, & **Matrix**) to quickly make `DocumentTermMatrix` or `TermDocumentMatrix` structures that are fast to code up and fast for the computer to build.  
 
 # Function Usage