read_data.R

check_read_dataset <- function(ifname, ofname, SXY.upper_index = c(5, 10, 18), is.TP = FALSE, is.P = FALSE){
  # READING IN SXY: study design (S) and/or covariates (X) and species data (Y) 
  SXY = read.csv(ifname, stringsAsFactors=TRUE)
  # S: study design, including units of study and their possible coordinates
  # X: covariates to be used as predictors
  # Y: species data
  S=SXY[,1:SXY.upper_index[1]]
  X=SXY[,(SXY.upper_index[1]+1):SXY.upper_index[2]]
  Y=SXY[,(SXY.upper_index[2]+1):SXY.upper_index[3]]
  
  # Check that the data looks as it should!
  View(S)
  View(X)
  View(Y)
  # check that community data are numeric and have finite numbers. If the script
  # writes "Y looks OK", you are ok.
  if (is.numeric(as.matrix(Y)) || is.logical(as.matrix(Y)) && is.finite(sum(Y, na.rm=TRUE))) {
    print("Y looks OK")
  } else {
    print("Y should be numeric and have finite values")	}
  # Check that the stydy design data do not have missing values (they are allowed for Y but not S, X, P or Tr)
  if (any(is.na(S))) {
    print("S has NA values - not allowed for")
  } else {
    print("S looks ok")	}
  # Check that the covariate data do not have missing values (they are allowed for Y but not S, X, P or Tr)
  if (any(is.na(X))) {
    print("X has NA values - not allowed for")
  } else {
    print("X looks ok")	}
  
  
  # READING IN TP: traits (T) and/or phylogenetic information in table format (P)
  if(is.TP){
    # Read in the species names as rownames, not as a column of the matrix
    TP = read.csv("TP.csv", stringsAsFactors=TRUE,row.names = 1)
    # The script below checks if the species names in TP are identical and in the same order as in Y
    # If the script prints "species names in TP and SXY match", you are ok.
    # If it says that they do not match, you need to modify the files so that they match 
    if(all(rownames(TP)==colnames(Y))) {
      print("species names in TP and SXY match")
    } else{
      print("species names in TP and SXY do not match")
    }
    # Modify the next two lines to split your TP file to components that relate to
    # Tr: species traits (note that T is a reserved word in R and that's why we use Tr)
    # P: phylogenetic information given by taxonomical levels, e.g. order, family, genus, species
    # If you don't have trait data, indicate this by Tr=NULL. 
    # If TP does not have phylogenetic data (because you don't have such data at all, or because
    # it is given in tree-format, like is the case in this example), indicate this with P=NULL 
    Tr = TP[,1:2]
    P = NULL
    # Check that the data looks as it should!
    View(Tr)
    View(P)
    # Check that the Tr data do not have missing values (they are allowed for Y but not S, X, P or Tr)
    if (any(is.na(Tr))) {
      print("Tr has NA values - not allowed for")
    } else {
      print("Tr looks ok")	}
    # Check that the phylogenetic/taxonomic data do not have missing values (they are allowed for Y but not S, X, P or Tr)
    if (any(is.na(P))) {
      print("P has NA values - not allowed for")
    } else {
      print("P looks ok")	}
  }
  
  # READING IN P: phylogenetic information in tree format (P)
  # we use ape package for trees, and P.tre must be in a format that ape understands
  if(is.P){
    # Read in the phylogenetic tree using read.tree from ape
    library(ape)
    P = read.tree("P.tre")
    # When you look at P (e.g. write P and press enter),
    # you should see that it is a phylogenetic tree which
    # is rooted and includes branch lengths and tip labels
    # The script below checks if the species names in P are identical (but not necessarily in the same order) as in Y
    # If the script prints "species names in P and SXY match", you are ok.
    # If it says that they do not match, you need to modify the files so that they match 
    if(all(sort(P$tip.label) == sort(colnames(Y)))){
      print("species names in P and SXY match")
    } else{
      print("species names in P and SXY do not match")
    }
    # Check that the data looks as it should!
    plot(P, cex=0.5)
  }
  
  Y = as.matrix(Y)
  save(S,X,Y,file=ofname)
  print(paste("Dataset saved in", ofname))
}