-
Notifications
You must be signed in to change notification settings - Fork 0
/
GetModelMetadata.R
134 lines (107 loc) · 6.39 KB
/
GetModelMetadata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
GetModelMetadata <- function(formulaOrTerms, rPackage, trainingData, xVar, yVar, zVar, mVar, coordinateSystem, model=NULL)
{
# Initialize the list we will populate and return.
modelMetadata <- list()
modelMetadata$rPackage <- rPackage
modelMetadata$xVar <- xVar
modelMetadata$yVar <- yVar
modelMetadata$zVar <- zVar
modelMetadata$mVar <- mVar
modelMetadata$coordinateSystem <- coordinateSystem
# Build a model.frame from the formulaOrTerms and trainingData. From this,
# we can extract all the relevant metadata we'll need later.
modelFrame <- model.frame(formulaOrTerms, trainingData)
modelTerms <- attr(modelFrame, "terms")
dataClasses <- attr(modelTerms, "dataClasses")
# Extract the response term and its characteristics.
if (attr(modelTerms, "response") < 1)
stop("The model formula must include a response term.", call.=FALSE)
modelMetadata$responseExpr <- names(dataClasses[attr(modelTerms, "response")])
modelMetadata$responseDataClass <- as.character(dataClasses[attr(modelTerms, "response")])
modelMetadata$responseVar <- all.vars(parse(text=modelMetadata$responseExpr))
if (rPackage == "rpart" && model$method == "poisson" && modelMetadata$responseDataClass == "nmatrix.2" && length(modelMetadata$responseVar) == 2) # Special case for rpart poisson models specified cbind(A, B) ~ ...
{
modelMetadata$offsetExprs <- modelMetadata$responseVar[1]
modelMetadata$offsetDataClasses <- "numeric"
modelMetadata$offsetVars <- modelMetadata$responseVar[1]
modelMetadata$responseExpr <- modelMetadata$responseVar[2]
modelMetadata$responseDataClass <- "numeric"
modelMetadata$responseVar <- modelMetadata$responseVar[2]
}
else if (length(modelMetadata$responseVar) != 1)
stop("The response term of the model formula must reference exactly one variable. Response terms that reference more than one variable are not allowed.", call.=FALSE)
modelMetadata$responseVarClass <- class(trainingData[[modelMetadata$responseVar]])
# Extract the offset terms and their characteristics.
if (!is.null(attr(modelTerms, "offset")))
{
modelMetadata$offsetExprs <- names(dataClasses[attr(modelTerms, "offset")])
modelMetadata$offsetDataClasses <- as.character(dataClasses[attr(modelTerms, "offset")])
modelMetadata$offsetVars <- all.vars(parse(text=modelMetadata$offsetExprs))
}
# Extract the predictor terms and their characteristics.
allPredictors <- dataClasses[-c(attr(modelTerms, "response"), attr(modelTerms, "offset"))]
modelMetadata$predictorExprs <- names(allPredictors)
modelMetadata$predictorDataClasses <- as.character(allPredictors)
modelMetadata$predictorVars <- all.vars(parse(text=modelMetadata$predictorExprs))
# For all numeric variables used in the formula, and the formula terms
# themselves, extract their min and max values; for factors, their levels;
# for logicals, their unique values.
modelMetadata$minNumericValues <- list()
modelMetadata$maxNumericValues <- list()
modelMetadata$factorLevels <- list()
modelMetadata$logicalValues <- list()
for (variable in names(trainingData))
if (class(trainingData[[variable]]) %in% c("numeric", "integer", "character"))
{
modelMetadata$minNumericValues[[variable]] <- min(trainingData[[variable]])
modelMetadata$maxNumericValues[[variable]] <- max(trainingData[[variable]])
}
else if (class(trainingData[[variable]]) == "factor")
modelMetadata$factorLevels[[variable]] <- levels(trainingData[[variable]])
else if (class(trainingData[[variable]]) == "logical")
modelMetadata$logicalValues[[variable]] <- unique(trainingData[[variable]])
else
stop(sprintf("The variable \"%s\" in the model formula has an unsupported data class \"%s\". Please contact the MGET development team for assistance.", variable, class(trainingData[[variable]])), call.=FALSE)
for (i in 1:length(dataClasses))
{
termDataClass <- dataClasses[i]
termExpression <- names(dataClasses[i])
if (termDataClass %in% c("numeric", "nmatrix.1")) # nmatrix.1 is the class of object returned by the lo() smoother in the gam package (not the mgcv package) when just one variable is smoothed
{
modelMetadata$minNumericValues[[termExpression]] <- min(modelFrame[[termExpression]])
modelMetadata$maxNumericValues[[termExpression]] <- max(modelFrame[[termExpression]])
}
else if (termDataClass == "nmatrix.2")
{
# nmatrix.2 is the class of object returned by the lo() smoother in
# the gam package (not the mgcv package) when two variables are
# smoothed. In this case, do nothing. We do not fully support lo()
# with more than one variable but will do the best we can to allow
# predictions.
}
else if (termDataClass == "factor")
modelMetadata$factorLevels[[termExpression]] <- levels(modelFrame[[termExpression]])
else if (termDataClass == "logical")
modelMetadata$logicalValues[[termExpression]] <- unique(modelFrame[[termExpression]])
else
stop(sprintf("The model formula term \"%s\" has an unsupported data class \"%s\". Please contact the MGET development team for assistance.", termExpression, termDataClass), call.=FALSE)
}
# Do the same thing
# For the caller's convenience, create a vector listing all model variables.
modelMetadata$allVars <- c(modelMetadata$responseVar, modelMetadata$offsetVars, modelMetadata$predictorVars)
# Determine what kind of model it is.
if (!is.null(model))
{
modelMetadata$isBinaryClassification <- (!is.null(rPackage) && rPackage %in% c("mgcv", "gam") || is.null(rPackage)) && model$family$family %in% c("binomial", "quasibinomial") ||
!is.null(rPackage) && modelMetadata$responseDataClass == "factor" && length(modelMetadata$factorLevels[[modelMetadata$responseExpr]]) == 2 &&
(rPackage == "rpart" && model$method == "class" ||
rPackage == "randomForest" && model$type == "classification" ||
rPackage == "party" && (model@responses@is_nominal[1] || model@responses@is_ordinal[1]))
modelMetadata$isNonBinaryClassification <- !modelMetadata$isBinaryClassification && !is.null(rPackage) &&
(rPackage == "rpart" && model$method == "class" ||
rPackage == "randomForest" && model$type == "classification" ||
rPackage == "party" && (model@responses@is_nominal[1] || model@responses@is_ordinal[1]))
}
# Return successfully.
return(modelMetadata)
}