-
Notifications
You must be signed in to change notification settings - Fork 0
/
ObserveTo84ms.Analysis.R
271 lines (245 loc) · 12.8 KB
/
ObserveTo84ms.Analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
library(tidyverse)
library(openxlsx)
library(readxl)
library(dplyr)
library(lubridate)
setwd("E:/Molecular Epidemiology/GLYCEMIA/GLY")
all <- read_excel("all.xlsx")
str(all)
model.3 <- readRDS("model.3.rds")
summary(model.3)
postprob(model.3)
####1.定位到治疗半年以上的数据####
#####1.1找到治疗半年以上的病载记录,去除半年以前的记录#
TreatMore6ms <- all %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
mutate(TimefromART.m = round(interval(start=ARTinitialDate, end=VLDate) / months(1), digits = 2)) %>%
mutate(ART6months = ifelse(!is.na(VL) & TimefromART.m > 6, cumsum(!is.na(VL) & TimefromART.m > 6),NA)) %>%
mutate(first_occurrence = if_else(any(ART6months == 1), which(ART6months == 1)[1], NA_integer_)) %>%
filter(is.na(first_occurrence) | row_number() >= first_occurrence) %>%
select(-first_occurrence) %>%
filter(!all(is.na(ART6months))) %>%
mutate(ART6months = ifelse(is.na(ART6months), -1, ART6months))
#对治疗半年以上只有一次记录的ID,若不是VF则去除,因为两次记录才能判定LLV和VS
ids.1record <- TreatMore6ms %>% group_by(ID) %>% filter(!any(ART6months == 2))
ids.1record.removed <- ids.1record %>% group_by(ID) %>% filter(any(ART6months == 1 & VL< 1000))
TreatMore6ms <- anti_join(TreatMore6ms, ids.1record.removed, by = "ID")
####2.VLgroup,分VS,LLV,VF组####
#####2.1提取出只有一条记录的ID,分组为VF#####
ids.1record.keep <- ids.1record %>% anti_join(ids.1record.removed, by = "ID")
ids.1record.keep <- ids.1record.keep %>% mutate(ART6months = ifelse(ART6months== -1, NA, ART6months))
ids.1record.keep <- ids.1record.keep %>% mutate(VLgroup = "VF")
#####2.2两条以上记录的,依据条件分VS,LLV,VF组#####
ids.more2record <- TreatMore6ms %>% anti_join(ids.1record.keep, by = "ID")
ids.2record <- ids.more2record %>% group_by(ID) %>% filter(ART6months %in% c(1, 2))
ids.2record <- ids.2record %>% group_by(ID) %>%
mutate(
VLgroup = case_when(
all(VL <= 50) ~ "VS",
first(VL) >= 1000 |
(first(VL) < 1000 & lead(VL) >= 1000) |
(first(VL) >= 200 & lead(VL) >= 200) ~ "VF",
TRUE ~ NA_character_
)) %>%
group_by(ID) %>%
mutate(VLgroup = max(VLgroup, na.rm = TRUE)) %>%
mutate(VLgroup = ifelse(is.na(VLgroup), "LLV", VLgroup))
ids.2record.select <- ids.2record %>% select(1, 11) %>% distinct(ID, .keep_all = TRUE)
ids.1record.keep.select <- ids.1record.keep %>% select(1, 11) %>% distinct(ID, .keep_all = TRUE)
ids.select.group <- bind_rows(ids.2record.select, ids.1record.keep.select)
VLgroup.vs.llv.vf <- TreatMore6ms %>% left_join(ids.select.group, by = "ID")
#####2.3计算入组时间#####
#第一次记录VL不是大于等于1000则删除,只有VF可以通过一次判断入组,其余都要观察两次,根据第二次值入组
VLgroup.vs.llv.vf.follow <- VLgroup.vs.llv.vf %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
mutate(row_number = row_number()) %>%
filter(!(VLgroup == "VF" & row_number == 1 & VL < 1000)) %>%
filter(!(VLgroup == "VS" & row_number == 1)) %>%
filter(!(VLgroup == "LLV" & row_number == 1)) %>%
select(-row_number)%>%
mutate(StartfollowDate = first(VLDate[!is.na(VLDate)]))%>%
mutate(FollowTime.m = round(interval(start=StartfollowDate, end=TestingDate) / months(1), digits = 2)) %>%
mutate(ART6months = ifelse(ART6months== -1, NA, ART6months)) %>%
filter(FollowTime.m >= 0)
write.xlsx(VLgroup.vs.llv.vf.follow, "VLgroup.vs.llv.vf.follow.xlsx")
#####2.4去除入组为VF的数据#####
VLgroup.vs.llv.follow <- VLgroup.vs.llv.vf.follow %>% filter(VLgroup!="VF")
write.xlsx(VLgroup.vs.llv.follow, "VLgroup.vs.llv.follow.xlsx")
####3.入组后开始观察####
#####3.1血糖#####
#去除血糖缺失值
Gly <- VLgroup.vs.llv.follow %>% filter(!is.na(Gly))
#识别基线血糖异常的ID
abnormalGLY <- Gly %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
slice(1) %>% filter(Gly >= 7 | Gly <= 2.8) %>% pull(ID)
#剔除基线血糖异常的ID,对随访的血糖异常值进行剔除
Gly <- Gly %>% filter(!ID %in% abnormalGLY) %>% filter(Gly>1&Gly<33) %>% filter(n()>2)
write.xlsx(Gly, "Gly.xlsx")
# hist(data.3.3.3$Gly, breaks = 120)
# boxplot(data.3.3.3$Gly)
# summary(data.3.3.3$Gly)
#IQR <- Q3 - Q1
#upper.IQR <- Q3 + 1.5 * IQR
#hist(data.hlme$Gly, breaks = 30)
#####3.2检测时间间隔####
#去除检测时间间隔小于等于30天的数据,去除检测时间间隔小于等于180天往后的数据
Gly.TestInterval.30.180 <- Gly %>% group_by(ID) %>% arrange(ID, TestingDate) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
filter(row_number() == 1 & Testintervalday == 0 | Testintervalday > 30) %>% filter(n()>2) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
filter(row_number() == 1 & Testintervalday == 0 | Testintervalday > 30) %>% filter(n()>2) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
mutate(Testintervalday = c(0, as.numeric(diff(TestingDate)))) %>%
mutate(FollowStatus = ifelse(Testintervalday > 180, "N", "Y")) %>%
mutate(cumsum_status = cumsum(FollowStatus == "N")) %>%
filter(cumsum_status == 0) %>%
select(-cumsum_status) %>%
filter(n()>2)
write.xlsx(Gly.TestInterval.30.180, "Gly.TestInterval.30.180.xlsx")
#####3.3观察终点#####
#统一观察7年截至,不管有没有发生血糖异常
Gly.observe.to.84ms <- Gly.TestInterval.30.180 %>%
group_by(ID) %>% arrange(ID, TestingDate) %>%
filter(FollowTime.m <= 84) %>% filter(n()>2)
write.xlsx(Gly.observe.to.84ms, "Gly.observe.to.84ms.xlsx")
#观察7年后,连续出现2次血糖异常为1,未出现为0
To.84ms.glygroup <- Gly.observe.to.84ms %>% group_by(ID) %>% mutate(
Glygroup = ifelse(any(Gly >= 7 & lead(Gly, 1) >= 7, na.rm = TRUE), 1, 0))
To.84ms.glygroup.unique <- To.84ms.glygroup %>% distinct(ID, .keep_all = TRUE)
table(To.84ms.glygroup.unique$VLgroup)
table(To.84ms.glygroup.unique$Glygroup)
####4.轨迹组成员生存分析前数据准备####
#获取轨迹组内的ID的原本完整的TestingDate,因为筛选后的TestingDate不完整
ids.to84ms <- To.84ms.glygroup.unique %>% pull(ID)
To.84ms.survival <- all %>% filter(ID %in% ids.to84ms)
info <- To.84ms.glygroup.unique %>% select(1, 11, 12, 13, 16)
#合并死亡日期到轨迹组成员内
To.84ms.died <- left_join(To.84ms.survival, death, by = "ID")
#获取轨迹组内的ID的StartfollowDate等信息
To.84ms.died.survtime <- To.84ms.died %>% left_join(info, by= "ID")%>%
group_by(ID) %>% arrange(ID, TestingDate) %>%
mutate(SurvTime.m = ifelse(
is.na(DiedDate),
round(interval(start=StartfollowDate, end=max(TestingDate)) / months(1), digits = 2),
round(interval(start=StartfollowDate, end=DiedDate) / months(1), digits = 2)))
To.84ms.died.survtime.Status <- To.84ms.died.survtime %>%
mutate(SurvivalStatus=ifelse(is.na(DiedDate), 0, 1)) %>%
mutate(SurvivalStatus.a=ifelse(SurvivalStatus==1&SurvTime.m>=84, 0, SurvivalStatus))%>%
mutate(SurvTime.m.a = ifelse(SurvTime.m>=84,84,SurvTime.m))
#保留一条ID记录并简化数据框
To.84ms.surv.unique <- To.84ms.died.survtime.Status %>% group_by(ID) %>%
distinct(ID, .keep_all = TRUE) %>%
select(1, 13, 16, 17, 19,20)
#剔除记录错误的id(检测时间晚于死亡时间的ID)
ids.testlaterdeath <- To.84ms.surv.unique %>% filter(SurvTime.m.a<=0) %>% pull(ID)
To.84ms.surv.unique <- To.84ms.surv.unique %>% filter(!ID %in% ids.testlaterdeath)
summary(To.84ms.surv.unique$SurvTime.m.a)
table(To.84ms.surv.unique$SurvivalStatus.a)
To.84ms.glygroup <- To.84ms.glygroup %>% filter(!ID %in% ids.testlaterdeath)
To.84ms.glygroup.unique <- To.84ms.glygroup.unique %>% filter(!ID %in% ids.testlaterdeath)
write.xlsx(To.84ms.glygroup, "To.84ms.glygroup.xlsx")
write.xlsx(To.84ms.glygroup.unique, "To.84ms.glygroup.unique.xlsx")
####5.lcmm识别不同轨迹组####
library(lcmm)
#####5.1简化数据框,生成Subject#####
To.84ms.hlme <- To.84ms.glygroup %>% select( 1, 6, 11, 13, 16) %>%
group_by(ID) %>% arrange(ID, FollowTime.m)%>%
mutate(Subject = cur_group_id())
write.xlsx(To.84ms.hlme, "To.84ms.hlme.xlsx")
#####5.2计算模型#####
model.1 <- hlme(fixed = Gly ~ 1 + FollowTime.m + I(FollowTime.m^2),
random = ~ 1 + FollowTime.m,
ng = 1,
data = To.84ms.hlme,
subject = "Subject")
summary(model.1)
model.4 <- hlme(fixed = Gly ~ 1 + FollowTime.m + I(FollowTime.m^2),
mixture = ~ 1 + FollowTime.m + I(FollowTime.m^2),
random = ~ 1 + FollowTime.m,
ng = 4,
nwg = TRUE,
B = model.1,
idiag = FALSE,
maxiter = 500,
data = To.84ms.hlme,
subject = "Subject")
summary(model.4)
postprob(model.4)
#####5.3绘图#####
windows(width=10, height=8)
datnew <- data.frame(FollowTime.m = seq(0,84, length = 1000))
plotpred <- predictY(model.2, datnew, var.time ="FollowTime.m")
plot(plotpred, lty = 1, lwd = 5,marg=FALSE, shades=T,
xlab="FollowTime(Months)", ylab="GLY",
legend.loc = "topleft", cex=0.75)
#####5.4提取不同轨迹组的对象#####
class <- model.3$pprob[,1:2]
#####5.5合并hlme与class#####
To.84ms.hlme.class <- To.84ms.hlme %>% left_join(class, by = "Subject")
To.84ms.hlme.class.unique <- To.84ms.hlme.class %>% distinct(ID, .keep_all = TRUE)
#####5.6制作VLgroup与class的交叉表#####
table <- table(To.84ms.hlme.class.unique$VLgroup, To.84ms.hlme.class.unique$class)
percent <- prop.table(table, margin = 1) * 100
count.percent <- table
for (i in 1:nrow(table)) {
for (j in 1:ncol(table)) {
count.percent[i, j] <- paste0(table[i, j], " (", round(percent[i, j], 2), "%)")
}
}
count.percent <- as.data.frame.matrix(count.percent)
rowtotals <- rowSums(table)
coltotals <- colSums(table)
count.percent$Total <- rowtotals
count.percent <- rbind(count.percent, Total = c(coltotals, sum(coltotals)))
#count.percent <- count.percent %>% select(`1`, `0`,`Total`)
count.percent
####6.不同轨迹组内成员的死亡生存分析####
library(survival)
library(survminer)
#####6.1合并生存分析准备数据框与lcmm模型生成的class#####
To.84ms.surv.unique.class <- To.84ms.surv.unique %>%
left_join(To.84ms.hlme.class.unique.select, by = "ID")
#####6.2拟合生存函数#####
fit <- survfit(Surv(SurvTime.m.a,SurvivalStatus.a) ~ class,
data = To.84ms.surv.unique.class)
fit
saveRDS(fit, "survfit.rds")
#####6.3绘制不同class内成员的死亡生存曲线#####
ggsurvplot(
fit,
data = To.84ms.surv.unique.class,
#fun = "cumhaz",#将生存曲线转化为风险曲线
linetype = 1, # 根据分层更改线型c(0,1) or c("solid", "dashed") or "strata"
#surv.median.line = "hv", # 同时显示垂直和水平参考线 即增加中位生存时间 可选 "none"、"hv"、"h"、"v"
palette = "lancet" ,#定义颜色 可选调色板有 "hue" "grey","npg","aaas","lancet","jco", "ucscgb","uchicago","simpsons"和"rickandmorty".
xlab = "Time(months)",
ylab = "Survival Probability",
title = "",
legend = "right", # 指定图例位置 "top"(默认),"left","right","none","bottom"
legend.title = "",
legend.labs = c("class1", "class2"),
xlim = c(0,85),
ylim = c(0.75,1),
break.x.by = 12,
break.y.by = .05,
axes.offset = F, # 逻辑词,默认为TRUE。FALSE则生存曲线图的坐标轴从原点开始
#conf.int = TRUE,#增加置信区间
#conf.int.alpha = .3,# 数值,指定置信区间填充颜色的透明度; # 数值在0-1之间,0为完全透明,1为不透明
pval = TRUE, #log 秩检验
pval.size = 5,# 指定p值文本大小的数字,默认为 5
pval.coord = c(6,0.8),# 长度为2的数字向量,指定p值位置x、y,如pval.coord=c(x,y)
censor = T, # 逻辑词,默认为TRUE,在图上绘制删失点。
censor.shape = 3, # 数值或字符,用于指定删失点的形状;默认为"+"(3), 可选"|"(124)
censor.size = 1.5,# 指定删失点形状的大小,默认为4.5
# risk.table = "absolute", #"absolute"、"percentage"、"abs_pct"", #绝对人数、百分比和危险之中
# #risk.table.pos =in,#分线表放在图内
# risk.table.col = "strata", #按组更改风险表颜色
# risk.table.y.text.col = TRUE, #颜色风险表文本注释(按层)
# risk.table.y.text = FALSE, #在风险表图例中的文本注释中显示条形而不是名称
# risk.table.height = 0.2,
# risk.table.title="Number at the time",
# fontsize=4,#风险表字体
# ggtheme = theme_bw(), #使用主题自定义情节和风险表。
ggtheme = theme_survminer()+
theme(
panel.grid.major = element_line(color = "gray", size = 0.5), # 设置主要网格线
panel.grid.minor = element_line(color = "gray", linetype = "dotted", size = 0.25)) # 设置次要网格线
)