#Script to calculate and display a temporal phenome map using MIMIC2 data
#
#Author: Jeremy L. Warner M.D., M.S.
#        Assistant Professor of Medicine and Biomedical Informatics
#        jeremy.warner@vanderbilt.edu
#Version date: 07/12/2013
#
#Disclaimer: This script is open-source and you are free to distribute, modify, and
# use without limitations. The original author is not responsible for any errors
# associated with the script and offers no guarantees as to its functionality.
# Attribution is appreciated.
#
#Note: This is a script file and is not functionalized. Individual sections
# are delineated by headers.

#RPostgreSQL library for SQL database interface with MIMIC2
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")

#You must have MIMIC2 access. The password is taken from the MIMIC2_PASSWORD
# environment variable when it is set, so it does not have to be stored in
# clear text in this script; otherwise the placeholder below is used and must
# be replaced with the real password.
conn <- dbConnect(drv, dbname = "MIMIC2", user = "mimic2",
                  password = Sys.getenv("MIMIC2_PASSWORD", unset = "XXXXXX"))

#1 Run algorithm to determine length of hospital stay for all adult patients

#1.1 All adult admissions with admit and discharge dates
# One row per distinct (subject_id, hadm_id, admit_dt, disch_dt) for subjects
# that have at least one ICU stay in the 'adult' age group.
all.hadm <- dbGetQuery(conn, "SELECT distinct admissions.subject_id as subject_id, hadm_id, admit_dt, disch_dt
FROM mimic2v26.admissions
WHERE admissions.subject_id in
(SELECT subject_id
FROM mimic2v26.icustay_detail
WHERE icustay_age_group = 'adult')")

# Comma-separated list of all hadm_id values ("id1, id2, ..."), pasted into
# the SQL "in (...)" clause of a later query.
hadm <- paste(all.hadm$hadm_id, collapse = ", ")

#1.1.1 All adult admissions with admission and discharge dates and at least one POE order
# Returns one row per admission with the earliest order entry time (first_poe)
# and the latest order stop time (last_poe).
all.hadm.poe <- dbGetQuery(conn, "
SELECT hadm_id, min(enter_dt) as first_poe, max(stop_dt) as last_poe
FROM mimic2v26.admissions JOIN mimic2v26.poe_order using(hadm_id)
WHERE admissions.subject_id in
(SELECT subject_id
FROM mimic2v26.icustay_detail
WHERE icustay_age_group = 'adult')
GROUP BY admissions.subject_id, hadm_id, admit_dt, disch_dt")

#1.1.2 All adult admissions with at least one lab order within t -48 hours of admission
# `hadm` is the comma-separated hadm_id list built from all.hadm. An empty
# list would produce the invalid SQL "in ()", so stop with a clear message.
if (!nzchar(hadm)) {
  stop("No adult admissions found; cannot query lab events", call. = FALSE)
}

# Returns one row per admission with the earliest lab chart time (first_lab)
# among labs charted later than 48 hours before admit_dt.
all.hadm.lab <- dbGetQuery(conn, paste("
SELECT admissions.hadm_id as hadm_id, min(charttime) as first_lab
FROM mimic2v26.labevents JOIN mimic2v26.admissions using(subject_id)
WHERE admissions.hadm_id in (", hadm, ") AND charttime - admit_dt > interval '-48 hours'
GROUP BY admissions.hadm_id"))

#1.1.3 All adult discharges from the ICU
# One row per adult ICU stay; icu_out is the time the stay ended.
all.hadm.icu <- dbGetQuery(conn, "SELECT subject_id, hadm_id, icustay_outtime as icu_out
FROM mimic2v26.icustay_detail
WHERE icustay_age_group = 'adult'")

#1.2 The following code determines the first event for each hadm_id
# Inner merge: admissions without a row in all.hadm.lab are dropped here.
all.hadm <- merge(all.hadm, all.hadm.lab, by = "hadm_id")
# Left merge: admissions without POE orders keep NA in first_poe / last_poe.
all.hadm <- merge(all.hadm, all.hadm.poe, by = "hadm_id", all.x = TRUE)
all.hadm$first.event <- as.POSIXct(NA)

# Use the first lab when it was charted no later than 24 hours after admit_dt.
# which() keeps the index free of NA so the vector assignment cannot fail.
lab.ok <- which(difftime(all.hadm$first_lab, all.hadm$admit_dt, units = "h") <= 24)
all.hadm$first.event[lab.ok] <- all.hadm$first_lab[lab.ok]
foo <- which(is.na(all.hadm$first.event))

#1.2.1 Some hadm have enter dates more than 24 hours after admit_dt; fall back to admit_dt/disch_dt
# For admissions still unresolved, try the first POE order time, but discard
# it again when it is more than 24 hours after admit_dt.
all.hadm$first.event[foo] <- all.hadm$first_poe[foo]
late.poe <- foo[which(difftime(all.hadm$first.event[foo], all.hadm$admit_dt[foo],
                               units = "h") > 24)]
all.hadm$first.event[late.poe] <- NA

# Anything still missing defaults to the recorded admission time.
still.na <- which(is.na(all.hadm$first.event))
all.hadm$first.event[still.na] <- all.hadm$admit_dt[still.na]

#1.2.2 For patients who went from hospital to ICU and then discharged in the same day, use ICU discharge as out
#and first POE order as in.
# Pair every adult ICU stay with every admission of the same subject. Both
# inputs carry hadm_id, so merge() names them hadm_id.x (ICU table) and
# hadm_id.y (all.hadm).
all.temp <- merge(all.hadm.icu, all.hadm, by = "subject_id", all.x = TRUE)
all.temp$diff <- difftime(all.temp$icu_out, all.temp$disch_dt, units = "days")
# Keep pairs whose ICU discharge falls 0 to 1 days after disch_dt; which()
# also drops pairs where the difference is NA.
all.temp <- all.temp[which(all.temp$diff >= 0 & all.temp$diff <= 1), ]
all.temp$diff <- difftime(all.temp$icu_out, all.temp$first.event, units = "hours")
all.temp <- all.temp[order(all.temp$hadm_id.y, all.temp$icu_out), ]

#1.3 The following code determines the last event for each hadm_id
all.hadm$last.event <- as.POSIXct(NA)

#1.3.1 Adds the times to the all.hadm table
# Look the ICU discharge up by hadm_id instead of assuming that all.temp and
# all.hadm have matching row order and exactly one row per admission. When an
# admission has several qualifying ICU stays, the latest icu_out is used
# (all.temp is sorted by icu_out within hadm_id.y, so the last duplicate wins).
last.icu <- all.temp[!duplicated(all.temp$hadm_id.y, fromLast = TRUE), ]
icu.row <- match(all.hadm$hadm_id, last.icu$hadm_id.y)
has.icu <- which(!is.na(icu.row))
if (length(has.icu) > 0) {
  all.hadm$last.event[has.icu] <- last.icu$icu_out[icu.row[has.icu]]
}

#1.3.2 Some hadm have stop dates less than 24 hours after disch_dt
foo <- which(is.na(all.hadm$last.event))

#1.3.3 Use the last POE stop time for those admissions, unless it is more than
# 24 hours after disch_dt; such rows are set back to NA here.
all.hadm$last.event[foo] <- all.hadm$last_poe[foo]
late.poe <- foo[which(difftime(all.hadm$last.event[foo], all.hadm$disch_dt[foo],
                               units = "h") > 24)]
all.hadm$last.event[late.poe] <- NA

#1.3.4 Some have labs less than 24 hours after disch_dt
# Only admissions whose last event is still unknown are queried.
hadm.ids <- all.hadm$hadm_id[is.na(all.hadm$last.event)]
hadm <- paste(hadm.ids, collapse = ", ")

# Skip the query when nothing is unresolved: "in ()" is not valid SQL.
if (length(hadm.ids) > 0) {
  last.lab <- dbGetQuery(conn, paste("
SELECT admissions.hadm_id as hadm_id, max(charttime) as last_lab
FROM mimic2v26.labevents JOIN mimic2v26.admissions using(subject_id)
WHERE admissions.hadm_id in (", hadm, ") AND charttime - disch_dt <= interval '24 hours' AND charttime - disch_dt > interval '0 hours'
GROUP BY admissions.hadm_id"))

  if (nrow(last.lab) > 0) {
    last.lab <- last.lab[order(last.lab$hadm_id), ]
    # Assign by hadm_id lookup rather than by assuming identical row order
    lab.row <- match(last.lab$hadm_id, all.hadm$hadm_id)
    found <- which(!is.na(lab.row))
    all.hadm$last.event[lab.row[found]] <- last.lab$last_lab[found]
  }
}

#1.3.5 Some will just default to the disch_dt
still.na <- which(is.na(all.hadm$last.event))
all.hadm$last.event[still.na] <- all.hadm$disch_dt[still.na]

#1.4 Calculate the difference, in hours, between discharge and admit times
# Stored as a plain numeric (hours) from first.event to last.event.
all.hadm$diff <- as.numeric(difftime(all.hadm$last.event, all.hadm$first.event,
                                     units = "hours"))

#1.4.1 If the difference is a negative number, truncate to 0 hours
all.hadm$diff[which(all.hadm$diff < 0)] <- 0

#2 Specify bin size and determine interval cutoffs and subpopulations

#2.1 Determine the optimal number of bins with goal ~n patients per bin
#Example: n = 100
n <- 100
bin.n <- ceiling(nrow(all.hadm) / n)

#2.2 Prepare to create cutoffs
# bin.n evenly spaced probabilities give bin.n cut points, i.e. bin.n - 1
# time intervals between consecutive cut points.
t.cuts <- quantile(as.numeric(all.hadm$diff),
                   probs = seq(from = 0, to = 1, length.out = bin.n))

#2.3 Calculate subpopulations based on intervals
# sub.hadm[[a]] is a character vector of length 2 for interval a:
#   [1] the number of admissions in the interval
#   [2] their hadm_id values as a comma-separated list for SQL "in (...)"
# max(..., 0) and seq_len() keep this safe when there are fewer than two cut
# points (no intervals at all).
interval.n <- max(length(t.cuts) - 1, 0)
sub.hadm <- vector("list", interval.n)
for (a in seq_len(interval.n)) {
  # Intervals are (lower, upper], except the first one, which also includes
  # its lower bound so that admissions equal to the minimum cutoff are not
  # left out of every bin.
  above.lower <- if (a == 1) {
    all.hadm$diff >= t.cuts[a]
  } else {
    all.hadm$diff > t.cuts[a]
  }
  hadm <- all.hadm$hadm_id[which(above.lower & all.hadm$diff <= t.cuts[a + 1])]
  hadm.n <- length(hadm)
  hadm <- paste(hadm, collapse = ", ")
  sub.hadm[[a]] <- c(as.character(hadm.n), hadm)
}

#3 Obtain ICD-9 counts for the subpopulations

#3.1 Initialize table with all possible ICD-9 in MIMIC
# icd9.table ends up with one row per ICD-9 code and one count column
# (t_1, t_2, ...) per time interval.
icd9.table <- dbGetQuery(conn, "SELECT distinct code FROM mimic2v26.icd9")
for (a in seq_len(max(length(t.cuts) - 1, 0))) {
  temp.colname <- paste("t", a, sep = "_")
  if (as.integer(sub.hadm[[a]][1]) > 0) {
    temp.icd9 <- dbGetQuery(conn, paste(
      "SELECT code, count(*) AS", temp.colname,
      "FROM mimic2v26.icd9",
      "WHERE hadm_id in (",
      sub.hadm[[a]][2],
      ") GROUP BY code"))
    # Left merge keeps every code; codes absent from this interval become NA
    icd9.table <- merge(icd9.table, temp.icd9, by = "code", all.x = TRUE)
  } else {
    # An empty interval would produce the invalid SQL "in ()"; every code
    # simply has a count of 0 there.
    icd9.table[[temp.colname]] <- rep(0, nrow(icd9.table))
  }
}

#3.2 Replace all NA created in this process with 0's
icd9.table[is.na(icd9.table)] <- 0

#4 Determine p-values for the developed table with Fisher's Exact Test
# p.table has the same layout as icd9.table: column 1 holds the ICD-9 code
# and column j (j >= 2) holds the p-value for time interval j - 1.
bin.cols <- seq_len(ncol(icd9.table) - 1) + 1
n.total <- nrow(all.hadm)
p.table <- icd9.table
# Total count of each code over all intervals; drop = FALSE keeps a data
# frame even when there is only one interval column.
icd9.table.r <- rowSums(icd9.table[, bin.cols, drop = FALSE])
for (j in bin.cols) {
  # Number in subgroup
  n3 <- as.integer(sub.hadm[[j - 1]][[1]])
  if (n3 > 0) {
    p.table[[j]] <- vapply(seq_len(nrow(icd9.table)), function(i) {
      # 2x2 table, filled column-wise:
      #   column 1: code count inside this interval, code count outside it
      #   column 2: admissions inside this interval, admissions outside it
      fisher.test(matrix(c(icd9.table[i, j],
                           icd9.table.r[i] - icd9.table[i, j],
                           n3,
                           n.total - n3),
                         nrow = 2, ncol = 2))[["p.value"]]
    }, numeric(1))
  } else {
    # Empty interval: nothing to test, report p = 1 for every code
    p.table[[j]] <- rep(1, nrow(p.table))
  }
}

#4.1 Replace p=0 with p=1.5e-322 (smallest size reported in R)
# Applied to the p-value columns only, never to the code column.
for (j in bin.cols) {
  p.table[[j]][which(p.table[[j]] == 0)] <- 1.5e-322
}

#4.2 Adjust P-value for FDR on each bin individually
p.table.adj <- p.table
for (i in bin.cols) {
  p.table.adj[[i]] <- p.adjust(p.table[[i]], method = "BH")
}

#5 Data preparation for visualization

#5.1 Create numeric correlates to V-codes and E-codes
# code$codec is the ICD-9 code as text; code$coden is its numeric position on
# the x-axis of the map:
#   plain codes -> their own numeric value
#   V codes     -> numeric part + 1050
#   E codes     -> numeric part + 400
codec <- as.character(icd9.table$code)
is.v <- grepl("V", codec, fixed = TRUE)
is.e <- grepl("E", codec, fixed = TRUE)

coden <- rep(NA_real_, length(codec))
coden[!(is.v | is.e)] <- as.numeric(codec[!(is.v | is.e)])
coden[is.v] <- as.numeric(sub("V", "", codec[is.v], fixed = TRUE)) + 1050
# E codes are handled last so they take precedence, as in the original order
coden[is.e] <- as.numeric(sub("E", "", codec[is.e], fixed = TRUE)) + 400

code <- data.frame(codec = codec, coden = coden, stringsAsFactors = FALSE)
rm(codec, coden, is.v, is.e)

#5.2 Create an ICD-9 chapter color mapping table
# Each row gives the lower bound (on the numeric code scale used for the
# x-axis) from which a color applies; a code gets the color of the last row
# whose lower bound does not exceed it. `lower` must stay sorted ascending.
icd9.col <- data.frame(
  lower = c(0, 140, 240, 280, 290, 320, 360, 390, 460, 520, 580, 630, 680,
            710, 740, 760, 780, 800, 1025, 1175),
  color = c("magenta", "cyan", "blue", "green3", "black", "orange",
            "magenta", "cyan", "blue", "green3", "black", "orange",
            "magenta", "cyan", "blue", "green3", "black", "orange",
            "purple", "gray"),
  stringsAsFactors = FALSE)

#5.3 Create data frames containing localization, point sizing, and color information

#5.3.1 First data frame, all adjusted p not equal to 1
# One row per (ICD-9 code, time interval) pair:
#   codec / coden  code as text / as numeric x-position
#   y1 / y2        interval number / interval number + 1
#   p              adjusted p-value
#   t1 / t2        lower / upper cutoff of the interval, in days
#   color          plotting color looked up in icd9.col
bin.cols <- seq_len(ncol(p.table.adj) - 1) + 1
# Empty template so the result has the right columns even with no intervals
temp.df1 <- data.frame(codec = character(0), coden = numeric(0),
                       y1 = numeric(0), y2 = numeric(0), p = numeric(0),
                       stringsAsFactors = FALSE)
# Build one data frame per interval and bind them once, instead of growing
# temp.df1 one row at a time.
bin.rows <- lapply(bin.cols, function(i) {
  temp <- which(p.table.adj[, i] != 1)
  data.frame(codec = code$codec[temp], coden = code$coden[temp],
             y1 = rep(i - 1, length(temp)), y2 = rep(i, length(temp)),
             p = p.table.adj[temp, i], stringsAsFactors = FALSE)
})
temp.df1 <- do.call(rbind, c(list(temp.df1), bin.rows))

temp.df1 <- na.omit(temp.df1)
temp.df1 <- temp.df1[temp.df1$p < 0.99, ]
temp.df1 <- temp.df1[order(temp.df1$coden), ]
temp.df1$t1 <- round(t.cuts[temp.df1$y1] / 24, digits = 1)
temp.df1$t2 <- round(t.cuts[temp.df1$y2] / 24, digits = 1)
# findInterval() returns, for each coden, the index of the last lower bound
# that does not exceed it (icd9.col$lower is sorted ascending).
temp.df1$color <- icd9.col$color[findInterval(temp.df1$coden, icd9.col$lower)]

#5.3.2 Second data frame, only adjusted p <= 0.05
temp.df2 <- temp.df1[temp.df1$p <= 0.05, ]

#
#6 Temporal phenome maps based on temp.df1 and temp.df2
#
# If you wish to generate a TIFF, run the entire section.
# If you wish to view the graphic within the R environment, exclude the tiff() and dev.off() calls.
# If you wish to "zoom" into parts of the map, change the x.min, x.max, y.min, y.max parameters.

#6.1 Set the borders of the map

#6.1.1 Default x-axis values range from 0 to 1400
x.min <- 0
x.max <- 1400

#6.1.2 Default y-axis values range from 1st to last time interval
y.min <- 0
y.max <- dim(p.table)[2]

#6.2 Open pipe to TIFF
tiff(filename = paste0("temporal.phenome.", n, ".", Sys.Date(), ".tif"),
     width = 10, height = 8, res = 300, units = "in", compression = "lzw")

#6.3 Set the environment to show two graphs in one TIFF/window
par(mfrow = c(2, 1))

#6.4-6.5 Create the first (A) and second (B) temporal phenome maps
# Both maps are drawn by the same steps and differ only in their data and
# title: map "A" uses temp.df2 (adjusted p <= 0.05), map "B" uses temp.df1.
chapter.bounds <- c(140, 240, 280, 290, 320, 360, 390, 460, 520, 580, 630,
                    680, 710, 740, 760, 780, 800, 1025, 1175)
decile.probs <- seq(0, 1, 0.1)
panels <- list(list(data = temp.df2, title = "A"),
               list(data = temp.df1, title = "B"))

for (panel in panels) {
  map.df <- panel[["data"]]

  # Create temporal phenome map borders and labels, first. The single point
  # at -100 lies outside the plotting region, so this draws an empty canvas.
  plot(x = -100, xlim = c(x.min, x.max), xaxt = "n",
       ylim = c(y.min, y.max), yaxt = "n",
       xlab = "ICD-9-CM Code", ylab = "Time, days")

  # Add line segments: one vertical segment per row, at x = coden, with line
  # width driven by the adjusted p-value. Skipped when there is nothing to
  # draw so that an empty data frame does not raise an error.
  if (nrow(map.df) > 0) {
    segments(x0 = map.df$coden, y0 = map.df$y1, y1 = map.df$y2 + 1,
             lwd = 3 * log(-log10(map.df$p)), col = map.df$color)
  }

  # Place tick marks on the x-axis
  axis(side = 1, labels = c("V Codes", "E Codes"), at = c(1100, 1300))

  # Place tick marks on the y-axis: deciles of the cutoffs, shown in days
  axis(side = 2,
       labels = round(quantile(x = t.cuts, probs = decile.probs) / 24),
       at = quantile(x = seq_along(t.cuts), probs = decile.probs))

  # Next, add title
  mtext(text = panel[["title"]], font = 2, side = 3, adj = 0, padj = -1,
        cex = 1.5)

  # Add light gray dashed vertical lines to divide by ICD-9 chapter
  abline(v = chapter.bounds, lty = 2, col = "light gray")

  # Adds horizontal lines at 25th, median, 75th percentile
  # Default text placement will have to be changed if borders are changed
  foo <- quantile(seq_along(t.cuts),
                  probs = seq(from = 0.25, to = 0.75, length.out = 3))
  pct.labels <- c("25th percentile", "50th percentile", "75th percentile")
  for (k in seq_along(foo)) {
    abline(h = foo[k], lty = 2, col = "black")
    text(1200, foo[k], pct.labels[k], pos = 3)
  }
}

#6.6 Close the TIFF device so that the file is completely written to disk
invisible(dev.off())

#7 Output pertinent data into a RData file with a specified naming convention:
# temporal.phenome.[bin size].[date of creation].RData
save(sub.hadm, all.hadm, icd9.table, p.table,
     file = paste0("temporal.phenome.", n, ".", Sys.Date(), ".RData"))