R包ClusterGVis学习分享

看文献时发现一个信息量很丰富的图片，于是研究了一下并进行复现，先放个成果图

实现思路为，先对表达量数据做kmeans聚类分析进行分组（可使用其他分析如mfuzz, wgcna），然后对每组基因进行富集分析（KEGG，GO），然后使用ClusterGVis 包进行结果整合可视化。

ClusterGVis github地址: https://github.com/junjunlab/ClusterGVis

可以将包下下来，使用devtools::install_local('./path') 命令进行安装

PS : 安装ClusterGVis包时需要注意各依赖项的版本问题，我安装的是ClusterGVis 0.1.1版本的，其中使用了ComplexHeatmap的anno_block函数的align_to参数，低版本的ComplexHeatmap没有该参数，后面更新到了2.20.0版本的ComplexHeatmap才能用

# Argument importation
option <- function() {
    disc <- "ClusterGVis"
    parser <- ArgumentParser(description = disc)
    parser $add_argument("--cluster", type = "character", required = T, help = '输入kmeans结果文件，第一列是基因，第二列是clusterID，第三列往后是各样本表达量数据.') parser$ add_argument("-g", "--group",
                        type = "character",
                        required = T,
                        help = "输入样本分组文件，第一列是样本，第二列是分组.")
    parser $add_argument('-o', '--outdir', type = "character", default = "./", required = F, help = '输出目录. [Optional, default: %(default)s]') parser$ add_argument('--focus',
                        type = "character",
                        required = F,
                        default = NULL,
                        help = '关注基因列表，一列数据，没有表头，可以不给，不给就不画关注基因. [Optional]')
    parser $add_argument('--enrichRes', type = "character", required = F, default = NULL, help = '富集结果文件，可以不给，不给就不画富集结果，必须包含三列数据，表头为cluster，Description，P-value.') parser$ add_argument('--enrichTop',
                        type = "integer",
                        required = F,
                        default = 5,
                        help = '画富集结果时选用top个数画图，太多了会很挤，显示不全. [Optional, default: %(default)s]')
    if (length(commandArgs(T)) == 0) {
        parser $print_help() quit(status = 1) } args <- parser$ parse_args()
    return(args)
}

library(argparse)
# Get command line options
args <- option()

Col_L15 <- c("#8dd3c7","#ffffb3","#bebada","#fb8072","#80b1d3","#fdb462","#b3de69","#fccde5",
            "#d9d9d9","#bc80bd","#ccebc5","#ffed6f","#928262","#0067A5","#4DBF4A","#fb9a99","#b2df8a")

# ComplexHeatmap 版本(2.4.3)低了用不了(anno_block 没有align_to参数)
library(ClusterGVis)

options(scipen = 100, stringsAsFactors = F)
rawdata <- read.table(args $cluster, header = T, sep = '\t', quote = '', check.names = F) #head(rawdata) #ID cluster A-1 A-2 A-3 B-1 B-2 B-3 C-1 C-2 C-3 #Chr11.1255 1 0.0675 0.0328 0.1165 2.9284 2.9944 2.5721 0.1141 0.0412 0.0432 #Chr11.342 1 0.2846 0.083 0.2947 0.9262 1.1964 0.5811 0.1925 0.1739 0.2185 #Chr11.981 1 6.3054 8.2915 8.481 16.3796 16.2822 14.0238 4.2052 5.6431 3.8638 #Chr12.2476 1 1.4847 1.1787 0.3138 1.8628 0.3538 1.6086 0.7857 0.2592 0.504 colnames(rawdata)[1:2] <- c('gene', 'cluster') rownames(rawdata) <- rawdata$ gene
sampleid <- read.table(args $group, header = T, sep = '\t') #head(sampleid) #Sample Group #A-1 A #A-2 A #A-3 A #B-1 B #B-2 B outdir <- args$ outdir
if (!dir.exists(outdir)) {
    dir.create(outdir)
}

anadata <- rawdata[, c('gene', 'cluster', as.character(sampleid[, 1]))]

# zscore
anadata[, 3:ncol(anadata)] <- t(apply(anadata[, 3:ncol(anadata)], 1, function(x) (x - mean(x, na.rm = T)) / sd(x, na.rm = T)))

# 构造数据列表
pl <- list()
pl $wide.res <- anadata # lone.res是长数据，用来单独画折线图，这里需要 #pl$ long.res <- 'none' 
pl $type <- 'kmeans' pl$ geneMode <- 'none'
pl $geneType <- 'none' # 自己构造样本注释条，实现颜色自由，legend标题自由 colcolor <- Col_L15[1:length(unique(sampleid[, 2]))] names(colcolor) <- unique(sampleid[, 2]) #sampleColor <- colcolor[sampleid[, 2]] sampledf <- sampleid[, 'Group', drop = F] rownames(sampledf) <- sampleid[, 1] sampleanno <- ComplexHeatmap::HeatmapAnnotation(df = sampledf, col = list(Group = colcolor), gp = grid::gpar(col = 'white'), show_legend = TRUE, show_annotation_name = FALSE) # 构建画图参数列表，通过列表来给主函数传参，想加参数自己添加列表元素就行了，不用每次都用主函数带一堆参数 paramsList <- list(object = pl, plot.type = "both", column_names_rot= 60, show_row_dend = T, annnoblock.text = T, sample.group = sampleid[, 2], ctAnno.col = jjAnno::useMyCol("stallion", n = length(unique(rawdata$ cluster))), # 使用包默认颜色画行注释
                   #sample.col = sampleColor, 
                   HeatmapAnnotation = sampleanno,
                   show_parent_dend_line = F,
                   add.box = F,
                   add.point = T,
                   point.arg = c(19, 'orange', 'orange', 0.5),
                   add.line = T,
                   line.side = 'left',
                   #cluster.order = sort(unique(rawdata $cluster)), column_title =\'' # 不显示分组标题 ) # 获取感兴趣基因名 if (!is.null(args$ focus)) {
    markgene <- read.table(args $focus, header = F, sep = '\t') paramsList$ markGenes <- as.character(markgene[, 1])
}

# 传富集结果和富集画图参数
if (!is.null(args $enrichRes)) { enrichData <- read.table(args$ enrichRes, header = T, sep = '\t', quote = '', check.names = F)
    enrichData $cluster <- as.numeric(enrichData$ cluster)
    if (length(unique(enrichData $cluster)) != length(unique(rawdata$ cluster))) {
        stop('每个cluster必须都有富集结果才能画富集图, 没有就不给enrichRes参数，只画热图')
    }
    enrichRes <- data.frame()
    for (i in unique(sort(enrichData $cluster))) { clusterEnrichdata <- enrichData[enrichData$ cluster == i, ]
        if (nrow(clusterEnrichdata) > 0) {
            clusterEnrichdata <- clusterEnrichdata[order(clusterEnrichdata $`P-value`), ] clusterEnrichdata <- head(clusterEnrichdata, args$ enrichTop)
            enrichRes <- rbind(enrichRes, data.frame('id'= paste0('C', i), clusterEnrichdata[, c('Description', 'P-value')]))
        }
    }
    colnames(enrichRes) <- c('id', 'term', 'pval')
    # id列是C1,C1,C2,C2...

    paramsList $annoKegg.data <- enrichRes paramsList$ add.kegg.bar <- T
    paramsList $kegg.size <- 'pval' paramsList$ by.kegg <- 'anno_link'
    paramsList $annoKegg.mside <- 'right' paramsList$ textbar.pos <- c(0.8, 0.2) # 将富集柱状图的cluster标签放到右下角，避免遮挡图片

    clusterColor <- jjAnno::useMyCol("stallion", n = length(unique(enrichRes $id))) names(clusterColor) <- unique(enrichRes$ id)
    paramsList $kegg.col <- clusterColor[enrichRes$ id] # 设置富集的颜色和cluster注释颜色保持一致

}

p <- do.call(visCluster, paramsList) # 使用do.call函数将参数列表传给画图主函数

cairo_pdf(paste0(outdir, '/clusterGVis.pdf'), width = 18, height = 12)
print(p)
dev.off()

png(paste0(outdir, '/clusterGVis.png'), width = 18, height = 12, units = 'in', type = 'cairo', res = 300)
print(p)
dev.off()

该代码是外置传参的，将代码复制成plot.R，可用于linux系统批量画图，使用示例：

Rscript plot.R --cluster kmeans_cluster.txt -g group.txt -o results --enrichRes KEGG.data.txt --focus focus.id