峰值注释有四个步骤

本快速入门旨在介绍新版 ChIPpeakAnno 中新实现的四个函数:toGRanges、annoGO、annotatePeakInBatch 和 addGeneIDs。借助这些包装函数,ChIP-Seq 峰的注释可简化为四个主要步骤:

1. 使用 toGRanges 读取峰数据
2. 使用 toGRanges 生成注释数据
3. 使用 annotatePeakInBatch 注释峰
4. 使用 addGeneIDs 添加附加信息

大多数情况下,用户可以使用这些函数的参数的默认设置。这使得注释管道简单易用。

注意:注释数据的版本必须与比对(mapping)所用的基因组版本一致,因为不同基因组版本之间的坐标可能不同。例如,如果比对时使用的是 Mus_musculus.v103,那么注释时最好也使用 EnsDb.Mmusculus.v103。关于如何准备注释数据的详细信息,请参见 ?getAnnotation。

用例1:使用 EnsDb 分四步注释峰数据

步骤1:使用 toGRanges 将峰数据转换为 GRanges

## First, load the ChIPpeakAnno package.
library(ChIPpeakAnno)

## Locate the example broadPeak file shipped with ChIPpeakAnno and read it
## into a GRanges object.
path <- system.file("extdata", "Tead4.broadPeak", package = "ChIPpeakAnno")
peaks <- toGRanges(path, format = "broadPeak")
peaks[1:2]
##             seqnames        ranges strand |     score signalValue    pValue
##                <Rle>     <IRanges>  <Rle> | <integer>   <numeric> <numeric>
##   peak12338     chr2 175473-176697      * |       206      668.42        -1
##   peak12339     chr2 246412-246950      * |        31      100.23        -1
##                qValue
##             <numeric>
##   peak12338        -1
##   peak12339        -1
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

步骤2:使用 toGRanges 准备注释数据

## Build the annotation data (as GRanges) from the Ensembl v75 human database.
library(EnsDb.Hsapiens.v75)
annoData <- toGRanges(EnsDb.Hsapiens.v75)
annoData[1:2]
##                   seqnames      ranges strand |   gene_name
##                      <Rle>   <IRanges>  <Rle> | <character>
##   ENSG00000223972     chr1 11869-14412      + |     DDX11L1
##   ENSG00000227232     chr1 14363-29806      - |      WASH7P
##   -------
##   seqinfo: 273 sequences (1 circular) from 2 genomes (hg19, GRCh37)

步骤3:使用 annotatePeakInBatch 注释峰

## Keep the seqnames of the peaks in the same style as the annotation data.
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)

## Annotate each peak with its nearest TSS.
anno <- annotatePeakInBatch(peaks, AnnotationData = annoData)
anno[1:2]
##                             seqnames        ranges strand |     score
##                                <Rle>     <IRanges>  <Rle> | <integer>
##   peak12338.ENSG00000227061     chr2 175473-176697      * |       206
##   peak12339.ENSG00000143727     chr2 246412-246950      * |        31
##                             signalValue    pValue    qValue        peak
##                               <numeric> <numeric> <numeric> <character>
##   peak12338.ENSG00000227061      668.42        -1        -1   peak12338
##   peak12339.ENSG00000143727      100.23        -1        -1   peak12339
##                                     feature start_position end_position
##                                 <character>      <integer>    <integer>
##   peak12338.ENSG00000227061 ENSG00000227061         197569       202605
##   peak12339.ENSG00000143727 ENSG00000143727         264140       278283
##                             feature_strand insideFeature distancetoFeature
##                                <character>   <character>         <numeric>
##   peak12338.ENSG00000227061              +      upstream            -22096
##   peak12339.ENSG00000143727              +      upstream            -17728
##                             shortestDistance fromOverlappingOrNearest
##                                    <integer>              <character>
##   peak12338.ENSG00000227061            20872          NearestLocation
##   peak12339.ENSG00000143727            17190          NearestLocation
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths
可以用饼图展示峰的重叠特征:
pie1(table(anno$insideFeature))

步骤4:使用 addGeneIDs 添加附加注释

## Add gene symbols to the annotated peaks, using the Ensembl gene IDs stored
## in the `feature` column as the lookup key.
library(org.Hs.eg.db)
anno <- addGeneIDs(anno,
                   orgAnn = "org.Hs.eg.db",
                   feature_id_type = "ensembl_gene_id",
                   IDs2Add = c("symbol"))
head(anno)
##                             seqnames        ranges strand |     score
##                                <Rle>     <IRanges>  <Rle> | <integer>
##   peak12338.ENSG00000227061     chr2 175473-176697      * |       206
##   peak12339.ENSG00000143727     chr2 246412-246950      * |        31
##   peak12340.ENSG00000143727     chr2 249352-250233      * |       195
##   peak12341.ENSG00000143727     chr2 259896-261404      * |       510
##   peak12342.ENSG00000143727     chr2 261931-263148      * |        48
##   peak12343.ENSG00000236856     chr2 378232-378871      * |       132
##                             signalValue    pValue    qValue        peak
##                               <numeric> <numeric> <numeric> <character>
##   peak12338.ENSG00000227061      668.42        -1        -1   peak12338
##   peak12339.ENSG00000143727      100.23        -1        -1   peak12339
##   peak12340.ENSG00000143727      630.65        -1        -1   peak12340
##   peak12341.ENSG00000143727     1649.19        -1        -1   peak12341
##   peak12342.ENSG00000143727      155.56        -1        -1   peak12342
##   peak12343.ENSG00000236856      426.52        -1        -1   peak12343
##                                     feature start_position end_position
##                                 <character>      <integer>    <integer>
##   peak12338.ENSG00000227061 ENSG00000227061         197569       202605
##   peak12339.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12340.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12341.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12342.ENSG00000143727 ENSG00000143727         264140       278283
##   peak12343.ENSG00000236856 ENSG00000236856         388412       416885
##                             feature_strand insideFeature distancetoFeature
##                                <character>   <character>         <numeric>
##   peak12338.ENSG00000227061              +      upstream            -22096
##   peak12339.ENSG00000143727              +      upstream            -17728
##   peak12340.ENSG00000143727              +      upstream            -14788
##   peak12341.ENSG00000143727              +      upstream             -4244
##   peak12342.ENSG00000143727              +      upstream             -2209
##   peak12343.ENSG00000236856              +      upstream            -10180
##                             shortestDistance fromOverlappingOrNearest
##                                    <integer>              <character>
##   peak12338.ENSG00000227061            20872          NearestLocation
##   peak12339.ENSG00000143727            17190          NearestLocation
##   peak12340.ENSG00000143727            13907          NearestLocation
##   peak12341.ENSG00000143727             2736          NearestLocation
##   peak12342.ENSG00000143727              992          NearestLocation
##   peak12343.ENSG00000236856             9541          NearestLocation
##                                  symbol
##                             <character>
##   peak12338.ENSG00000227061        <NA>
##   peak12339.ENSG00000143727        ACP1
##   peak12340.ENSG00000143727        ACP1
##   peak12341.ENSG00000143727        ACP1
##   peak12342.ENSG00000143727        ACP1
##   peak12343.ENSG00000236856        <NA>
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

用例2:使用 TxDb 提供的启动子注释峰

本节演示如何使用 toGRanges 基于 TxDb 构建新的注释数据,并用它注释用例1中相同的峰数据。

## Build the annotation data (as GRanges) from the UCSC hg19 knownGene TxDb.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene)
annoData[1:2]
##      seqnames            ranges strand
##         <Rle>         <IRanges>  <Rle>
##    1    chr19 58858172-58874214      -
##   10     chr8 18248755-18258723      +
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
## Keep the seqnames of the peaks in the same style as the new annotation data.
seqlevelsStyle(peaks) <- seqlevelsStyle(annoData)

相同的annotatePeakInBatch函数用于使用刚刚创建的注释数据注释峰值。这次我们想要的是基因体内TSS上游2kb以内和下游300bp以内的峰值。

## Annotate peaks that overlap the promoter window around each TSS
## (2000 bp upstream to 300 bp downstream).
anno <- annotatePeakInBatch(peaks,
                            AnnotationData = annoData,
                            output = "overlapping",
                            FeatureLocForDistance = "TSS",
                            bindingRegion = c(-2000, 300))

## Look up the gene symbol for each annotated feature ID.
anno$symbol <- xget(anno$feature, org.Hs.egSYMBOL)
head(anno)
GRanges对象有6个范围和12个元数据列:# # seqnames范围链|得分signalValue pValue # # < Rle > < IRanges > < Rle > | <整数> <数字> <数字> # # peak12342 chr2 261931 - 263148 * 48 155.56 - 1 # # | peak12345 chr2 677052 - 677862 * 103 (|) 334.74 - 1 # # peak12348 chr2 3380709 - 3380709 * 110 (|) 357.22 - 1 # # peak12348 chr2 3380709 - 3380709 * 110 (|) 357.22 - 1 # # peak12349 chr2 3383131 - 3383131 * 199 (|) 645.56 - 1 # # peak12349 chr2 3383131 - 3383131 * 199 (|) 645.56 - 1 # # qValue峰值特征特性。范围的功能。链## <数字> <字符> <字符>   ## peak12342 -1 peak12348 129787 667973-677439 - ## peak12348 -1 peak12348 51112 3383446-3488857 + ## peak12349 -1 peak12349 7260 3192741- 3488853 - ## peak12349 -1 peak12349 51112 3383446-3488857 + ##距离insideFeature distanceToSite符号## <整数> <字符> <整数> <字符> ## peak12342 0overlapStart 0 TMEM18 ## peak12348 0 overlapStart 0 EIPR1 ## peak12348 1130 upstream 1130 TRAPPC12 ## peak12349 1477 upstream 1477 EIPR1 ## peak12349 0 overlapStart 0 TRAPPC12 ## ------- # seqinfo: 1个来自未指定基因组的序列;没有seqlengths

用例3:用两侧 5 kb 范围内最近的转录起始位点(TSS)注释峰

本节演示 ChIPpeakAnno 中注释函数的灵活性。无需构建新的注释数据,借助 annoPeaks 函数的参数 bindingTypes 和 bindingRegion,即可找到 TSS 上游和下游 5000 bp 范围内的峰;该范围可以是用户自定义的启动子区域。

## Annotate each peak with the nearest promoters on both sides, using a
## binding region of 5000 bp upstream to 500 bp downstream.
anno <- annotatePeakInBatch(peaks,
                            AnnotationData = annoData,
                            output = "nearestBiDirectionalPromoters",
                            bindingRegion = c(-5000, 500))

## Look up the gene symbol for each annotated feature ID.
anno$symbol <- xget(anno$feature, org.Hs.egSYMBOL)

## Show the annotations found for a single peak.
anno[anno$peak == "peak12725"]
##             seqnames            ranges strand |     score signalValue    pValue
##                <Rle>         <IRanges>  <Rle> | <integer>   <numeric> <numeric>
##   peak12725     chr2 28112981-28113476      * |        34      110.72        -1
##   peak12725     chr2 28112981-28113476      * |        34      110.72        -1
##                qValue        peak     feature    feature.ranges feature.strand
##             <numeric> <character> <character>         <IRanges>          <Rle>
##   peak12725        -1   peak12725        9577 28113482-28561767              +
##   peak12725        -1   peak12725       64080 28004266-28113223              -
##              distance insideFeature distanceToSite      symbol
##             <integer>   <character>      <integer> <character>
##   peak12725         5      upstream              5      BABAM2
##   peak12725         0  overlapStart              0        RBKS
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

注释后的峰可以用我们小组开发的 R/Bioconductor 软件包 trackViewer 进行可视化。

## Visualise the coverage of one annotated peak together with the gene models
## around it.
library(trackViewer)

## `peak` is the peak itself; `gr` is the viewing window, extended by 5000 bp
## on each side.
gr <- peak <- peaks["peak12725"]
start(gr) <- start(gr) - 5000
end(gr) <- end(gr) + 5000

if (.Platform$OS.type != "windows") {
  ## Import the coverage signal for the peak from the example bigWig file.
  peak12725 <- importScore(file = system.file("extdata", "Tead4.bigWig",
                                              package = "ChIPpeakAnno"),
                           ranges = peak, format = "BigWig")
} else {
  ## rtracklayer can not import bigWig files on Windows, so build the track
  ## from the coverage list stored next to the peak file.
  ## NOTE(review): load() is expected to create `cvglists` -- confirm.
  load(file.path(dirname(path), "cvglist.rds"))
  peak12725 <- Views(cvglists[["Tead4"]][[as.character(seqnames(peak))]],
                     start(peak),
                     end(peak))
  peak12725 <- viewApply(peak12725, as.numeric)
  ## Expand the peak into one 1-bp range per position and attach the
  ## per-base coverage as the score column.
  tmp <- rep(peak, width(peak))
  width(tmp) <- 1
  tmp <- shift(tmp, shift = 0:(width(peak) - 1))
  mcols(tmp) <- peak12725
  colnames(mcols(tmp)) <- "score"
  peak12725 <- new("track", dat = tmp,
                   name = "peak12725",
                   type = "data",
                   format = "BED")
}

## Gene models inside the viewing window, labelled as "<name slot>:<id>".
trs <- geneModelFromTxdb(TxDb.Hsapiens.UCSC.hg19.knownGene,
                         org.Hs.eg.db, gr)
names(trs) <- paste(vapply(trs, function(.ele) .ele@name, character(1)),
                    names(trs), sep = ":")

## Lay out the signal track above the gene models and draw the window.
optSty <- optimizeStyle(trackList(peak12725, trs, heightDist = c(.3, .7)),
                        theme = "bw")
viewTracks(optSty$tracks, gr = gr, viewerStyle = optSty$style)