内容

包:Pbase
作者:Laurent Gatto 与 Sebastian Gibb
最后编译:2019年10月29日星期二21:01:04
最后修改:2019-10-29 16:23:38

0.1简介

Pbase包的核心数据对象是Proteins实例,如下图所示。它们包含一组蛋白质(下图中为10个),由蛋白质序列(灰色框)和注释数据(左侧表格)组成。每个蛋白质都链接到一组感兴趣的范围,例如实验观察到的肽段的蛋白质结构域(同样以灰色显示),这些范围也带有各自的注释数据。该图还显示了用于访问不同数据槽的访问器,这些访问器记录在 ?Proteins 中。

Proteins对象由来自fasta文件的蛋白质序列填充,而肽段通常来自LC-MSMS实验。

下面使用的原始数据来自将10 fmol肽保留时间校准混合物(Peptide Retention Time Calibration Mixture)加入50 ng HeLa背景中、并在Thermo Orbitrap Q Exactive仪器上采集的样品。使用MSGF+搜索引擎,针对UniProt 2015_02版本中一组受限的高分人类蛋白质进行了检索。

0.2fasta数据库

library("Biostrings")
##加载所需的包:XVector
## ##附加包:“Biostrings”
## 以下对象在'package:base'中被屏蔽:## ## strsplit
fafile <- system.file("extdata/HUMAN_2015_02_selected.fasta", package = "Pbase")
fa <- readAAStringSet(fafile)
fa
一个长度为9的AAStringSet实例,命名为MPVTEKDLAEDAPWKKIQQNTF…VLAVKWGEEHIPGSPFHVTVP sp | O75369 | FLNB_HU……## [2] 3374 mspeghsrifeatagpnkpes…YTLSKDSLSNGVPSGRQAEFS sp | A4UGR9 | XIRP2_H……## [3] 2624 mfrrarlsvkpnvrpgvgargs…EATTVSEYFFNDIFIEVDETE sp | A6H8Y1 | BDP1_HU……## [4] 911 mvdyhaanqsyqygpssagnga…一个VPGALDYKSFSTALYGESDL sp|O43707|ACTN4_H... ## [5] 417 MSLSNKLTLDKLDVKGKRVVMR...GASLELLEGKVLPGVDALSNI sp|P00558|PGK1_HU... ## [6] 375 MDDDIAALVVDNGSGMCKAGFA...MWISKQEYDESGPSIVHRKCF sp|P60709|ACTB_HU... ## [7] 664 METPSQRRATRSGAQASSTPLS...RSYLLGNSSPRTQSPQNCSIM sp|P02545|LMNA_HU... ## [8] 364 MPYQYPALTPEQKKELSDIAHR...TPSGQAGAAASESLFVSNHAY sp|P04075|ALDOA_H... ## [9] 418 MARRKPEGSSFNMTHLSMAMAF...TPSGQAGAAASESLFVSNHAY sp|P04075-2|ALDOA...

0.3PSM数据

library("mzID")
idfile <- system.file("extdata/Thermo_Hela_PRTC_selected.mzid", package = "Pbase")
id <- flatten(mzID(idfile))
##读取Thermo_Hela_PRTC_selected.mzid…完成了!
dim(id)
## [1] 137 29
head(id)
## spectrumid扫描编号## 1 index=173 12256 ## 1.1 index=173 12256 ## 2 index=163 11860 ## 2.1 index=163 11860 ## 3 index=200 13408 ## 3.1 index=200 13408 ##频谱标题## 1 msLevel 2;retentionTime 2094.56706;scanNum 12256;precMz 1137.06665029649;precCharge 2 ## 1.1 ms2级;retentionTime 2094.56706;scanNum 12256;precMz 1137.06665029649;precCharge 2 ## 2 msLevel 2;retentionTime 2039.84424; scanNum 11860; precMz 1136.57450195803; precCharge 2 ## 2.1 msLevel 2; retentionTime 2039.84424; scanNum 11860; precMz 1136.57450195803; precCharge 2 ## 3 msLevel 2; retentionTime 2258.27868; scanNum 13408; precMz 703.038108542133; precCharge 3 ## 3.1 msLevel 2; retentionTime 2258.27868; scanNum 13408; precMz 703.038108542133; precCharge 3 ## acquisitionnum passthreshold rank calculatedmasstocharge ## 1 173 TRUE 1 1136.574 ## 1.1 173 TRUE 1 1136.574 ## 2 163 TRUE 1 1136.574 ## 2.1 163 TRUE 1 1136.574 ## 3 200 TRUE 1 703.037 ## 3.1 200 TRUE 1 703.037 ## experimentalmasstocharge chargestate ms-gf:denovoscore ms-gf:evalue ## 1 1137.0667 2 132 2.597097e-18 ## 1.1 1137.0667 2 132 2.597097e-18 ## 2 1136.5745 2 230 4.942664e-17 ## 2.1 1136.5745 2 230 4.942664e-17 ## 3 703.0381 3 145 4.080429e-10 ## 3.1 703.0381 3 145 4.080429e-10 ## ms-gf:rawscore ms-gf:specevalue assumeddissociationmethod isotopeerror ## 1 118 2.276758e-22 CID 1 ## 1.1 118 2.276758e-22 CID 1 ## 2 186 4.333009e-21 CID 0 ## 2.1 186 4.333009e-21 CID 0 ## 3 98 3.578068e-14 CID 0 ## 3.1 98 3.578068e-14 CID 0 ## isdecoy post pre end start accession length ## 1 FALSE C K 134 112 sp|P04075|ALDOA_HUMAN 364 ## 1.1 FALSE C K 188 166 sp|P04075-2|ALDOA_HUMAN 418 ## 2 FALSE C K 134 112 sp|P04075|ALDOA_HUMAN 364 ## 2.1 FALSE C K 188 166 sp|P04075-2|ALDOA_HUMAN 418 ## 3 FALSE Y K 173 154 sp|P04075|ALDOA_HUMAN 364 ## 3.1 FALSE Y K 227 208 sp|P04075-2|ALDOA_HUMAN 418 ## description ## 1 Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2 ## 1.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA ## 2 
Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2 ## 2.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA ## 3 Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2 ## 3.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA ## pepseq modified modification ## 1 GVVPLAGTNGETTTQGLDGLSER FALSE  ## 1.1 GVVPLAGTNGETTTQGLDGLSER FALSE  ## 2 GVVPLAGTNGETTTQGLDGLSER FALSE  ## 2.1 GVVPLAGTNGETTTQGLDGLSER FALSE  ## 3 IGEHTPSALAIMENANVLAR FALSE  ## 3.1 IGEHTPSALAIMENANVLAR FALSE  ## idFile spectrumFile ## 1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 1.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 2 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 2.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 3 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 3.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## databaseFile ## 1 HUMAN_2015_02_selected.fasta ## 1.1 HUMAN_2015_02_selected.fasta ## 2 HUMAN_2015_02_selected.fasta ## 2.1 HUMAN_2015_02_selected.fasta ## 3 HUMAN_2015_02_selected.fasta ## 3.1 HUMAN_2015_02_selected.fasta

0.4蛋白质对象

library("Pbase")
p <- Proteins(fafile)
p <- addIdentificationData(p, idfile)
##读取1个识别文件:
##   1. /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/Thermo_Hela_PRTC_selected.mzid
# #。
p
## S4类类型:Proteins ## 类版本:0.2 ## 创建时间:2019年10月29日星期二21:01:21 ## 蛋白质数量:9 ## 序列:## [1] A4UGR9 [2] A6H8Y1 … [8] P04075-2 [9] P60709 ## 蛋白质范围:## 肽

一个Proteins对象由一组蛋白质序列(可通过 aa 访问器获取)以及一组可选的肽特征组成;这些肽特征被映射为沿蛋白质的坐标,可通过 pranges 获取。实际的肽序列可以用 pfeatures 提取。蛋白质序列的名称可以用 seqnames 提取。

aa (p)
一个长度为9的AAStringSet实例,命名为## [1]3374Ytlskdslsngvpsgrqaefs a4ugr9 ## [2] 2624 mfrrarlsvkpnvrpgvgargs…Eattvseyffndifievdete a6h8y1 ## [3] 911 mvdyhaanqsyqygpssagnga…一个VPGALDYKSFSTALYGESDL O43707 ## [4] 2602 MPVTEKDLAEDAPWKKIQQNTF...VLAVKWGEEHIPGSPFHVTVP O75369 ## [5] 417 MSLSNKLTLDKLDVKGKRVVMR...GASLELLEGKVLPGVDALSNI P00558 ## [6] 664 METPSQRRATRSGAQASSTPLS...RSYLLGNSSPRTQSPQNCSIM P02545 ## [7] 364 MPYQYPALTPEQKKELSDIAHR...TPSGQAGAAASESLFVSNHAY P04075 ## [8] 418 MARRKPEGSSFNMTHLSMAMAF...TPSGQAGAAASESLFVSNHAY P04075-2 ## [9] 375 MDDDIAALVVDNGSGMCKAGFA...MWISKQEYDESGPSIVHRKCF P60709
seqnames (p)
## [1] "A4UGR9" "A6H8Y1" "O43707" "O75369" "P00558" "P02545" ## [7] "P04075" "P04075-2" "P60709"
pranges(p)
##数据框架与9行和1列##短数据帧##448-465,1284-1291,1120-1128,…## o43707 51-65,495-512,438-450,…## o75369 895-909,1746-1757,2563-2578,…## p00558 193-206,268-275,333-350,…## p02545 1-11,332-349,367-377,…## p04075 112-134,112-134,154-173,…## p04075-2 166-188,166-188,208-227,…184-196
pfeatures (p)
##长度为9的AAStringSetList ## [["A4UGR9"]] A4UGR9=QEITQNKSFFSSVKESQR…A4ugr9 = qeitqnksffssvk ## [[" a6h8y1 "]] a6h8y1 = edaeqvalevdlnqkkrr…## [[" o43707 "]] o43707 = qqrktftawcnshlr…O43707= vgweqllttiar ## [[" o75369 "]] o75369 = dldiidnydyshtvk…O75369= vqaqgpglkeaftnk ## [[" p00558 "]] p00558 = elnyfakalesper p00558 = dlmskaek…P00558= gtkalmdevvk ## [[" p02545 "]] p02545 = metpsqrratr…P02545= ratrsgaqasstplsptr ## [[" p04075 "]] p04075 = gvvplagtngetttqgldglser…## [[" p04075-2 "]] p04075-2 = gvvplagtngetttqgldglser…## [[" p60709 "]] p60709 = dltdylmkilter

Proteins实例还带有一个通用的元数据(metadata)列表。蛋白质序列注释和肽特征注释可以分别通过 acols 和 pcols 访问,二者均返回DataFrame实例。

metadata(p)
## $created ##[1]“2019年10月29日星期二21:01:21”
acols(p)
# # DataFrame 9行和12列# # DB AccessionNumber EntryName IsoformName # # < Rle > <人物> <人物> < Rle > # # A4UGR9 sp A4UGR9 XIRP2_HUMAN NA # # A6H8Y1 sp A6H8Y1 BDP1_HUMAN NA # # O43707 sp O43707 ACTN4_HUMAN NA # # O75369 sp O75369 FLNB_HUMAN NA # # P00558 sp P00558 PGK1_HUMAN NA # # P02545 sp P02545 LMNA_HUMAN NA # # P04075 sp P04075 ALDOA_HUMAN NA # # P04075-2 sp P04075-2 ALDOA_HUMAN 2 # # P60709 sp P60709 ACTB_HUMAN NA # # ProteinName OrganismName # # <人物> < Rle > # # A4UGR9鑫肌动蛋白结合重复蛋白2智人A6H8Y1转录因子TFIIIB成分B "同源物智人O43707 α -肌动蛋白-4智人O75369丝蛋白-B智人P00558磷酸甘油酸激酶1智人P02545 pre - lamin-A/C智人P04075二磷酸果糖醛缩酶A智人P04075-2二磷酸果糖醛缩酶A智人P60709肌动蛋白胞质1智人# # GeneName ProteinExistence SequenceVersion评论# # < Rle > < Rle > < Rle > < Rle > # # A4UGR9 XIRP2证据蛋白质二级NA # # A6H8Y1 BDP1证据蛋白质三级NA # # O43707 ACTN4证据蛋白质二级NA # # O75369 FLNB证据蛋白质二级NA # # P00558 PGK1证据蛋白质三级NA # # P02545 LMNA证据蛋白质一级NA # # P04075 ALDOA证据蛋白质二级NA # # P04075-2 ALDOA NA NA NA # # P60709 ACTB证据在蛋白质水平1 NA ##文件名##  ## A4UGR9 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected. txtfasta# # A6H8Y1 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta # # O43707 / tmp / RtmphAy6LL / Rinstf3345a8ba52 Pbase / extdata / HUMAN_2015_02_selected。fasta# # O75369 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P00558 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P02545 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P04075 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P04075-2 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta # # P60709 / tmp / RtmphAy6LL / Rinstf3345a8ba52 Pbase / extdata / HUMAN_2015_02_selected。fasta ## npep ##  ## A4UGR9 36 ## A6H8Y1 23 ## O43707 6 ## O75369 13 ## P00558 5 ## P02545 12 ## P04075 21 ## P04075-2 20 ## P60709 1
pcols (p)
##数据框架与9行和1列##短数据帧##448-465,1284-1291,1120-1128,…## o43707 51-65,495-512,438-450,…## o75369 895-909,1746-1757,2563-2578,…## p00558 193-206,268-275,333-350,…## p02545 1-11,332-349,367-377,…## p04075 112-134,112-134,154-173,…## p04075-2 166-188,166-188,208-227,…184-196

可以使用 [ 按索引或名称提取特定的蛋白质;蛋白质及其肽特征可以用默认的 plot 方法绘制。

seqnames (p)
## [1] "A4UGR9" "A6H8Y1" "O43707" "O75369" "P00558" "P02545" ## [7] "P04075" "P04075-2" "P60709"
plot(p[c(9)])

更多详情见 ?Proteins。上面生成的对象也可以直接通过 data(p) 获得。

0.5会话信息

sessionInfo ()
## R版本3.6.1(2019-07-05)##平台:x86_64-pc-linux-gnu(64位)##运行在Ubuntu 18.04.3 LTS ## ##矩阵产品:默认## BLAS: /home/biocbuild/bbs-3.10-bioc/R/lib/libRblas。所以## LAPACK: /home/biocbuild/bbs-3.10-bioc/R/lib/libRlapack。所以## ## locale: ## [1] LC_CTYPE=en_US。UTF-8 LC_NUMERIC= c# # [3] LC_TIME=en_US。UTF-8 LC_COLLATE= c# # [5] LC_MONETARY=en_US。utf - 8 LC_MESSAGES = en_US。UTF-8 ## [7] LC_PAPER=en_US。UTF-8 LC_NAME= c# # [9] LC_ADDRESS=C lc_phone = c# # [11] LC_MEASUREMENT=en_US。UTF-8 LC_IDENTIFICATION=C ## ##附加的基础包:## [1]grid stats4 parallel stats graphics grDevices utils ## [8] datasets methods base ## ##其他附加包:## [1]mzID_1.24.0 Biostrings_2.54.0 XVector_0.26.0 Gviz_1.30.0 GenomicRanges_1.38.0 ## [7] GenomeInfoDb_1.22.0 IRanges_2.20.0 S4Vectors_0.24.0 ## [10] Rcpp_1.0.2 BiocGenerics_0.32.0 BiocStyle_2.14.0 ## ##通过命名空间加载(且未附加):[1] colorspace_1.4-1 biovizBase_1.34.0 ## [3] htmlTable_1.13.2 base64enc_0.1-3 ## [5] dichromat_2.0-0 rstudioapi_0.10 ## [9] bit64_0.9-7 AnnotationDbi_1.48.0 ## [11] codetools_0.2-16 splines_3.6.1 ## [13] ncdf4_1 doParallel_1.0.15 ## [13] ncdf4_1 doParallel_1.0.15 ## [15] impute_1.60.0 knitr_1. 1.25 ## [17] zeallot_0.1.0 Formula_1.2-3 # [19] Rsamtools_2.2.0 vsn_3.54.0 ## [23] BiocManager_1.30.9 compiler_3.6.1 ## [25] httr_1.4.1 backports_1.1.5 ## [27][31] acepack_1.4.1 htmltools_0.4.0 ## [33] prettyunits_1.0.2 tools_3.6.1 ## [35] affy_1.64.0 gtable_0.3.0 ## [37] glue_1.3.1 GenomeInfoDbData_1.2.2 ## [39] dplyr_0.8.3 rappdirs_0.3.1 ## [41] MALDIquant_1.19.3 Biobase_2.46.0 ## [43] vctrs_0.2.0 preprocessCore_1.48.0 ## [45] rtracklayer_1.46.0 iterators_1.0.12 ## [49] ensembldb_2.10.0 XML_3.98-1.20 ## [51] MASS_7.3-51.4 zlibbioc_1.32.0 ## [53][61] AnnotationFilter_1.10.0 RColorBrewer_1.1-2 ## [63] yaml_2.2.0 curl4.2 ## [65] memoise_1.1.0 gridExtra_2.3 ## [67] ggplot2_3.2.1 cleaver_1.24.0 ## [69] biomaRt_2.42.0 rpart_1 . 
4.1-15 ## [71] latticeExtra_0.6-28 stringi_1.4.3 ## [73] RSQLite_2.1.2 foreach_1.4.7 ## [75] checkmate_1.9.4 genome features_1 .38.0 ## [77][79] pkgconfig_2.0.3 matrixStats_0.55.0 ## [81] bitops_1.0-6 evaluate_0.14 ## [83] lattice_0.20-38 purrr_0.3.3 ## [85] GenomicAlignments_1.22.0 htmlwidgets_1.5.1 ## [87] bit_1.1-14 tidyselect_0.2.5 ## [89] plyr_1.8.4 magrittr_1. 1.5 ## [93] R6_2.4.0 Hmisc_4.2-0 ## [95] delayedarray_0.14 Pviz_1.20.0 ## [97] pillar_1.4.2 foreign_0.8-72 ## [101] nnet_7.3-12 tibble_2.1.3 ## [103] crayon_1.3.4 BiocFileCache_1.10.0 ## [105] rmarkdown_1.16 progress_1.2.2 ## [107] data.table_1.12.6 blob_1.2.0 ## [109] digest_0.6.22 openssl_1.4.1 ## [111] munsell_0.5.0 askpass_1.1