Pbase 示例数据
包:Pbase
作者:Laurent Gatto 与 Sebastian Gibb
最后编译:2019年10月29日星期二21:01:04
最后修改:2019-10-29 16:23:38
Pbase 包的核心数据对象是 Proteins 实例,如下图所示。它们包含一组蛋白质(下图中为 10 个),由蛋白质序列(灰色框)和注释数据(左侧表)组成。每个蛋白质都链接到一组感兴趣的范围,比如实验观察到的多肽的蛋白质结构域(同样以灰色显示),这些范围也带有各自的注释数据。该图还显示了用于不同数据槽的访问器,详见 ?Proteins。
Proteins 对象由来自 fasta 文件的蛋白质序列填充,而多肽通常来自 LC-MSMS 实验。
下面使用的原始数据来自加入 50 ng HeLa 背景中的 10 fmol 肽保留时间校准混合物(PRTC),在 Thermo Orbitrap Q Exactive 仪器上采集。随后使用 MSGF+ 搜索引擎,对 UniProt 2015_02 版本中一组受限的高分人类蛋白质进行了搜索。
library("Biostrings")
## 加载所需的包:XVector
##
## 附加包:'Biostrings'
## 来自 'package:base' 的以下对象被屏蔽:
##
##     strsplit
fafile <- system.file("extdata/HUMAN_2015_02_selected.fasta", package = "Pbase")
fa <- readAAStringSet(fafile)
fa
## 一个长度为 9 的 AAStringSet 实例
##     width seq                                          names
## [1]  2602 MPVTEKDLAEDAPWKKIQQNTF...VLAVKWGEEHIPGSPFHVTVP sp|O75369|FLNB_HU...
## [2]  3374 MSPEGHSRIFEATAGPNKPES...YTLSKDSLSNGVPSGRQAEFS sp|A4UGR9|XIRP2_H...
## [3]  2624 MFRRARLSVKPNVRPGVGARGS...EATTVSEYFFNDIFIEVDETE sp|A6H8Y1|BDP1_HU...
## [4]   911 MVDYHAANQSYQYGPSSAGNGA...AVPGALDYKSFSTALYGESDL sp|O43707|ACTN4_H...
## [5]   417 MSLSNKLTLDKLDVKGKRVVMR...GASLELLEGKVLPGVDALSNI sp|P00558|PGK1_HU...
## [6]   375 MDDDIAALVVDNGSGMCKAGFA...MWISKQEYDESGPSIVHRKCF sp|P60709|ACTB_HU...
## [7]   664 METPSQRRATRSGAQASSTPLS...RSYLLGNSSPRTQSPQNCSIM sp|P02545|LMNA_HU...
## [8]   364 MPYQYPALTPEQKKELSDIAHR...TPSGQAGAAASESLFVSNHAY sp|P04075|ALDOA_H...
## [9]   418 MARRKPEGSSFNMTHLSMAMAF...TPSGQAGAAASESLFVSNHAY sp|P04075-2|ALDOA...
library("mzID")
idfile <- system.file("extdata/Thermo_Hela_PRTC_selected.mzid", package = "Pbase")
id <- flatten(mzID(idfile))
## 正在读取 Thermo_Hela_PRTC_selected.mzid... 完成!
dim(id)
## [1] 137 29
head(id)
## spectrumid扫描编号## 1 index=173 12256 ## 1.1 index=173 12256 ## 2 index=163 11860 ## 2.1 index=163 11860 ## 3 index=200 13408 ## 3.1 index=200 13408 ##频谱标题## 1 msLevel 2;retentionTime 2094.56706;scanNum 12256;precMz 1137.06665029649;precCharge 2 ## 1.1 ms2级;retentionTime 2094.56706;scanNum 12256;precMz 1137.06665029649;precCharge 2 ## 2 msLevel 2;retentionTime 2039.84424; scanNum 11860; precMz 1136.57450195803; precCharge 2 ## 2.1 msLevel 2; retentionTime 2039.84424; scanNum 11860; precMz 1136.57450195803; precCharge 2 ## 3 msLevel 2; retentionTime 2258.27868; scanNum 13408; precMz 703.038108542133; precCharge 3 ## 3.1 msLevel 2; retentionTime 2258.27868; scanNum 13408; precMz 703.038108542133; precCharge 3 ## acquisitionnum passthreshold rank calculatedmasstocharge ## 1 173 TRUE 1 1136.574 ## 1.1 173 TRUE 1 1136.574 ## 2 163 TRUE 1 1136.574 ## 2.1 163 TRUE 1 1136.574 ## 3 200 TRUE 1 703.037 ## 3.1 200 TRUE 1 703.037 ## experimentalmasstocharge chargestate ms-gf:denovoscore ms-gf:evalue ## 1 1137.0667 2 132 2.597097e-18 ## 1.1 1137.0667 2 132 2.597097e-18 ## 2 1136.5745 2 230 4.942664e-17 ## 2.1 1136.5745 2 230 4.942664e-17 ## 3 703.0381 3 145 4.080429e-10 ## 3.1 703.0381 3 145 4.080429e-10 ## ms-gf:rawscore ms-gf:specevalue assumeddissociationmethod isotopeerror ## 1 118 2.276758e-22 CID 1 ## 1.1 118 2.276758e-22 CID 1 ## 2 186 4.333009e-21 CID 0 ## 2.1 186 4.333009e-21 CID 0 ## 3 98 3.578068e-14 CID 0 ## 3.1 98 3.578068e-14 CID 0 ## isdecoy post pre end start accession length ## 1 FALSE C K 134 112 sp|P04075|ALDOA_HUMAN 364 ## 1.1 FALSE C K 188 166 sp|P04075-2|ALDOA_HUMAN 418 ## 2 FALSE C K 134 112 sp|P04075|ALDOA_HUMAN 364 ## 2.1 FALSE C K 188 166 sp|P04075-2|ALDOA_HUMAN 418 ## 3 FALSE Y K 173 154 sp|P04075|ALDOA_HUMAN 364 ## 3.1 FALSE Y K 227 208 sp|P04075-2|ALDOA_HUMAN 418 ## description ## 1 Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2 ## 1.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA ## 2 
Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2 ## 2.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA ## 3 Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA PE=1 SV=2 ## 3.1 Isoform 2 of Fructose-bisphosphate aldolase A OS=Homo sapiens GN=ALDOA ## pepseq modified modification ## 1 GVVPLAGTNGETTTQGLDGLSER FALSE ## 1.1 GVVPLAGTNGETTTQGLDGLSER FALSE ## 2 GVVPLAGTNGETTTQGLDGLSER FALSE ## 2.1 GVVPLAGTNGETTTQGLDGLSER FALSE ## 3 IGEHTPSALAIMENANVLAR FALSE ## 3.1 IGEHTPSALAIMENANVLAR FALSE ## idFile spectrumFile ## 1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 1.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 2 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 2.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 3 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## 3.1 Thermo_Hela_PRTC_selected.mzid Thermo_Hela_PRTC_selected.mgf ## databaseFile ## 1 HUMAN_2015_02_selected.fasta ## 1.1 HUMAN_2015_02_selected.fasta ## 2 HUMAN_2015_02_selected.fasta ## 2.1 HUMAN_2015_02_selected.fasta ## 3 HUMAN_2015_02_selected.fasta ## 3.1 HUMAN_2015_02_selected.fasta
library("Pbase")
p <- Proteins(fafile)
p <- addIdentificationData(p, idfile)
## 读取 1 个识别文件:
## 1. /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/Thermo_Hela_PRTC_selected.mzid
## .
p
## S4 类类型:Proteins
## 类版本:0.2
## 创建时间:2019年10月29日星期二21:01:21
## 蛋白质数量:9
## 序列:
##   [1] A4UGR9 [2] A6H8Y1 ... [8] P04075-2 [9] P60709
## 蛋白质范围:
##   肽
一个 Proteins 对象由一组蛋白质序列(可通过 aa 访问器获取)以及一组可选的肽特征组成,这些肽特征被映射为沿着蛋白质的坐标,可通过 pranges 获取。实际的肽序列可以用 pfeatures 提取。蛋白质序列的名称可以用 seqnames 提取。
aa(p)
## 一个长度为 9 的 AAStringSet 实例
##     width seq                                          names
## [1]  3374 MSPEGHSRIFEATAGPNKPES...YTLSKDSLSNGVPSGRQAEFS A4UGR9
## [2]  2624 MFRRARLSVKPNVRPGVGARGS...EATTVSEYFFNDIFIEVDETE A6H8Y1
## [3]   911 MVDYHAANQSYQYGPSSAGNGA...AVPGALDYKSFSTALYGESDL O43707
## [4]  2602 MPVTEKDLAEDAPWKKIQQNTF...VLAVKWGEEHIPGSPFHVTVP O75369
## [5]   417 MSLSNKLTLDKLDVKGKRVVMR...GASLELLEGKVLPGVDALSNI P00558
## [6]   664 METPSQRRATRSGAQASSTPLS...RSYLLGNSSPRTQSPQNCSIM P02545
## [7]   364 MPYQYPALTPEQKKELSDIAHR...TPSGQAGAAASESLFVSNHAY P04075
## [8]   418 MARRKPEGSSFNMTHLSMAMAF...TPSGQAGAAASESLFVSNHAY P04075-2
## [9]   375 MDDDIAALVVDNGSGMCKAGFA...MWISKQEYDESGPSIVHRKCF P60709
seqnames(p)
## [1] "A4UGR9"   "A6H8Y1"   "O43707"   "O75369"   "P00558"   "P02545"
## [7] "P04075"   "P04075-2" "P60709"
pranges(p)
##数据框架与9行和1列##短数据帧##448-465,1284-1291,1120-1128,…## o43707 51-65,495-512,438-450,…## o75369 895-909,1746-1757,2563-2578,…## p00558 193-206,268-275,333-350,…## p02545 1-11,332-349,367-377,…## p04075 112-134,112-134,154-173,…## p04075-2 166-188,166-188,208-227,…184-196
pfeatures (p)
##长度为9的AAStringSetList ## [["A4UGR9"]] A4UGR9=QEITQNKSFFSSVKESQR…A4ugr9 = qeitqnksffssvk ## [[" a6h8y1 "]] a6h8y1 = edaeqvalevdlnqkkrr…## [[" o43707 "]] o43707 = qqrktftawcnshlr…O43707= vgweqllttiar ## [[" o75369 "]] o75369 = dldiidnydyshtvk…O75369= vqaqgpglkeaftnk ## [[" p00558 "]] p00558 = elnyfakalesper p00558 = dlmskaek…P00558= gtkalmdevvk ## [[" p02545 "]] p02545 = metpsqrratr…P02545= ratrsgaqasstplsptr ## [[" p04075 "]] p04075 = gvvplagtngetttqgldglser…## [[" p04075-2 "]] p04075-2 = gvvplagtngetttqgldglser…## [[" p60709 "]] p60709 = dltdylmkilter
Proteins 实例还由一个通用的 metadata 列表进一步描述。蛋白质序列注释和肽特征注释可以分别通过 acols 和 pcols 访问,它们返回 DataFrame 实例。
metadata(p)
## $created ##[1]“2019年10月29日星期二21:01:21”
acols(p)
# # DataFrame 9行和12列# # DB AccessionNumber EntryName IsoformName # # < Rle > <人物> <人物> < Rle > # # A4UGR9 sp A4UGR9 XIRP2_HUMAN NA # # A6H8Y1 sp A6H8Y1 BDP1_HUMAN NA # # O43707 sp O43707 ACTN4_HUMAN NA # # O75369 sp O75369 FLNB_HUMAN NA # # P00558 sp P00558 PGK1_HUMAN NA # # P02545 sp P02545 LMNA_HUMAN NA # # P04075 sp P04075 ALDOA_HUMAN NA # # P04075-2 sp P04075-2 ALDOA_HUMAN 2 # # P60709 sp P60709 ACTB_HUMAN NA # # ProteinName OrganismName # # <人物> < Rle > # # A4UGR9鑫肌动蛋白结合重复蛋白2智人A6H8Y1转录因子TFIIIB成分B "同源物智人O43707 α -肌动蛋白-4智人O75369丝蛋白-B智人P00558磷酸甘油酸激酶1智人P02545 pre - lamin-A/C智人P04075二磷酸果糖醛缩酶A智人P04075-2二磷酸果糖醛缩酶A智人P60709肌动蛋白胞质1智人# # GeneName ProteinExistence SequenceVersion评论# # < Rle > < Rle > < Rle > < Rle > # # A4UGR9 XIRP2证据蛋白质二级NA # # A6H8Y1 BDP1证据蛋白质三级NA # # O43707 ACTN4证据蛋白质二级NA # # O75369 FLNB证据蛋白质二级NA # # P00558 PGK1证据蛋白质三级NA # # P02545 LMNA证据蛋白质一级NA # # P04075 ALDOA证据蛋白质二级NA # # P04075-2 ALDOA NA NA NA # # P60709 ACTB证据在蛋白质水平1 NA ##文件名## ## A4UGR9 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected. txtfasta# # A6H8Y1 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta # # O43707 / tmp / RtmphAy6LL / Rinstf3345a8ba52 Pbase / extdata / HUMAN_2015_02_selected。fasta# # O75369 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P00558 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P02545 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P04075 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta# # P04075-2 /tmp/RtmphAy6LL/Rinstf3345a8ba52/Pbase/extdata/HUMAN_2015_02_selected。fasta # # P60709 / tmp / RtmphAy6LL / Rinstf3345a8ba52 Pbase / extdata / HUMAN_2015_02_selected。fasta ## npep ## ## A4UGR9 36 ## A6H8Y1 23 ## O43707 6 ## O75369 13 ## P00558 5 ## P02545 12 ## P04075 21 ## P04075-2 20 ## P60709 1
pcols(p)
##数据框架与9行和1列##短数据帧##448-465,1284-1291,1120-1128,…## o43707 51-65,495-512,438-450,…## o75369 895-909,1746-1757,2563-2578,…## p00558 193-206,268-275,333-350,…## p02545 1-11,332-349,367-377,…## p04075 112-134,112-134,154-173,…## p04075-2 166-188,166-188,208-227,…184-196
可以使用 [ 按索引或名称提取特定的蛋白质。蛋白质和它们的肽特征可以用默认的绘图方法(plot)绘制。
seqnames(p)
## [1] "A4UGR9"   "A6H8Y1"   "O43707"   "O75369"   "P00558"   "P02545"
## [7] "P04075"   "P04075-2" "P60709"
plot(p[c(9)])
更多详情见 ?Proteins。上面生成的对象也可以直接通过 data(p) 获取。
sessionInfo()
## R版本3.6.1(2019-07-05)##平台:x86_64-pc-linux-gnu(64位)##运行在Ubuntu 18.04.3 LTS ## ##矩阵产品:默认## BLAS: /home/biocbuild/bbs-3.10-bioc/R/lib/libRblas。所以## LAPACK: /home/biocbuild/bbs-3.10-bioc/R/lib/libRlapack。所以## ## locale: ## [1] LC_CTYPE=en_US。UTF-8 LC_NUMERIC= c# # [3] LC_TIME=en_US。UTF-8 LC_COLLATE= c# # [5] LC_MONETARY=en_US。utf - 8 LC_MESSAGES = en_US。UTF-8 ## [7] LC_PAPER=en_US。UTF-8 LC_NAME= c# # [9] LC_ADDRESS=C lc_phone = c# # [11] LC_MEASUREMENT=en_US。UTF-8 LC_IDENTIFICATION=C ## ##附加的基础包:## [1]grid stats4 parallel stats graphics grDevices utils ## [8] datasets methods base ## ##其他附加包:## [1]mzID_1.24.0 Biostrings_2.54.0 XVector_0.26.0 Gviz_1.30.0 GenomicRanges_1.38.0 ## [7] GenomeInfoDb_1.22.0 IRanges_2.20.0 S4Vectors_0.24.0 ## [10] Rcpp_1.0.2 BiocGenerics_0.32.0 BiocStyle_2.14.0 ## ##通过命名空间加载(且未附加):[1] colorspace_1.4-1 biovizBase_1.34.0 ## [3] htmlTable_1.13.2 base64enc_0.1-3 ## [5] dichromat_2.0-0 rstudioapi_0.10 ## [9] bit64_0.9-7 AnnotationDbi_1.48.0 ## [11] codetools_0.2-16 splines_3.6.1 ## [13] ncdf4_1 doParallel_1.0.15 ## [13] ncdf4_1 doParallel_1.0.15 ## [15] impute_1.60.0 knitr_1. 1.25 ## [17] zeallot_0.1.0 Formula_1.2-3 # [19] Rsamtools_2.2.0 vsn_3.54.0 ## [23] BiocManager_1.30.9 compiler_3.6.1 ## [25] httr_1.4.1 backports_1.1.5 ## [27][31] acepack_1.4.1 htmltools_0.4.0 ## [33] prettyunits_1.0.2 tools_3.6.1 ## [35] affy_1.64.0 gtable_0.3.0 ## [37] glue_1.3.1 GenomeInfoDbData_1.2.2 ## [39] dplyr_0.8.3 rappdirs_0.3.1 ## [41] MALDIquant_1.19.3 Biobase_2.46.0 ## [43] vctrs_0.2.0 preprocessCore_1.48.0 ## [45] rtracklayer_1.46.0 iterators_1.0.12 ## [49] ensembldb_2.10.0 XML_3.98-1.20 ## [51] MASS_7.3-51.4 zlibbioc_1.32.0 ## [53][61] AnnotationFilter_1.10.0 RColorBrewer_1.1-2 ## [63] yaml_2.2.0 curl4.2 ## [65] memoise_1.1.0 gridExtra_2.3 ## [67] ggplot2_3.2.1 cleaver_1.24.0 ## [69] biomaRt_2.42.0 rpart_1 . 
4.1-15 ## [71] latticeExtra_0.6-28 stringi_1.4.3 ## [73] RSQLite_2.1.2 foreach_1.4.7 ## [75] checkmate_1.9.4 genome features_1 .38.0 ## [77][79] pkgconfig_2.0.3 matrixStats_0.55.0 ## [81] bitops_1.0-6 evaluate_0.14 ## [83] lattice_0.20-38 purrr_0.3.3 ## [85] GenomicAlignments_1.22.0 htmlwidgets_1.5.1 ## [87] bit_1.1-14 tidyselect_0.2.5 ## [89] plyr_1.8.4 magrittr_1. 1.5 ## [93] R6_2.4.0 Hmisc_4.2-0 ## [95] delayedarray_0.14 Pviz_1.20.0 ## [97] pillar_1.4.2 foreign_0.8-72 ## [101] nnet_7.3-12 tibble_2.1.3 ## [103] crayon_1.3.4 BiocFileCache_1.10.0 ## [105] rmarkdown_1.16 progress_1.2.2 ## [107] data.table_1.12.6 blob_1.2.0 ## [109] digest_0.6.22 openssl_1.4.1 ## [111] munsell_0.5.0 askpass_1.1