PredictDB databases are stored in simple sqlite files. You can programmatically query them via python, R, perl, etc (using appropriate libraries). Below is an example on how to query the database in R.
Each has two tables the extra and the weights tables. - extra table contains the list of available genes and some prediction performance information (for elastic net, not for the mashr models), - weights table contains the weights for predicting the gene expression levels (or other mediating/molecular traits)
## install.packages("RSQLite")
library("RSQLite")
sqlite <- dbDriver("SQLite")
dbname <- "assets/en_Adipose_Subcutaneous.db" ## add full path if db file not in current directory
## connect to db
db = dbConnect(sqlite,dbname)
## list tables
dbListTables(db)
## [1] "extra" "weights"
dbListFields(db, "weights")
## [1] "gene" "rsid" "varID" "ref_allele" "eff_allele"
## [6] "weight"
dbListFields(db, "extra")
## [1] "gene" "genename" "gene_type"
## [4] "alpha" "n_snps_in_window" "n.snps.in.model"
## [7] "test_R2_avg" "test_R2_sd" "cv_R2_avg"
## [10] "cv_R2_sd" "in_sample_R2" "nested_cv_fisher_pval"
## [13] "nested_cv_converged" "rho_avg" "rho_se"
## [16] "rho_zscore" "pred.perf.R2" "pred.perf.pval"
## [19] "pred.perf.qval"
## convenience query function
query <- function(...) dbGetQuery(db, ...)
## example queries
query('select count(*) from weights')
## count(*)
## 1 249965
query('select * from weights where gene = "GATA6" ')
## [1] gene rsid varID ref_allele eff_allele weight
## <0 rows> (or 0-length row.names)
query('select * from weights limit 10')
## gene rsid varID ref_allele eff_allele
## 1 ENSG00000261456.5 rs11252127 chr10_52147_C_T_b38 C T
## 2 ENSG00000261456.5 rs11252546 chr10_58487_T_C_b38 T C
## 3 ENSG00000261456.5 rs11591988 chr10_80130_C_T_b38 C T
## 4 ENSG00000261456.5 rs4495823 chr10_97603_G_A_b38 G A
## 5 ENSG00000261456.5 rs11253478 chr10_98907_C_T_b38 C T
## 6 ENSG00000261456.5 rs7901397 chr10_102757_T_C_b38 T C
## 7 ENSG00000261456.5 rs7476951 chr10_137211_T_C_b38 T C
## 8 ENSG00000261456.5 rs3123247 chr10_264285_C_T_b38 C T
## 9 ENSG00000261456.5 rs4880567 chr10_267364_T_C_b38 T C
## 10 ENSG00000261456.5 rs4881392 chr10_519179_A_G_b38 A G
## weight
## 1 0.052252706
## 2 -0.033544959
## 3 0.014296499
## 4 -0.030826218
## 5 0.013036311
## 6 -0.118567569
## 7 0.007865262
## 8 0.004556608
## 9 0.030123310
## 10 0.057842737
## how many genes are available for given tissue?
## dbname should be the name of the sqlite database for the tissue
query('select count(*) from extra')
## count(*)
## 1 8650
## select genes with R2>0.01 (this is cor>0.1)
## ths won't work for the latest MASHR-based GTEx V8 models
## this only works for models where R2 is included.
#high.h2.genes <- query('select * from extra where R2 > 0.01')