fn_compare_expression_matrices

Haky Im

2020-12-03

Want to write a function that takes in two data frames with expression data and returns

vector of correlations of matched genes
summary of the correlation vect (min,1Q,median,3Q,max)
returns a list of aligned data frames
plots correlation for a sample of matched genes
print number of individuals in df1 and df2
print number of individuals in common between df1 and df2
print number of genes in common between df1 and df2
print number of genes in df1 not in df2
print number of genes in df2 not in df1

## Rows: 838
## Columns: 14,262
## Delimiter: "\t"
## chr [    2]: FID, IID
## dbl [14260]: ENSG00000187608, ENSG00000188290, ENSG00000160072, ENSG00000162576, ENSG00000187642...
## 
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message

## Rows: 838
## Columns: 12,816
## Delimiter: "\t"
## chr [    2]: FID, IID
## dbl [12814]: ENSG00000237491.8, ENSG00000177757.2, ENSG00000225880.5, ENSG00000272438.1, ENSG000...
## 
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message

Comparison function

fn_remove_ver_from_names = function(df)
{
  ind = substr(names(df),1,4) == "ENSG"
  names(df)[ind] = substr(names(df)[ind],1,15)
  df
}


fn_compare_expression_matrices = function(df1, df2, nplots = 4, common_genes=TRUE,plot_dir=NULL)
  {
  ## this function assumes that df1 and df2 have the first two columns named FID and IID"
  ## usage: 
  ## tempo_list = fn_compare_expression_matrices(df1, df2, plot_dir="working_dir/plots")
  ## to see plots on screen use the default value plot_dir=NULL by not specifying anything
  ## tempo_list = fn_compare_expression_matrices(df1, df2)
  print(dim(df1)) 
  print(dim(df2))
  if(!identical(df1$FID, df2$FID) |  !identical(df1$IID, df2$IID) )
  {
    id_tibble <- full_join(hg19[,1:2], amygdala[,1:2])
    print(paste("in total there are ",nrow(id_tibble) , "rows") )
    df1 = left_join(id_tibble, df1, by=c("FID","IID"))  
    df2 = left_join(id_tibble, df1, by=c("FID","IID"))  
    print(identical(df1$FID, df2$FID))
    print(identical(df1$IID, df2$IID)) 
    print("the two lines above should say TRUE")
  } 
  ## remove version number from ensid
  df1 = fn_remove_ver_from_names(df1)
  df2 = fn_remove_ver_from_names(df2)
  if(common_genes)
  { 
    gene_list = intersect(names(df1),names(df2))
    gene_list = gene_list[!(gene_list %in% c("FID","IID"))] ## remove FID, IID
    print(glue::glue("There are {length(gene_list)} genes in common"))
    cor_vec = rep(NA, length(gene_list)); names(cor_vec) = gene_list
    for(gg in gene_list)
    {
      cor_vec[gg] = cor(df1[[gg]],df2[[gg]])
    }
    print(summary(cor_vec))
    for(pp in 1:nplots) {
      gg = sample(gene_list,1)
      if(!is.null(plot_dir)) png(glue::glue("{plot_dir}/{gg}_df1_vs_df2.png"))
      plot(df1[[gg]],df2[[gg]],main=paste(gg," - cor = ",round(cor_vec[gg]),2))
      if(!is.null(plot_dir)) dev.off()
    }
    if(!is.null(plot_dir)) png(glue::glue("{plot_dir}/hist_cor_df1_vs_df2.png"))
    hist(cor_vec,main="correlation between df1 and df2")
    if(!is.null(plot_dir)) dev.off()
  } else ## below is just testing, 10K by 10K is too large
  if(ncol(df1)*ncol(df2)<2e6){
    mat1 = scale( as.matrix(df1 %>% select(-c("FID","IID")) ) )
    mat2 = scale(as.matrix(df2 %>% select(-c("FID","IID")) ) )
    cor_mat = t(mat1) %*% mat2 / nrow(mat1)
  }
  list(df1,df2,cor_vec)
}

tempo_list = suppressWarnings(fn_compare_expression_matrices(df1, df2) )

## [1]   838 14262
## [1]   838 12816
## There are 8557 genes in common
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.98801  0.05389  0.36561  0.37017  0.74258  1.00000       11

Calculated correlation all genes agains all genes. Thsi cannot be done for all the genes, so I’m using a subset of 100 or so.

suppressMessages(library(wesanderson))



tileplot <- function(mat,no_label=FALSE)
{
  mat = data.frame(mat)
  mat$Var1 = factor(rownames(mat), levels=rownames(mat)) ## preserve rowname order
  melted_mat <- gather(mat,key=Var2,value=value,-Var1)
  melted_mat$Var2 = factor(melted_mat$Var2, levels=colnames(mat)) ## preserve colname order
  rango = range(melted_mat$value)
  melted_mat$val2 = cut(melted_mat$value,breaks=c(-1,-0.3,0.3,1))
  pp <- ggplot(melted_mat,aes(x=Var1,y=Var2,fill=val2)) + geom_tile() ##+scale_fill_gradientn(colours = c("#C00000", "#FF0000", "#FF8080", "#FFC0C0", "#FFFFFF", "#C0C0FF", "#8080FF", "#0000FF", "#0000C0"), limit = c(-1,1)) 
  if(no_label)  
    pp <- pp + 
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.y=element_blank(),
          axis.ticks.y=element_blank()) +
    scale_fill_manual(values = wes_palette("GrandBudapest1", n = 3))
          #scale_fill_brewer(palette = "Dark2")
          # scale_fill_gradientn( colours=rainbow(5),limit=c(-1,1) )
##    scale_fill_gradient( limits = c(-.5,.5),low =  "blue", high = "red", na.value="black") 
  print(pp)
}


  df1 = fn_remove_ver_from_names(df1)
  df2 = fn_remove_ver_from_names(df2)

t_genelist = sample(intersect(colnames(df1[,-c(1:2)]),colnames(df2[,-c(1:2)])),100)
    mat1 = scale( as.matrix(df1[,t_genelist]  ) )
    mat2 = scale(as.matrix(df2[,t_genelist]  ) )
cor_mat = t(mat1) %*% mat2 / nrow(mat1)
tileplot(cor_mat,no_label=TRUE)

fn_compare_expression_matrices

Haky Im

2020-12-03

Reuse

Suggest changes

Citation