Want to write a function that takes in two data frames with expression data and returns

  • vector of correlations of matched genes

  • summary of the correlation vect (min,1Q,median,3Q,max)

  • returns a list of aligned data frames

  • plots correlation for a sample of matched genes

  • print number of individuals in df1 and df2

  • print number of individuals in common between df1 and df2

  • print number of genes in common between df1 and df2

  • print number of genes in df1 not in df2

  • print number of genes in df2 not in df1

## Rows: 838
## Columns: 14,262
## Delimiter: "\t"
## chr [    2]: FID, IID
## dbl [14260]: ENSG00000187608, ENSG00000188290, ENSG00000160072, ENSG00000162576, ENSG00000187642...
## 
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
## Rows: 838
## Columns: 12,816
## Delimiter: "\t"
## chr [    2]: FID, IID
## dbl [12814]: ENSG00000237491.8, ENSG00000177757.2, ENSG00000225880.5, ENSG00000272438.1, ENSG000...
## 
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message

Comparison function

fn_remove_ver_from_names = function(df)
{
  ind = substr(names(df),1,4) == "ENSG"
  names(df)[ind] = substr(names(df)[ind],1,15)
  df
}


fn_compare_expression_matrices = function(df1, df2, nplots = 4, common_genes=TRUE,plot_dir=NULL)
  {
  ## this function assumes that df1 and df2 have the first two columns named FID and IID"
  ## usage: 
  ## tempo_list = fn_compare_expression_matrices(df1, df2, plot_dir="working_dir/plots")
  ## to see plots on screen use the default value plot_dir=NULL by not specifying anything
  ## tempo_list = fn_compare_expression_matrices(df1, df2)
  print(dim(df1)) 
  print(dim(df2))
  if(!identical(df1$FID, df2$FID) |  !identical(df1$IID, df2$IID) )
  {
    id_tibble <- full_join(hg19[,1:2], amygdala[,1:2])
    print(paste("in total there are ",nrow(id_tibble) , "rows") )
    df1 = left_join(id_tibble, df1, by=c("FID","IID"))  
    df2 = left_join(id_tibble, df1, by=c("FID","IID"))  
    print(identical(df1$FID, df2$FID))
    print(identical(df1$IID, df2$IID)) 
    print("the two lines above should say TRUE")
  } 
  ## remove version number from ensid
  df1 = fn_remove_ver_from_names(df1)
  df2 = fn_remove_ver_from_names(df2)
  if(common_genes)
  { 
    gene_list = intersect(names(df1),names(df2))
    gene_list = gene_list[!(gene_list %in% c("FID","IID"))] ## remove FID, IID
    print(glue::glue("There are {length(gene_list)} genes in common"))
    cor_vec = rep(NA, length(gene_list)); names(cor_vec) = gene_list
    for(gg in gene_list)
    {
      cor_vec[gg] = cor(df1[[gg]],df2[[gg]])
    }
    print(summary(cor_vec))
    for(pp in 1:nplots) {
      gg = sample(gene_list,1)
      if(!is.null(plot_dir)) png(glue::glue("{plot_dir}/{gg}_df1_vs_df2.png"))
      plot(df1[[gg]],df2[[gg]],main=paste(gg," - cor = ",round(cor_vec[gg]),2))
      if(!is.null(plot_dir)) dev.off()
    }
    if(!is.null(plot_dir)) png(glue::glue("{plot_dir}/hist_cor_df1_vs_df2.png"))
    hist(cor_vec,main="correlation between df1 and df2")
    if(!is.null(plot_dir)) dev.off()
  } else ## below is just testing, 10K by 10K is too large
  if(ncol(df1)*ncol(df2)<2e6){
    mat1 = scale( as.matrix(df1 %>% select(-c("FID","IID")) ) )
    mat2 = scale(as.matrix(df2 %>% select(-c("FID","IID")) ) )
    cor_mat = t(mat1) %*% mat2 / nrow(mat1)
  }
  list(df1,df2,cor_vec)
}

tempo_list = suppressWarnings(fn_compare_expression_matrices(df1, df2) )
## [1]   838 14262
## [1]   838 12816
## There are 8557 genes in common
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -0.98801  0.05389  0.36561  0.37017  0.74258  1.00000       11

Calculated correlation all genes agains all genes. Thsi cannot be done for all the genes, so I’m using a subset of 100 or so.

suppressMessages(library(wesanderson))



tileplot <- function(mat,no_label=FALSE)
{
  mat = data.frame(mat)
  mat$Var1 = factor(rownames(mat), levels=rownames(mat)) ## preserve rowname order
  melted_mat <- gather(mat,key=Var2,value=value,-Var1)
  melted_mat$Var2 = factor(melted_mat$Var2, levels=colnames(mat)) ## preserve colname order
  rango = range(melted_mat$value)
  melted_mat$val2 = cut(melted_mat$value,breaks=c(-1,-0.3,0.3,1))
  pp <- ggplot(melted_mat,aes(x=Var1,y=Var2,fill=val2)) + geom_tile() ##+scale_fill_gradientn(colours = c("#C00000", "#FF0000", "#FF8080", "#FFC0C0", "#FFFFFF", "#C0C0FF", "#8080FF", "#0000FF", "#0000C0"), limit = c(-1,1)) 
  if(no_label)  
    pp <- pp + 
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.y=element_blank(),
          axis.ticks.y=element_blank()) +
    scale_fill_manual(values = wes_palette("GrandBudapest1", n = 3))
          #scale_fill_brewer(palette = "Dark2")
          # scale_fill_gradientn( colours=rainbow(5),limit=c(-1,1) )
##    scale_fill_gradient( limits = c(-.5,.5),low =  "blue", high = "red", na.value="black") 
  print(pp)
}


  df1 = fn_remove_ver_from_names(df1)
  df2 = fn_remove_ver_from_names(df2)

t_genelist = sample(intersect(colnames(df1[,-c(1:2)]),colnames(df2[,-c(1:2)])),100)
    mat1 = scale( as.matrix(df1[,t_genelist]  ) )
    mat2 = scale(as.matrix(df2[,t_genelist]  ) )
cor_mat = t(mat1) %*% mat2 / nrow(mat1)
tileplot(cor_mat,no_label=TRUE)

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The source code is licensed under MIT.

Suggest changes

If you find any mistakes (including typos) or want to suggest changes, please feel free to edit the source file of this page on Github and create a pull request.

Citation

For attribution, please cite this work as

Haky Im (2020). fn_compare_expression_matrices. ImLab Notes. /post/2020/12/03/im-compare-expression-matrices/

BibTeX citation

@misc{
  title = "fn_compare_expression_matrices",
  author = "Haky Im",
  year = "2020",
  journal = "ImLab Notes",
  note = "/post/2020/12/03/im-compare-expression-matrices/"
}