Section 17 Conserved Paralog Substitution

This part of code collects the output of 06-ComplexesCoExpressionMSData, it collect all together and binds creating a dataset with all the stoichiometric co-expression for different subunits.

17.1 Load Parameters and Library

In this first part of the script as usually library and functions are called, plus information on how to retrieve the different files that contains the stoichiometric values for different subunits.

#Change Directory 
setwd("../ComplexScript/")

#Paralogs Palette
ParalogsColor <- c("#e6d5b8","#0278ae")

######################################
# PARAMETERS
######################################
library(VennDiagram)
library(RColorBrewer)
library(ComplexHeatmap)
library(circlize)
library(spatialR)
library(dplyr)
library(cowplot)
library(ggpubr)
library(rstatix)

library(ggrepel)
source("../ComplexScript/complexes_function.R")

First we read the output files generated by the complexes co-expression analysis, the output of the 07-ParalogsSubtitution.Rmd. That indicates paralog pairs inside MS neuronal differentiation data. While the results of the co-expression of subunits are stored inside the InputDir.stability

#Get the switch File, compare all the ParalogSwitch File in the Out/ directory
InputDir = "../out/paralogs/"
InputDir.stability = "../out/complex_coexpr/"

Files = list.files(InputDir)
CoexprFiles <- list.files(InputDir.stability)

We define the different conditions of the comparison in order to grep them in an efficient way and avoid taking different files.

#All subunits
Comps <- c("Neur.Stem","NPC.iPS","Neu.IPS","Neu.NPC",
           "DIV3.DIV0","DIV10.DIV0","DIV10.DIV3","DIV5.DIV1","DIV14.DIV1","DIV14.DIV5")

We then collect the info from different species, in a dataframe called Dataset. The SwitchFile variables contains the filename of the different files that stores the comparison of paralog pairs that are part of protein complexes during neuronal differentiation.

Species = c("drerio","rnorvegicus","hsapiens","mmusculus","drerio")
Dataset = c("Drerio","Frese","Djuric","MouseTMT","DrerioRNASeq")
Dataset = data.frame(Dataset = Dataset,Species=Species)
rownames(Dataset) = Dataset$Dataset

SwitchFile = Files[grep("PutativeSwitch",Files)]
Stability = CoexprFiles[grep("subunits_stability",CoexprFiles)]

We now select only file names that matches our comparison that we want to consider. This in order to avoid any different files to be considered in our analysis.

SwitchFile <- SwitchFile[grep(paste(Comps,collapse = '|'),SwitchFile)]
Stability <- Stability[grep(paste(Comps,collapse = '|'),Stability)]

Here we read all the files from the output of 06-ComplexesCoExpressionMSData and we collect them inside the ParalogsList variable. This contains the protein-coexpression values for each subunits in the different complexes, comparing paralog pairs. While StabilityList just contains the coexpression values for single subunits across all datasets.

#Change Directory 
setwd("../ComplexScript/")

#********************************************
#********************************************
#RUN THIS PART AFTER SETTING PARAMETERS

#ParalogsList
#Load Files
ParalogsList = lapply(SwitchFile,function(x){read.csv(paste(InputDir,x,sep=""),stringsAsFactors=FALSE)})
names(ParalogsList) = SwitchFile

#StabilityList
StabilityList = lapply(Stability,function(x){read.csv(paste(InputDir.stability,x,sep=""),stringsAsFactors=FALSE)})
names(StabilityList) = Stability

We first bind together the information coming from the complexes co-expression analysis. We extract the information coming from the species and conditions, and finally we also order them as factor, based on the Comps variable, for future plotting purpose.

AllSubunits <- do.call(rbind.data.frame,StabilityList)
AllSubunits$species <- unlist(lapply(strsplit(rownames(AllSubunits),"_"),function(x)x[1]))
AllSubunits$species <- Dataset[AllSubunits$species,"Species"]
AllSubunits$cond <- factor(AllSubunits$cond,levels = Comps)

17.2 Paralogs Subunits variability across datasets

We can store the information coming from the subunits co-expression analysis from the different datasets.

hsap <- ggplot(data=AllSubunits[AllSubunits$species=="hsapiens",]) + geom_density(aes(x=pvalue)) + facet_wrap(~cond) + theme_bw()
mmus <- ggplot(data=AllSubunits[AllSubunits$species=="mmusculus",]) + geom_density(aes(x=pvalue)) + facet_wrap(~cond)+ theme_bw()
rnor <- ggplot(data=AllSubunits[AllSubunits$species=="rnorvegicus",]) + geom_density(aes(x=pvalue)) + facet_wrap(~cond) + theme_bw()
drer <- ggplot(data=AllSubunits[AllSubunits$species=="drerio",]) + geom_density(aes(x=pvalue)) + facet_wrap(~cond) + theme_bw()

17.3 Paralogs Subunits variability across datasets ( Boxplot)

Here we can then plot the stoichiometric stability of each subunits, (the pvalue) column. Low pvalues denote high co-expression, and therefore high pvalues denotes high stoichiometric variability.

#Add test information
AllSubunits.test <- AllSubunits %>% group_by(species,cond) %>% 
                    wilcox_test(pvalue ~ has.Paralogs) %>%
                    add_significance("p") %>% add_xy_position(x = "cond") %>%
                    group_by(species) %>% mutate(x.pos = 1:n()) %>% ungroup()

pdf("../out/plots/p09Cons.pdf",width = 6.5,height = 3.4)
p09Cons <- ggplot(data=AllSubunits) + 
            geom_boxplot(aes(x=cond,y=pvalue,fill=has.Paralogs),alpha=0.4) + 
            facet_wrap(~species,ncol=2,scales = "free") + 
            stat_pvalue_manual(AllSubunits.test,x = 'x.pos',tip.length = 0.5,position = position_dodge(0.8),coord.flip = T) + 
            theme_bw() + scale_y_continuous(expand = expansion(mult = c(0.05, 0.1))) + coord_flip() + 
            scale_fill_manual(values = ParalogsColor) + xlab("") +
            ylab('stoichiometric stability (p.value)') + theme(legend.position = 'top')

p09Cons
dev.off()

## png 
##   2

17.4 save workspace

We then save the workspace for future analysis.

save.image("../Markdown/workspaces/08-ConservedSubstitution_CompleteWorkSpace.RData")

17.5 load previous workspace

load("../Markdown/workspaces/05-MSProteomicsData_ParalogsCompleteWorkspace.RData")

17.6 Shared Paralog substitution

In this section we address the number of shared paralog substitutions across datasets. We use the intensity values of the different paralog pairs to calculate ratios between paralogs. Then it will be possible to calculate the increase or decrese of this ratios during neuronal differentiation and address which substitutions are then shared by the different organism. First we load some important libraries

library(ggplot2)
library(tidyr)
library(data.table)
library(dplyr)
library(MASS)
library(paralogstools)
library(proteomicstools)
library(reshape2)
library(cowplot)
library(ComplexHeatmap)

source("../ComplexScript/complexes_function.R")

17.6.1 Load Protein Intensity data

We need to start by loading protein intensity data from the different datasets that we have. And organize them in a data-friendly way, so that will be easy to work with them.

Drerio <- "../Data/Dataset/processed/ZebrafishNeurogProcessed.txt"
Djuric <- "../Data/Dataset/processed/Djuric_et_al_2017_processed.csv"
Mouse <- "../Data/Dataset/DataSetProcessing/27052019_MouseNeuron_TMT10_normalized_protein_matrix.csv"
Frese <- "../Data/Dataset/processed/Frese_et_al_2017_processed.csv"

And we load them into different dataframes.

Drerio <- read.delim(Drerio,sep = "\t",header = T)
Djuric <- read.delim(Djuric,sep=",",header = T)
Frese <- read.delim(Frese,sep = ",",header = T)
Mouse <- read.delim(Mouse,sep = ",",header = T)

For mouse we add the indications coming from the different samples in the column names, and annotate with gene name. We also need to remove the last column of the dataset, since it represent a MS-run coming from the pooling of the different samples. We use the Annotated function to add the gene name annotation.

colnames(Mouse) <- c("X","DIV0.1","DIV0.2","DIV0.3","DIV3.1","DIV3.2","DIV3.3","DIV10.1","DIV10.2","DIV10.3")
Mouse <- Mouse[,-ncol(Mouse)]
#Annotate with gene name
Mouse <- spatialR::Annotate(Mouse,organism = "Mm","X",idsep = ";",annot = "SYMBOL")
#Leave only symbol and toupper for genes
Mouse <- Mouse[,-1];Mouse$SYMBOL<-toupper(Mouse$SYMBOL)

For Zebrafish data we select only columns index that are important and that contains the protein quantification, so the iBAQ values for both Light and Heavy channels across the different replicates.

#select interesting columns for different datasets
col.drerio <- c(2,134,grep("iBAQ.L.Mix",colnames(Drerio)),
                grep("iBAQ.H.Mix",colnames(Drerio)))

Now we subset all the dataset for the relevant column and and convert intensity values in log2.

#Subset datasets
Drerio <- Drerio[,col.drerio]
Drerio[Drerio==0] <- NA

#Convert in Log
Drerio[,-c(1:2)] <- log2(Drerio[,-c(1:2)])
Djuric <- Djuric[,c(1,3:12)]
colnames(Djuric) <- gsub("_",".",colnames(Djuric))

#Frese
Frese <- Frese[,c(1,11,2:7)]
Frese <- Frese[,-1]
colnames(Frese) <- gsub("DIV1\\.","DIV01.",colnames(Frese))

17.6.1.1 Split dataset on single IDs

Since each entries in the data can be actually assigned to multiple genes, we use SplitDatasetByID function to extend the dataset. Now each entry represent one sigle gene. Of course, genes that were belonging to the same proteinGroups have now the same quantifications.

Drerio <- SplitDatasetByID(Drerio,'genename')
Frese <- SplitDatasetByID(Frese,'SYMBOL')
Djuric <- SplitDatasetByID(Djuric,'Gene.Symbol')
Mouse <- SplitDatasetByID(Mouse,'SYMBOL')

17.6.2 Load Paralogs

Here we now generate all the possible paralog pairs, we need to load different paralogs datasets for the different organism.

Paralogs.mouse <- ParalogsSet(organism = "mmusculus",identifier = "SYMBOL",
                              filename = "../Data/Paralogs/mmusculus_SYMBOL_paralogs_v102.txt")

Paralogs.drerio <- ParalogsSet(organism = "drerio",identifier = "SYMBOL",
                              filename = "../Data/Paralogs/drerio_SYMBOL_paralogs_v102.txt")

Paralogs.hsapiens <- ParalogsSet(organism = "hsapiens",identifier = "SYMBOL",
                              filename = "../Data/Paralogs/hsapiens_SYMBOL_paralogs_v102.txt")

Paralogs.rnorvegi <- ParalogsSet(organism = "rnorvegicus",identifier = "SYMBOL",
                            filename = "../Data/Paralogs/rnorvegicus_SYMBOL_paralogs_v102.txt")

17.6.3 Load eggNOG annotations

Here we load the eggNOG annotations, coming from the mapping of the different proteomes.

eggNOG.Hsap <- "../Data/eggNOG/hsapiens_all_proteome.emapper_header.annotations"
eggNOG.Drer <- "../Data/eggNOG/drerio_all_proteome.emapper_header.annotations"
eggNOG.Rnor <- "../Data/eggNOG/rnorvegicus_all_proteome.emapper_header.annotations"
eggNOG.Mmus <- "../Data/eggNOG/mmusculus_all_proteome.emapper_header.annotations"

#read files
eggNOG.Hsap <- read.delim(eggNOG.Hsap,sep = "\t",header = T)
eggNOG.Drer <- read.delim(eggNOG.Drer,sep = "\t",header = T)
eggNOG.Rnor <- read.delim(eggNOG.Rnor,sep = "\t",header = T)
eggNOG.Mmus <- read.delim(eggNOG.Mmus,sep = "\t",header = T)

Then for each eggNOG annotation we extract the annotation relative to the class “Vertebrata.”

#Homo sapiens
eggNOG.hs.v <- lapply(strsplit(eggNOG.Hsap$eggNOG_OGs,split = ","),function(x)
  {
    x <- x[grep("Vertebrata",x)];if(length(x)==0)x<-""
    x <- gsub("@\\d+\\|Vertebrata","",x)
  }) %>% unlist()
#Mus musculus
eggNOG.mm.v <- lapply(strsplit(eggNOG.Mmus$eggNOG_OGs,split = ","),function(x)
  {
    x <- x[grep("Vertebrata",x)];if(length(x)==0)x<-""
    x <- gsub("@\\d+\\|Vertebrata","",x)
  }) %>% unlist()
#Rattus norvegicus
eggNOG.rn.v <- lapply(strsplit(eggNOG.Rnor$eggNOG_OGs,split = ","),function(x)
  {
    x <- x[grep("Vertebrata",x)];if(length(x)==0)x<-""
    x <- gsub("@\\d+\\|Vertebrata","",x)
  }) %>% unlist()
#Danio rerio
eggNOG.dr.v <- lapply(strsplit(eggNOG.Drer$eggNOG_OGs,split = ","),function(x)
  {
    x <- x[grep("Vertebrata",x)];if(length(x)==0)x<-""
    x <- gsub("@\\d+\\|Vertebrata","",x)
  }) %>% unlist()

Here we add back the annotation to the data.

eggNOG.Hsap$eggNOG.vertebrata <- eggNOG.hs.v
eggNOG.Drer$eggNOG.vertebrata <- eggNOG.dr.v
eggNOG.Rnor$eggNOG.vertebrata <- eggNOG.rn.v
eggNOG.Mmus$eggNOG.vertebrata <- eggNOG.mm.v

17.7 Calculate Paralog Pairs ratios

Here we calculate paralog ratios inside the different dataset using the CalculateParalogRatios function. This function takes in input a dataset with protein intensity and a ParalogSet object. It will first calculate all possible paralog pairs present in the data, then it will compute for each paralog pairs, ratios intended as Log2(Paralog1) - Log2(Paralog2) = Log(Paralog1/Paralog2). Then the ratios from each condition are tested for differences with a differential expression analysis done with limma. In this way we can define if the ratio between a specific paralog pairs increase or decrease across conditions. Conditions to extract are declared in with the exprsparameter.

17.7.1 Human

We do this from the human dataset. We use eggNOGPair function to map each gene pair to an eggNOG pair. The function eggNOGPair takes in input a dataframe, and with the pair parameters we indicates the column name where the paralog pairs are present. To remove any redundancy in the dataset we also restrinct just to unique rows in all the matrix. We also remove pairs that was not possible to map to an eggNOG.

Hsapiens.Ratios <- CalculateParalogsRatios(Paralogs.hsapiens,Djuric,
                                           idcol = "Gene.Symbol",
                                           cond_col = c("iPS37","NPC37","Neu37"),
                                           exprs = "iPS37,NPC37,Neu37,NPC37-iPS37,Neu37-iPS37",
                                           use = "pair")

## Warning: Partial NA coefficients for 2770 probe(s)

## Warning: Zero sample variances detected, have been offset away from zero

#Results matrix
Hsapiens.Ratios.m <- Hsapiens.Ratios$ratios
Hsapiens.Ratios.m$species <- "hsapiens"

#Add Hsapiens reference pairs
Pairs.hsap <- strsplit(Hsapiens.Ratios.m$pairs,split = " / ")
Pairs.hsap <- do.call(rbind.data.frame,Pairs.hsap);colnames(Pairs.hsap) <-c("p.x","p.y")
Hsapiens.Ratios.m <- cbind.data.frame(Pairs.hsap,Hsapiens.Ratios.m)

#Add eggNOG annotation
Hsapiens.Ratios.m <- eggNOGPair(Hsapiens.Ratios.m,pairs = c("p.x","p.y"),
                                eggNOG = eggNOG.Hsap) %>% unique()

#Remove Paralog Pairs with NA
Hsapiens.Ratios.m <-Hsapiens.Ratios.m[-grep("NA",Hsapiens.Ratios.m$eggNOG.pair),]

17.7.2 Zebrafish

We do this from the zebrafish dataset. We use eggNOGPair function to map each gene pair to an eggNOG pair. The function eggNOGPair takes in input a dataframe, and with the pair parameters we indicates the column name where the paralog pairs are present. To remove any redundancy in the dataset we also restrinct just to unique rows in all the matrix. We also remove pairs that was not possible to map to an eggNOG.

Drerio.Ratios <- CalculateParalogsRatios(Paralogs.drerio,Drerio,
                                idcol = "genename",
                                cond_col = c("iBAQ.L","iBAQ.H"),
                                exprs = c("iBAQ.L,iBAQ.H,iBAQ.H-iBAQ.L"),
                                use = "pair")

## Warning: Partial NA coefficients for 56 probe(s)

## Warning: Zero sample variances detected, have been offset away from zero

#Results matrix
Drerio.Ratios.m <- Drerio.Ratios$ratios
Drerio.Ratios.m$species <- "drerio"

#Add Hsapiens reference pairs
Pairs.drerio <- strsplit(Drerio.Ratios.m$pairs,split = " / ")
Pairs.drerio <- do.call(rbind.data.frame,Pairs.drerio);colnames(Pairs.drerio) <-c("p.x","p.y")
Drerio.Ratios.m <- cbind.data.frame(Drerio.Ratios.m,Pairs.drerio)

#Add eggNOG annotation
Drerio.Ratios.m <- eggNOGPair(Drerio.Ratios.m,pairs = c("p.x","p.y"),
                                eggNOG = eggNOG.Drer) %>% unique()

Drerio.Ratios.m <-Drerio.Ratios.m[-grep("NA",Drerio.Ratios.m$eggNOG.pair),]

17.7.3 Mouse

We do this from the mouse dataset. We use eggNOGPair function to map each gene pair to an eggNOG pair. The function eggNOGPair takes in input a dataframe, and with the pair parameters we indicates the column name where the paralog pairs are present. To remove any redundancy in the dataset we also restrinct just to unique rows in all the matrix. We also remove pairs that was not possible to map to an eggNOG.

Mouse.Ratios <- CalculateParalogsRatios(Paralogs.mouse,Mouse,
                                        idcol = "SYMBOL",
                                        cond_col = c("DIV0","DIV3","DIV10"),
                                        exprs = c("DIV0,DIV3,DIV10,DIV3-DIV0,DIV10-DIV0"),
                                        use = "pair")

## Warning: Zero sample variances detected, have been offset away from zero

#Results matrix
Mouse.Ratios.m <- Mouse.Ratios$ratios
Mouse.Ratios.m$species <- "mmusculus"

#Add Hsapiens reference pairs
Pairs <- strsplit(Mouse.Ratios.m$pairs,split = " / ")
Pairs <- do.call(rbind.data.frame,Pairs);colnames(Pairs) <-c("p.x","p.y")
#Ratios
Mouse.Ratios.m <- cbind.data.frame(Mouse.Ratios.m,Pairs)

#Add eggNOG annotation
Mouse.Ratios.m <- eggNOGPair(Mouse.Ratios.m,pairs = c("p.x","p.y"),
                                eggNOG = eggNOG.Mmus) %>% unique()

Mouse.Ratios.m <-Mouse.Ratios.m[-grep("NA",Mouse.Ratios.m$eggNOG.pair),]

17.7.4 Rat

We do this from the mouse rat. We use eggNOGPair function to map each gene pair to an eggNOG pair. The function eggNOGPair takes in input a dataframe, and with the pair parameters we indicates the column name where the paralog pairs are present. To remove any redundancy in the dataset we also restrinct just to unique rows in all the matrix. We also remove pairs that was not possible to map to an eggNOG.

Frese.Ratios <- CalculateParalogsRatios(Paralogs.rnorvegi,Frese,
                                        idcol = "SYMBOL",
                                        cond_col = c("DIV01","DIV5","DIV14"),
                                        exprs = c("DIV01,DIV5,DIV14,DIV5-DIV01,DIV14-DIV01"),
                                        use = "pair")

## Warning: Partial NA coefficients for 24 probe(s)

## Warning: Zero sample variances detected, have been offset away from zero

#Results matrix
Frese.Ratios.m <- Frese.Ratios$ratios
Frese.Ratios.m$species <- "rnorvegicus"

#Add Hsapiens reference pairs
Pairs <- strsplit(Frese.Ratios.m$pairs,split = " / ")
Pairs <- do.call(rbind.data.frame,Pairs);colnames(Pairs) <-c("p.x","p.y")

#Ratios
Frese.Ratios.m <- cbind.data.frame(Frese.Ratios.m,Pairs)

#Add eggNOG annotation
Frese.Ratios.m <- eggNOGPair(Frese.Ratios.m,pairs = c("p.x","p.y"),
                                eggNOG = eggNOG.Rnor) %>% unique()
Frese.Ratios.m <-Frese.Ratios.m[-grep("NA",Frese.Ratios.m$eggNOG.pair),]

17.8 Shared Substitutions

Bind dataset together creating a unique dataframe for all the datasets.

#Melt for each dataset ratios values and p.values
All.Sub <- lapply(list(Hsapiens.Ratios.m,Mouse.Ratios.m,Drerio.Ratios.m,Frese.Ratios.m),
                  function(x)
                  {
                    var.names <- colnames(x)[grep("diff",colnames(x))]
                    var.names <- gsub("_diff","",var.names)
                    
                    res = data.table::melt(setDT(x), measure = patterns("diff", "p.value"), 
                    value.name = c("ratio.diff", "pvalue"),variable.factor=T,value.factor=F,
                    variable.name = "condition")
                    
                    res$condition <- var.names[res$condition]
                    return(res)
                  })

#Bind them together
All.Sub <- do.call(rbind.data.frame,All.Sub) %>% as.data.frame()
All.Sub <- All.Sub[complete.cases(All.Sub),]

#print header
head(All.Sub)

##        p.y   p.x          pairs  species eggNOG.pair   condition ratio.diff
## 1     ABAT   OAT     OAT / ABAT hsapiens 48XYA;48W15 NPC37.iPS37  -9.019708
## 2    ABCF1 ABCF2  ABCF2 / ABCF1 hsapiens 490HY;491EZ NPC37.iPS37   0.020240
## 3    ABCF2 ABCF1  ABCF1 / ABCF2 hsapiens 491EZ;490HY NPC37.iPS37  -0.020240
## 7  ABHD14A  MEST MEST / ABHD14A hsapiens 4934T;495TA NPC37.iPS37   3.332240
## 9    ACAA1 ACAT2  ACAT2 / ACAA1 hsapiens 4964F;494Q2 NPC37.iPS37  -0.098630
## 10   ACAA1 ACAA2  ACAA2 / ACAA1 hsapiens 491BS;494Q2 NPC37.iPS37  -1.045300
##          pvalue
## 1  1.676366e-06
## 2  9.618169e-01
## 3  9.618169e-01
## 7  9.624705e-02
## 9  7.527687e-01
## 10 4.329075e-01

17.8.1 Combine pvalues

In order to give to each paralog pairs comparison a unified pvalue, we use the fisher method to combine the different test done on the different datasets. We use the fishercomb function from the metaRNASeq package. First we extend the data in a matrix format.

#Spread dataset
All.Sub.Pval <- pivot_wider(All.Sub,id_cols="eggNOG.pair",names_from="condition",
                            values_from="pvalue")

## Warning: Values are not uniquely identified; output will contain list-cols.
## * Use `values_fn = list` to suppress this warning.
## * Use `values_fn = length` to identify where the duplicates arise
## * Use `values_fn = {summary_fun}` to summarise duplicates

Since the same eggNOG pair can map multiple actual protein pairs, for each pair we calculate the mean of the pvalue. We use suppressWarnings to eliminate the prompt that is coming from the presence of NA inside the dataset.

#calculate mean pvalues for each entry
pvalues <- suppressWarnings(apply(All.Sub.Pval[,-1],2,function(x)sapply(x,function(y)mean(y,na.rm=T))))
rownames(pvalues) <- All.Sub.Pval$eggNOG.pair
pvalues <- as.data.frame(pvalues)

#Combine
comb.pval <- metaRNASeq::fishercomb(as.list(pvalues))$adjpval
names(comb.pval) <- rownames(pvalues)

#convert in dataframe
comb.pval <- as.data.frame(comb.pval)
comb.pval$pairs <- rownames(comb.pval)

17.8.2 All Ratio PCA

Also for the ratios, we refer to the eggNOG mapping, and for each pair, since multiple proteins can fall in the same eggNOG we use the mean value for each pair. Similar to what we do already for the pvalues.

library(factoextra)
#Mean Ratio for each eggNOGpair
All.Sub.M.ori <- pivot_wider(All.Sub[,c(5,6,7)],names_from = c(condition),values_from =  c(ratio.diff))

## Warning: Values are not uniquely identified; output will contain list-cols.
## * Use `values_fn = list` to suppress this warning.
## * Use `values_fn = length` to identify where the duplicates arise
## * Use `values_fn = {summary_fun}` to summarise duplicates

All.Sub.M <- suppressWarnings(apply(All.Sub.M.ori[,-1],2,
                                    function(x)sapply(x,function(y)mean(y,na.rm=T))))
#add rownames
rownames(All.Sub.M) <- All.Sub.M.ori$eggNOG.pair
All.Sub.M <- All.Sub.M[complete.cases(All.Sub.M),] %>% as.data.frame()

We transpose our numeric matrix in order to have proteins as features and different conditions as samples, adding annotation regarding the species.

#Prepare data
PCA.data <- t(All.Sub.M) %>% as.data.frame()
PCA.data$Species <- c("H.sapiens","H.sapiens","M.musculus",
                      "M.musculus","D.rerio","R.norvegicus","R.norvegicus")

rownames(PCA.data) <- c("NPC/iPS","Neu/iPS","DIV3/DIV0",
                        "DIV10/DIV0","Neu/Stem","DIV5/DIV01",
                        "DIV14/DIV01")

We then run a PCA using the prcomp function. And plot the results using the factoextra function fviz_pca_ind.

#Run PCA
PCA <- prcomp(PCA.data[,-ncol(PCA.data)]) 

#Plot PCA
pca.ratio <- fviz_pca_ind(PCA,axes = c(1,2),habillage = PCA.data$Species,
                          repel=T) + 
  ggtitle("Absolute paralog ratios differences - PCA") + 
             theme(legend.position = "bottom")

We save the plot output.

pdf("../out/plots/ratios_pca.pdf",width = 3.6,height = 3.6)
pca.ratio
dev.off()

## png 
##   2

pca.ratio

17.8.3 Get Conserved Substitutions

As a first step in the case we will add the combined pvalues coming from the analysis above.

#Add Combined pvalues
All.Sub.comp <- merge(All.Sub,comb.pval,by.x="eggNOG.pair",by.y="pairs")

In the case that the same eggNOG map both the paralog present in a pair, like this case

unique(All.Sub.comp[All.Sub.comp$eggNOG.pair=="48YUB;48YUB",1:3])

##       eggNOG.pair  p.y  p.x
## 29902 48YUB;48YUB DNM2 DNM1
## 29903 48YUB;48YUB DNM1 DNM2

We just create a unique identifier for both paralogs so that we will be able to distinguish the direction of the ratio for our further analysis.

#add unique eggNOGid for paralog pairs mapping to the same eggNOG
eggNOGpair <- apply((All.Sub.comp[,1:3]),1,function(x){
  z <- unlist(strsplit(x[1],split = ";"))
  pws <- order(x[c(2,3)])
  if(sum(duplicated(z))>0){z<-paste(z,c(pws[1],pws[2]),sep=".",collapse = ";")}
  else{z <- paste(z,collapse = ";")}
  return(z)
}) %>% unlist()

All.Sub.comp$eggNOG.pair <- eggNOGpair

In this chunks of code we count in how many species, and condition paralog substitution are shared. Since ratios direction directly depend on which paralog is at the denominator or numerator (Example, if Paralog1 / Paralog2 >0 then Paralog2 / Paralog1 <0 and so on) and since in these case we are considering all possible comparison, we just select ratio differences that are ‘increasing’ (with ratio.diff>0). This will in any case take into account the single pair paralog1 / paralog2 and paralog2 / paralog1.

#calculate shared substitutions
All.Sub.comp <- All.Sub.comp %>% 
                group_by(eggNOG.pair) %>%
                mutate(shared.cond = length(unique(condition[ratio.diff>0 & !is.na(ratio.diff)]))) %>%
                mutate(shared.spec = length(unique(species[ratio.diff>0 & !is.na(ratio.diff)]))) %>% 
                as.data.frame()

After counting in how many conditions each paralog pair was increasing, we select shared paralog pairs with shared.spec>=4 and ratio.diff>=5. This will select eggNOG pairs that are identified in all organism and whose ratio is consistentin 5 out of 7 the condition considered.

shared.pairs <- All.Sub.comp %>% 
                filter(shared.spec>=4 & shared.cond>=5 & !is.na(ratio.diff)) %>%
                filter(comb.pval<=0.05)  %>% dplyr::select(eggNOG.pair) %>% 
                unlist() %>%
                as.character()

From here we found 78 of unique eggNOG pairs identified and that are shared across species.

After selecting the specific pairs, we filter the data. Calculate also the average ratio differences across condition for every eggNOG pair

#Filter
All.Sub.shared <- All.Sub.comp %>% filter(eggNOG.pair %in% shared.pairs)
All.Sub.shared$eggNOG.pair <- factor(All.Sub.shared$eggNOG.pair,unique(All.Sub.shared$eggNOG.pair))

#Sort by average value
All.Sub.shared <- All.Sub.shared %>%
                  dplyr::group_by(eggNOG.pair) %>% 
                  dplyr::mutate(avg = mean(ratio.diff,na.rm=T)) %>%
                  arrange(desc(avg)) %>% as.data.frame()

We create a dataframe called Reliable.Sub that contains the shared substitution between organism

Reliable.Sub <- All.Sub.shared

17.8.4 Annotate with GOTerms

For each one of the shared substitutions, we add the annotation referring to each gene names for the GOEnrichment analysis. We use homo sapiens as reference organism, and we map using the GOAnnotation function.

#Add Human GOAnnotation paralog1
Reliable.Sub <- spatialR::GOAnnotation(Reliable.Sub,"p.x",organism = "Hs")
#Add Human GOAnnotation paralog2
Reliable.Sub <- spatialR::GOAnnotation(Reliable.Sub,"p.y",organism = "Hs")
#Change names
colnames(Reliable.Sub)[grep("GO",colnames(Reliable.Sub))] <- c("GO.x","GO.y")

17.8.5 GOEnrichment Shared Substitution

And we finally run a GOEnrichment analysisis using the GOEnrichment function. And summarize them with SummarizeGO function from the spatialR package.

keep <- unique(unlist(strsplit(Reliable.Sub$pairs," / ")))
geneList <- unique(unlist(strsplit(All.Sub.comp$pairs," / ")))

GOEnrich.Shared <- spatialR::GOEnrichment(geneList,keep,
                                          species = "Hs",ontology = "BP")
GOEnrich.Shared.CC <- spatialR::GOEnrichment(geneList,keep,
                                             species = "Hs",ontology = "CC")
GOEnrich.Shared.MF <- spatialR::GOEnrichment(geneList,keep,
                                             species = "Hs",ontology = "MF")

GOEnrich.Shared.Summ <- spatialR::SummarizeGO(GOEnrich.Shared)

GOEnrich.Shared.Summ.CC <- spatialR::SummarizeGO(GOEnrich.Shared.CC)

GOEnrich.Shared.Summ.MF <- spatialR::SummarizeGO(GOEnrich.Shared.MF)

GOEnrich.Shared.table <- GOEnrich.Shared$table

## Simple Plot
GOEnrich.Shared.Summ.CC$plot

17.8.6 Categories

From the different GOTerms that resulted enriched, we can extract some remarkable terms. Such as transport related proteins, redox related proteins and RNA splicing related proteins. We select functions grepping on the GOterms annotation for different keyword.

LIST <- spatialR::ShowGenesGOTerms(GOEnrich = GOEnrich.Shared,keep)
LIST.CC <- spatialR::ShowGenesGOTerms(GOEnrich = GOEnrich.Shared.CC,keep)

#Convert in dataframe
Reliable.Sub <- as.data.frame(Reliable.Sub)

#Proteins related to transport, grep transport on GOFunction
transport <- grep('transport',Reliable.Sub$term)
transport <- unique(unlist(strsplit(as.character(Reliable.Sub[transport,"pairs"]),split =" / ")))

#redox
redox <- grep('redox',Reliable.Sub$term)
redox <- unique(unlist(strsplit(as.character(Reliable.Sub[redox,"pairs"]),split =" / ")))

#splicing
splicing <- grep("RNA splicing",Reliable.Sub$term)
splicing <- unique(unlist(strsplit(as.character(Reliable.Sub[splicing,"pairs"]),split =" / ")))

17.8.7 Complex Heatmap

Then we can prepare the data to be plotted in a heatmap format. First with pivot_wider we transform the long-data in a matrix-like format. Again since some eggNOG pair can actually match multiple paralog pair. For each eggNOG pair we run the mean ratio differences. We convert in matrix format. And sort them by average of each row, to order eggNOG pairs in terms of their magnitude.

#Spread as matrix
Reliable.Sub.ori <- pivot_wider(Reliable.Sub[,c(1,6,7)],
                                names_from = c(condition),
                                values_from =  c(ratio.diff))

## Warning: Values are not uniquely identified; output will contain list-cols.
## * Use `values_fn = list` to suppress this warning.
## * Use `values_fn = length` to identify where the duplicates arise
## * Use `values_fn = {summary_fun}` to summarise duplicates

#Mean for every eggNOG pair
Reliable.Sub.m <- suppressWarnings(apply(Reliable.Sub.ori[,-1],2,function(x)sapply(x,function(y)mean(y,na.rm=T))))

#Rownames as eggNOG pairs
rownames(Reliable.Sub.m) <- Reliable.Sub.ori$eggNOG.pair

M <- Reliable.Sub.m %>% as.matrix()

#Sort by rowavg
rowavg <- order(rowMeans(M,na.rm = T),decreasing = T)
M <- M[rowavg,]
M <- as.data.frame(M)

As a last step for annotation purpose on our heatmap, we map back the eggNOG pairs to the Homo sapiens paralog pairs to have a gene annotation on the heatmap. And then we add the gene name notation as rownames, in order to be plotted on the heatmap. Since each eggNOG can map multiple genes when we merge the dataset, we just select unique entry. This in order to have a gene name annotation for each eggNOGpair.

M$eggNOG.pair <- rownames(M)
#Select Homo sapiens ratios
Ann <- All.Sub.comp %>% filter(species=='hsapiens')
Ann <- unique(Ann[,c(1,4)])

#Merge back with homo sapiens gene name
M <- merge(M,Ann,by.x='eggNOG.pair',by.y='eggNOG.pair')
rownames(M) <- M$pairs
#M <- M[!duplicated(M[,2:8]),]

#Subset to obtain a numeric matrix
M <- M[,2:8]
M <- M %>% as.matrix()

Now we run the Heatmap function from the ComplexHeatmap bioconductoR package. We define cate as a vector containing the annotation from the different functions defined here. And we plot them in this heatmap

library(circlize)

col_fun = colorRamp2(c(-2,0,6), c("#ffa59e","white", "deepskyblue3"))
lgd = Legend(col_fun = col_fun, title = "Relative\nRatio Increase")

#Order Alphabetically
M <- M[order(rownames(M),decreasing = F),]

#add Category
cate  <- character(length = nrow(M))
cate [grep(paste(splicing,collapse = "|"),rownames(M))] <- "RNA Splicing"
cate [grep(paste(transport,collapse = "|"),rownames(M))] <- "Transport"
cate [grep(paste(redox,collapse = "|"),rownames(M))] <- "Redox"
cate [cate==""] <- "Others"
cate  <- factor(cate,levels = c("Transport","Redox","RNA Splicing","Others"))

#Order Matrix
Order <- c('iBAQ.H.iBAQ.L','NPC37.iPS37','Neu37.iPS37',
         'DIV3.DIV0','DIV10.DIV0','DIV5.DIV01','DIV14.DIV01')
#Transpose
Mt <- t(M)

heatmap <- Heatmap(Mt[Order,],name = "Ratio Differences", 
                  rect_gp = gpar(col = "grey", lwd = 1),
                  na_col = "grey",cluster_rows = F,
                  cluster_columns = F,
                  row_dend_reorder = TRUE,
                  col=col_fun,
                  show_column_names = FALSE,
                  column_names_side = "top",
                  column_names_rot = 45,
                  column_names_gp = gpar(fontsize = 8),
                  row_title_rot = 0,
                  column_split = cate,
                  row_split = rev(c(rep(c("H.sapiens"),2),
                                rep(c("M.musculus"),2),
                                rep(c("R.norvegicus"),2),
                                rep(c("D.rerio"),1))),
                  top_annotation = HeatmapAnnotation(
                                foo = anno_block(gp = gpar(fill = c("orange","#5fad56","#b4436c","grey"),col = NA)),
                                                 height = unit(2,"mm"),border = NA))

heatmap

17.8.8 Sub-Heatmap

We also add a sub-heatmap highlighting the pairs that are related to specific functions.

subM <- M
#subM <- subM[order(rownames(subM),decreasing = F),]

heatmap.sub <- Heatmap(t(subM)[Order,], 
                  rect_gp = gpar(col = "grey", lwd = 1),
                  na_col = "grey",cluster_rows = F,
                  cluster_columns = F,
                  row_dend_reorder = TRUE,
                  col=col_fun,column_title = NULL,
                  show_column_names = T,column_split = cate,
                  row_split = rev(c(rep(c("H.sapiens"),2),
                                rep(c("M.musculus"),2),
                                rep(c("R.norvegicus"),2),
                                rep(c("D.rerio"),1))),
                  column_names_side = "bottom",
                  column_names_rot = 45,
                  column_gap = unit(c(4,4,4), "mm"),
                  column_names_gp = gpar(fontsize = 8),
                  row_title_rot = 0,top_annotation = 
                    HeatmapAnnotation(
                                foo = anno_block(gp = gpar(fill = c("orange","#5fad56","#b4436c","grey"),col = NA)),
                                                 height = unit(2,"mm"),border = NA))

17.8.9 Save Heatmap

And then we save both the heatmaps.

fn<-paste('../out/figures/Fig3/Paralogs_Heatmap_',Sys.Date(),'.pdf',sep = '')

pdf(fn,width = 13,height = 1.7)
heatmap
dev.off()

## png 
##   2

fn<-paste('../out/figures/Fig3/Paralogs_Sub_Heatmap_',Sys.Date(),
          '.pdf',sep = '')
pdf(fn,width = 18.5,height = 2.44)
heatmap.sub
dev.off()

## png 
##   2

17.9 Show Some Substitutions

We use the function PlotParalogsRatios to plot the ratios of different paralog pairs across datasets.

17.9.1 COPII

#Make a functions out of it
P <- c("SEC23A","SEC23B")
p1.sec <- PlotParalogsRatios(P,Mouse.Ratios)
p2.sec <- PlotParalogsRatios(P,Hsapiens.Ratios)
p3.sec <- PlotParalogsRatios(P,Drerio.Ratios)
p4.sec <- PlotParalogsRatios(P,Frese.Ratios)

sec23 <- gridExtra::grid.arrange(p1.sec$final,p2.sec$final,p3.sec$final,p4.sec$final,nrow=1)

pdf("../out/figures/Fig3/Sec23ab_ratios.pdf",width = 9.8,height = 4.7)
grid.draw(sec23)
dev.off()

## png 
##   2

17.9.2 VPS26A / VPS26B

#Make a functions out of it
P <- c("VPS26B","VPS26A")

p1 <- PlotParalogsRatios(P,Mouse.Ratios)
p2 <- PlotParalogsRatios(P,Hsapiens.Ratios)
p3 <- PlotParalogsRatios(P,Drerio.Ratios)
p4 <- PlotParalogsRatios(P,Frese.Ratios)

vps <- gridExtra::grid.arrange(p1$final,p2$final,p3$final,p4$final,nrow=1)

17.9.3 RAB14 / RAB8A

#Make a functions out of it
P <- c("RAB14","RAB8A")

p1 <- PlotParalogsRatios(P,Mouse.Ratios) 
p2 <- PlotParalogsRatios(P,Hsapiens.Ratios)
p3 <- PlotParalogsRatios(P,Drerio.Ratios) 
p4 <- PlotParalogsRatios(P,Frese.Ratios)

rabs <- gridExtra::grid.arrange(p1$final,p2$final,p3$final,p4$final,nrow=1)
grid::grid.draw(rabs)

17.9.4 DYNC1LI1 / DYNC1LI2

#Make a functions out of it
P <- c("DYNC1LI1","DYNC1LI2")

p1 <- PlotParalogsRatios(P,Mouse.Ratios) 
p2 <- PlotParalogsRatios(P,Hsapiens.Ratios)
p3 <- PlotParalogsRatios(P,Drerio.Ratios)
p4 <- PlotParalogsRatios(P,Frese.Ratios)

dynein <- gridExtra::grid.arrange(p1$final,p2$final,p3$final,p4$final,nrow=1)
grid::grid.draw(dynein)

17.9.5 TARDBP / SF3B4

#Make a functions out of it
P <- c("TARDBP","SF3B4")

p1 <- PlotParalogsRatios(P,Mouse.Ratios) 
p2 <- PlotParalogsRatios(P,Hsapiens.Ratios)
p3 <- PlotParalogsRatios(P,Drerio.Ratios)
p4 <- PlotParalogsRatios(P,Frese.Ratios)

tardbp <- gridExtra::grid.arrange(p1$final,p2$final,p3$final,p4$final,nrow=1)
grid::grid.draw(tardbp)

17.10 Save workspace

We save now the complete workspace of the analysis.

save.image("../Markdown/workspaces/09-SharedSubstitution_CompleteWorkspace.RData")

17.11 Fig2

We assemble the Figures, using the cowplot library.

load("../Markdown/workspaces/05-MSProteomicsData_ParalogsCompleteWorkspace.RData")
load("../Markdown/workspaces/08-ConservedSubstitution_CompleteWorkSpace.RData")
load("../Markdown/workspaces/09-SharedSubstitution_CompleteWorkspace.RData")

Starting from the figure2.

#Modify facetting
p09Cons.long <- p09Cons + facet_wrap(~species,nrow=1,scales = "free")

#Arrange plots
top.block <- plot_grid(p01MSData,p09Cons,ncol=2,labels = c("A","B"))
top.block <- plot_grid(p02MSData,p01MSData,ncol = 2,labels = c('A','B'))

## Warning: Removed 1703 rows containing non-finite values (stat_boxplot).

btt.block <- plot_grid(p09Cons.long,labels='C')

#Align
plots <- align_plots(p01MSData,p09Cons.long,align = 'hv',axis = 'lr')

#Fig2
Fig2 <- plot_grid(top.block,plots[[2]],nrow=2,rel_heights = c(1,0.55),
                  labels = c('','C'))

pdf(paste("../out/figures/Fig2/Fig2Draft_",Sys.Date(),".pdf",sep = ''),
    width = 10,height = 5.4)
Fig2
dev.off()

## png 
##   2

Fig2

17.12 Fig3

And Figure 3

heatmap = grid.grabExpr(ComplexHeatmap::draw(heatmap))
heatmap.sub = grid.grabExpr(ComplexHeatmap::draw(heatmap.sub))

Fig3 <- plot_grid(heatmap,heatmap.sub,sec23,nrow = 3,rel_heights = c(0.5,0.8,0.8),
                  labels = c("A","","C"))

pdf(paste("../out/figures/Fig3/","Fig3_",Sys.Date(),".pdf",sep = ""),width = 11.8,height = 7.6)
Fig3
dev.off()

## png 
##   2

Fig3

17.13 Supp Fig 5

fname <- paste('../out/figures/FigSupp5/SuppFig5_Updated_',Sys.Date(),'.pdf',
               sep = '')
pdf(fname,width = 7.10,height = 15)
FigSupp5 <- plot_grid(vps,dynein,rabs,nrow = 4,labels = c('A','B','C'))
FigSupp5
dev.off()

## png 
##   2

17.13.1 Fig Supp 8,9,10, 11

FigSupp8 <- pca.ratio
FigSupp9 <- vps
FigSupp10 <- rabs
FigSupp11 <- dynein

pdf(paste("../out/figures/FigSupp8/","FigSupp8_",Sys.Date(),".pdf",sep = ""),width = 6.51,height = 5.3)
grid.draw(FigSupp8)
dev.off()

## png 
##   2

pdf(paste("../out/figures/FigSupp9/","FigSupp9",Sys.Date(),".pdf",sep = ""),width = 9.8,height = 4.7)
grid.draw(FigSupp9)
dev.off()

## png 
##   2

pdf(paste("../out/figures/FigSupp10/","FigSupp10",Sys.Date(),".pdf",sep = ""),width = 9.8,height = 4.7)
grid.draw(FigSupp10)
dev.off()

## png 
##   2

pdf(paste("../out/figures/FigSupp11/","FigSupp11",Sys.Date(),".pdf",sep = ""),width = 9.8,height = 4.7)
grid.draw(FigSupp11)
dev.off()

## png 
##   2

17.14 Write Tables

17.14.1 Table S6

l <- list('Subunits Co-Expression' = AllSubunits)
#Write table
openxlsx::write.xlsx(l,'../out/tables/Table_S6.xlsx')

17.14.2 Table S7

l <- list('AllParalogs'=All.Sub.comp,
          'Shared Paralogs' = Reliable.Sub)
#Write table
openxlsx::write.xlsx(l,'../out/tables/Table_S7.xlsx')