Section 14 MS Proteomics Data Paralog Analysis

The huge chunck of code below, carries on the analysis on Paralogs Genes accross all the MS dataset that we have. The dataset information are read from a DataInfo files that contains important information on column names and where to get all the information in each file. Then it iterates throught each files carrying on this analysis in order.

Annotate each dataset with Paralogs definition
Annotate protein complexes
Calculate the proportion of differentially expressed proteins that have paralogs and those that have not
Plot for every dataset a volcano plot and a density plot with paralogs and not paralogs.
Calculates paralogs pairs that are regulated in opposite ways.
Carries on a GOEnrichment of those Opposite regulated paralogs.
Returns plots and files inside the Out/AnalyzeData/ folder

Clik here to show code

setwd("../ComplexScript/")

source("complexes_function.R")
library(fdrtool)
library(ggpubr)
library(rstatix)
library(dplyr)
library(paralogstools)
library(proteomicstools)
library(spatialR)
library(ComplexHeatmap)
library(MASS)
library(ggplot2)
library(topGO)
library(gridExtra)
library(biomaRt)

#Load data info for each dataset < ----
DataInfo <- read.table("../Data/Dataset/DataInfo.txt",sep = "\t",header = T,stringsAsFactors = F)

#FCthr and fdrTHR
FCthr <- 0.58
fdrTHR <- 0.05

#Paralogs.Expressions and GOterms list
PARALOGS <- vector(mode = "list",length = nrow(DataInfo))
DATASETS <- vector(mode = "list",length = nrow(DataInfo))
GOTABLES <- vector(mode = "list",length = nrow(DataInfo))
GOTABLES.Summ <- vector(mode = "list",length = nrow(DataInfo))
GOENRICH <- vector(mode = "list",length = nrow(DataInfo))

#Paralogs Palette
ParalogsColor <- c("#e6d5b8","#0278ae")

#Run Plots for all the datasets.
for (N in c(1:nrow(DataInfo)))
{
  #GetData information form DataInfo File
  filename <- DataInfo[N,"filename"]
  Id.col <- DataInfo[N,"Id.col"]
  fold.change <- DataInfo[N,"fold.change"]
  fdr.col <- DataInfo[N,"fdr.col"]
  condition.col <- DataInfo[N,"condition.col"]
  organism <- DataInfo[N,"organism"]
  out.label <- DataInfo[N,"out.label"]
  sep <- DataInfo[N,"sep"];if(sep=="\\t"){sep<-"\t"}
  complex.name <- DataInfo[N,"complex.name"]
  ID.type <- DataInfo[N,"ID.type"]
  OutName <- DataInfo[N,"out.label"]
  PlotDir <- paste("Plots/",strsplit(OutName,"/")[[1]][2],sep = "")
  ExpName <- strsplit(OutName,"/")[[1]][2]
  ParalogsFileName <- paste("../Data/Paralogs/",organism,"_",ID.type,"_paralogs_v102.txt",sep = "")
  species <- DataInfo[N,"species"]
  
  #Load Paralogs, get the Already Cleaned Paralogs from File with filename=""
  Paralogs <- ParalogsSet(organism = organism,identifier = ID.type,filename = ParalogsFileName)

  #Load Protein Complexes
  Complexes <- ComplexSet(organism = organism,identifier = ID.type)

  #Differential Expressions Analysis ####
  Data <- read.delim(filename,sep = sep,header = T,stringsAsFactors = F)
  Data <- SplitDatasetByID(Data,Id.col,sep = ";")
  Condition <- unique(Data[,condition.col])

  #Differentially expressed proteins Proportions
  DiffProteins <- nrow(Data[(Data[,fold.change] > FCthr | Data[,fold.change] < -FCthr) & Data[,fdr.col]<fdrTHR,])
  Percentage <- DiffProteins/nrow(Data)

  ##Get Paralogs ####
  has.Paralogs <- hasParalogs(Paralogs,Data[,Id.col])
  Data$has.Paralogs <- has.Paralogs
  Data$species <- organism
  Data$cond <- Condition
  
  #Add to List
  DATASETS[[N]] <- Data[,c(Id.col,fold.change,fdr.col,condition.col,"has.Paralogs","cond","species")]
  
  #DiffExpressed <- referring to protein groups
  Diff.Expressed <- Data[(Data[,fold.change] > FCthr | Data[,fold.change] < -FCthr),]
  Diff.Expressed <- Diff.Expressed[!is.na(Diff.Expressed[,fold.change]),]

  Proportions <- t(rbind(table(Diff.Expressed$has.Paralogs),table(Data$has.Paralogs)))
  rownames(Proportions) <- c("NoParalogs","HasParalogs"); colnames(Proportions) <- c("DiffExpr","Others")
  Proportions[,2] <- Proportions[,2] - Proportions[,1] ; Proportions <- t(Proportions)

  #Fisher
  FISHER <- fisher.test(Proportions)
  WILCOX <- wilcox.test(Data[Data$has.Paralogs==T,fold.change],Data[Data$has.Paralogs==F,fold.change])

  #Substitute Condition
  Condition <- gsub("\\/","_",Condition)

  #Volcano Plots + Density Plots
  pdf(paste(PlotDir,"_VolcanoDensity",Condition,".pdf",sep = ""),height = 3.92, width = 5.15)

  #Volcano
  p1 <- ggplot() + geom_point(data = Data[Data$has.Paralogs==T,],aes(x=eval(parse(text = fold.change)),y=log10(eval(parse(text = fdr.col)))*(-1),color=has.Paralogs),size=0.7) +  
    theme_minimal() +
    ylab("log10(pval)")  +
    geom_point(data = Data[Data$has.Paralogs==F,],aes(x=eval(parse(text = fold.change)),y=log10(eval(parse(text = fdr.col)))*(-1),color=has.Paralogs),size=0.7) + 
    scale_color_manual(values = c("#e6d5b8","#0278ae")) + 
    theme_bw() + ggtitle("Paralogs Diff.Expression")
    

  #Density Plot
  p2 <- ggplot(data=Data) + geom_density(aes(x=eval(parse(text = fold.change)),fill=has.Paralogs,color=has.Paralogs),alpha=0.5) + theme_minimal() +
    ggtitle(ExpName,subtitle = paste(Condition," Wilcox.pvalue:",format(WILCOX$p.value,digits = 3))) + xlab(fold.change)

  print(p1)
  print(p2)
  
  dev.off()
  
  print(p1)
  print(p2)
  
  #Paralogs Regulated in Opposite Way
  SUBUNITS <- subunits(Complexes)
  Paralogs.Expressions <- getParalogs(Paralogs,Diff.Expressed[,Id.col])
  Paralogs.Expressions <- Paralogs.Expressions[Paralogs.Expressions %in% Diff.Expressed[,Id.col]]
  Paralogs.Expressions <- data.frame(p.x = Paralogs.Expressions,p.y=names(Paralogs.Expressions))
  
  x.exp <- sapply(Paralogs.Expressions$p.x,function(x){(Diff.Expressed[Diff.Expressed[,Id.col]==x,fold.change])})
  y.exp <- sapply(Paralogs.Expressions$p.y,function(x){(Diff.Expressed[Diff.Expressed[,Id.col]==x,fold.change])})
  
  #Create a dataframe
  Paralogs.Expressions <- force.cbind(x.exp,y.exp)
  colnames(Paralogs.Expressions) <- c("p.x","p.y","x.exp","y.exp")
  
  Paralogs.Expressions$dens = get_density(Paralogs.Expressions$x.exp,Paralogs.Expressions$y.exp,n = 100)
  Paralogs.Expressions$dir <- apply(Paralogs.Expressions[,c(3,4)],1,function(x){if(x[1]*x[2]>0){return("same")}else{return("opposite")}})
  Paralogs.Expressions$subunits <- apply(Paralogs.Expressions[,c(1,2)],1,function(x){if(sum(x %in% SUBUNITS)==2){return("subunits")}else{return("others")}})
  Paralogs.Expressions$organism <- organism
  Paralogs.Expressions$condition <- Condition
  Paralogs.Expressions$species <- species
  Paralogs.Expressions <- unique(Paralogs.Expressions)
  Paralogs.Expressions <- Paralogs.Expressions %>% filter(p.x != p.y)
  
  #Add to list
  PARALOGS[[N]] <- Paralogs.Expressions
  
  Rs <- cor(Paralogs.Expressions$x.exp,Paralogs.Expressions$y.exp)

  #GGplot
  pdf(paste(PlotDir,"_ParalogsFoldChanges",Condition,".pdf",sep = ""),height = 3.45, width = 3.40)
  p3 <- ggplot(data = Paralogs.Expressions) + geom_point(aes(x=x.exp,y=y.exp),size=0.8) + theme_minimal() +
        xlab(paste("Paralog1",fold.change)) + ylab(paste("Paralog2",fold.change)) +
        ggtitle(paste(ExpName," | ",Condition,sep = ""),subtitle=paste("R: ",format(round(Rs,3)))) + theme(legend.position = "none")

  print(p3)
  dev.off()
  
  print(p3)
  
  #Percentage Of Paralogs in Opposite Directions
  Paralogs.Proportions <- table(Paralogs.Expressions$dir)
  
  #GOEnrichment Analysis on Paralogs ####
  AllParalogs <- unique(as.character(Paralogs.Expressions$p.x,Paralogs.Expressions$p.y))
  Opposite <- unique(as.character(unlist(Paralogs.Expressions[Paralogs.Expressions$dir=="opposite",c("p.x","p.y")])))
  Same <- unique(as.character(unlist(Paralogs.Expressions[Paralogs.Expressions$dir=="same",c("p.x","p.y")])))
  
  #if the organism is drerio put everything to lowercase
  if(species=="Dr")
  {
    geneList <- tolower(AllParalogs)
    interesting.x <- tolower(Opposite)
    interesting.y <- tolower(Same)
  }else if(species=="Hs")
  {
    geneList <- toupper(AllParalogs)
    interesting.x <- toupper(Opposite)
    interesting.y <- toupper(Same)
  }else{
    geneList <- stringr::str_to_title(AllParalogs)
    interesting.x <- stringr::str_to_title(Opposite)
    interesting.y <- stringr::str_to_title(Same)
  }
  
  #Annotation with spatialR GOEnrichment 
  EnrichmentGO.Opposite <- spatialR::GOEnrichment(geneList = (geneList),interesting = (interesting.x),species = species,ontology = "BP")
  EnrichmentGO.Same <- spatialR::GOEnrichment(geneList = geneList,interesting =interesting.y,species = species,ontology = "BP")
  
  #Add info column
  EnrichmentGO.Opposite$table$organism <- organism
  EnrichmentGO.Opposite$table$condition <- Condition
  
  #Summarized
  GOSumm <- SummarizeGO(EnrichmentGO.Opposite,plot = "scatter",)
  
  GOTABLES[[N]] <- EnrichmentGO.Opposite$table
  GOTABLES.Summ[[N]] <- GOSumm$reducedTerms
  GOENRICH[[N]] <- EnrichmentGO.Opposite
  
  #PlotGOEnrichment
  pdf(paste(PlotDir,"_ParalogsGOEnrichment_",Condition,".pdf",sep = ""),height = 7, width = 7)
  
  p2 <- ggplot(data=EnrichmentGO.Same$table[1:20,]) + geom_bar(aes(x=Term,y=log10(classic)*(-1)),stat="identity",alpha=0.3,color="black") +
        theme(axis.text.x = element_text(angle = 60,hjust = 1)) + coord_flip() + theme_minimal()+
        ggtitle("Paralogs Same dir.",subtitle = "Top20 shown, Same Paralogs vs All paralogs")+
        geom_hline(yintercept = -log10(0.05),lwd=0.5,color="red",lty=2) +
        ylab("-log10(p.value)")
  
  p3 <- ggplot(data=unique(EnrichmentGO.Opposite$table[1:20,])) + geom_bar(aes(x=Term,y=log10(classic)*(-1)),stat="identity",alpha=0.3,color="black") + 
        theme(axis.text.x = element_text(angle = 60,hjust = 1)) + coord_flip() + theme_minimal()+
        ggtitle("Paralogs Same dir.",subtitle = "Top20 shown, Opp. Paralogs vs All paralogs") + 
        geom_hline(yintercept = -log10(0.05),lwd=0.5,color="red",lty=2) +
        ylab("-log10(p.value)")
  
  print(p1)
  print(p2)
  print(p3)
  
  dev.off()
  
  print(p1)
  print(p2)
  print(p3)
  
  #Write Table for Analize Data
  write.table(Paralogs.Expressions,paste("Out/AnalizeData/",organism,Condition,"ParalogsExpression.txt",sep = ""),sep="\t",row.names = F)
  write.table(EnrichmentGO.Opposite$Table,paste("Out/AnalizeData/GOEnrichment/",organism,Condition,"_BP_","OppositeParalogs.txt",sep = ""),sep = "\t",row.names = F)
  write.table(EnrichmentGO.Same$Table,paste("Out/AnalizeData/GOEnrichment/",organism,Condition,"_BP_","SameParalogs.txt",sep = ""),sep = "\t",row.names = F)
}

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: Removed 427 rows containing missing values (geom_point).

## Warning: Removed 135 rows containing missing values (geom_point).

## Warning: Removed 562 rows containing non-finite values (stat_density).

## Warning: Removed 427 rows containing missing values (geom_point).

## Warning: Removed 135 rows containing missing values (geom_point).

## Warning: Removed 562 rows containing non-finite values (stat_density).

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning: Removed 427 rows containing missing values (geom_point).

## Warning: Removed 135 rows containing missing values (geom_point).

## Warning: Removed 427 rows containing missing values (geom_point).

## Warning: Removed 135 rows containing missing values (geom_point).

## Warning: Removed 456 rows containing missing values (geom_point).

## Warning: Removed 122 rows containing missing values (geom_point).

## Warning: Removed 578 rows containing non-finite values (stat_density).

## Warning: Removed 456 rows containing missing values (geom_point).

## Warning: Removed 122 rows containing missing values (geom_point).

## Warning: Removed 578 rows containing non-finite values (stat_density).

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: Removed 456 rows containing missing values (geom_point).

## Warning: Removed 122 rows containing missing values (geom_point).

## Warning: Removed 456 rows containing missing values (geom_point).

## Warning: Removed 122 rows containing missing values (geom_point).

## Warning: Removed 416 rows containing missing values (geom_point).

## Warning: Removed 147 rows containing missing values (geom_point).

## Warning: Removed 563 rows containing non-finite values (stat_density).

## Warning: Removed 416 rows containing missing values (geom_point).

## Warning: Removed 147 rows containing missing values (geom_point).

## Warning: Removed 563 rows containing non-finite values (stat_density).

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning: Removed 416 rows containing missing values (geom_point).

## Warning: Removed 147 rows containing missing values (geom_point).

## Warning: Removed 416 rows containing missing values (geom_point).

## Warning: Removed 147 rows containing missing values (geom_point).

## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

14.1 save workspace

We save the information coming out from this analysis in a workspace.

save.image("../Markdown/workspaces/05-MSProteomicsData_Paralogs01.RData")

14.2 Add Information to Datasets

Add Species information to paralogs datasets. The PARALOGS datasets contains the paralog pairs in all the MS dataset considered. We cycle through each file and we add a column called species in order to annotate each dataset. We then

PARALOGS <- lapply(seq_along(PARALOGS),function(x){PARALOGS[[x]]$species<-DataInfo[x,"species"];return(PARALOGS[[x]])})
PARALOGS <- lapply(PARALOGS,function(x)unique(x))

We also declare again the color code to use for plotting paralog genes.

ParalogsColor <- c("#e6d5b8","#0278ae")

14.3 Paralogs more diff.expressed

Now we need to carry on some data manipulation. From each dataset, we update the column name to be consistent across all datasets. We bind them together, and calculate the absolute fold change values for each proteins.

DATASETS <- lapply(DATASETS, function(x){colnames(x)<-c("id","fold.change","fdr","cond","has.Paralogs","comparison","species");return(x)})
Dataset.combined <- do.call(rbind.data.frame,DATASETS)
Dataset.combined$abs.fold.change <- abs(Dataset.combined$fold.change)

After we use the rstatix package to run a wilcoxon test between genes that have paralogs and genes that do not have paralogs.

#Add test information
Datasets.test <- Dataset.combined %>% group_by(cond,species) %>% 
                 wilcox_test(abs.fold.change ~ has.Paralogs) %>%
                 add_significance("p") %>% add_xy_position(x = "cond") %>%
                 group_by(species) %>% mutate(x.pos = 1:n()) %>% ungroup()

And we assemble now all the data in a boxplot

pdf("../out/plots/p02MSData.pdf",height = 5,width = 5)
p02MSData <- ggplot(data=Dataset.combined) + 
             geom_boxplot(aes(x=cond,y=abs(fold.change),fill=has.Paralogs),
                          alpha=0.4) + 
             scale_fill_manual(values = ParalogsColor) + 
             stat_pvalue_manual(Datasets.test,x = 'x.pos',
                                tip.length = 0.5,position = position_dodge(0.8),
                                coord.flip = T) + 
             facet_wrap(.~species,nrow = 2,scales = "free") + coord_flip() + 
              theme_bw() + 
             xlab("") + ylab("absolute(Log2(Fold Change))")

p02MSData
dev.off()

## png 
##   2

p02MSData

14.4 Proportion of Diff. Expressed Paralogs

Here we count paralog pairs that are regulated in a concerted or opposite manner regarding their fold change. For this reason we bind the PARALOGS data together (this dataset contains the direct comparison of paralog pairs during neuronal differentiation in all the dataset). We defining only unique entries, avoiding duplicated instances that may arise. After using the variable dir that store if a specific paralg pairs is regulated in the same way, we count the entries. We need to divide by 2 to take into account just for unique pairs. Since both (Paralog1 , Paralog2 and Paralog2, Paralog1) are now present in this specific datasets.

All.Paralogs.Dataset <- do.call(rbind.data.frame,PARALOGS)
All.Paralogs.Dataset <- unique(All.Paralogs.Dataset)

#We need to divide by 2 because we need to consider unique pairs
All.Paralogs.Dataset.count <- All.Paralogs.Dataset %>%
                              group_by(dir,condition,organism) %>%
                              summarise(n=n()/2)

All.Paralogs.Dataset.count <- as.data.frame(All.Paralogs.Dataset.count)

We assemble this counting in a barplot format, facetting for the different organisms.

pdf("../out/plots/p01MSData.pdf",height = 5,width = 5)
p01MSData <- ggplot(data = All.Paralogs.Dataset.count) + 
             geom_bar(aes(y=n,x=condition,fill=dir),
                      stat = "identity",color="black") + coord_flip() + 
             facet_wrap(.~organism,scales = "free") + 
             theme_bw() + 
             scale_fill_brewer(palette = "BrBG",
                               name = "Paralogs protein Fold Change",
                               labels = c("Opposite sign", "Same sign")) +
             ylab("# of unique diff. expressed paralog pairs") + xlab("") +
             theme(legend.position = "top")

p01MSData
dev.off()

## png 
##   2

p01MSData

14.5 Coregulation of Paralog Pairs

We subset the paralog pairs and create the function GetQuadrant for getting the quadrant for each observation.

All.Paralogs.Dataset <- All.Paralogs.Dataset %>% filter(p.x!=p.y)
All.Paralogs.Dataset <- All.Paralogs.Dataset %>% group_by(species,condition) %>% mutate(corr=cor(x.exp,y.exp))

GetQuadrant <- function(x,y)
{
  v <- character(length = length(x))
  v[x<0 & y>0] <- "q1"
  v[x>0 & y>0] <- "q2"
  v[x>0 & y<0] <- "q3"
  v[x<0 & y<0] <- "q4"
  return(v)
}

Prepare dataset informations to be shown on plot, for example here we arrange the pvalues for the correlations, the number of proteins present in each quadrants and the coordinate where to plot each label. First we group the dataset by organism and condition. Then for each of these, we retrieve the location of every protein (in which quadrant of the plot) they reside, using the GetQuadrant function, we then group also by the quadrant and calculate with quadrant.count the number of entries. We then run a correlation test on each group storing information from the correlation score and the pvalues. We add further information of formatting the pvalues, and position that will be useful for plotting purposes.

All.Paralogs.Dataset.corr.quadrant <- All.Paralogs.Dataset %>% 
                                      group_by(organism,condition) %>% 
                                   mutate(quadrant = GetQuadrant(x.exp,y.exp)) %>%
                                   group_by(organism,condition,quadrant) %>% 
                                   mutate(quadrant.count= n()) %>%
                                   group_by(organism,condition) %>%
                                   summarize(corr=cor.test(x.exp,y.exp)$estimate,
                                             p.value = cor.test(x.exp,y.exp)$p.value,quadrant,quadrant.count,x.exp,y.exp) %>%
                                   as.data.frame() %>% 
                                   mutate(signif= ifelse(p.value< 2.2e-16,"< 2.2e-16",paste("=",formatC(p.value,format = "e",digits = 2)))) %>% 
                                   group_by(organism,condition,quadrant,add = T) %>%
                                   mutate(x.pos = mean(c(min(x.exp),max(x.exp))),
                                          y.pos = mean(c(min(y.exp),max(y.exp)))) %>%
                                  dplyr::select(organism,condition,corr,p.value,quadrant.count,signif,x.pos,y.pos) %>%
                                  unique() %>% as.data.frame()

## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.

#Only unique paralogs in each quadrant
All.Paralogs.Dataset.corr.quadrant$quadrant.count <- ifelse(All.Paralogs.Dataset.corr.quadrant$quadrant %in% c("q2","q4"),All.Paralogs.Dataset.corr.quadrant$quadrant.count/2,
                                                            All.Paralogs.Dataset.corr.quadrant$quadrant.count)
All.Paralogs.Dataset.corr <- unique(All.Paralogs.Dataset.corr.quadrant[,c(2:5,7)])

We can plot the totality of the dataset, that represents the paralog co-regulation across all species and condition, plus informations on the number of unique pairs present in each quadrant and correlations score.

pdf("../out/plots/p01MSData.pdf",height = 5,width = 5)

p03MSData <- ggplot(data=All.Paralogs.Dataset) +
             geom_point(aes(x.exp,y.exp,color=dir),size=0.7) + 
             scale_color_manual(values=c("#DFC27D","lightgrey")) + 
             facet_wrap(organism~condition,scales = "free") + theme_bw() +
             geom_smooth(aes(x.exp,y.exp),color="orange",method = "lm") + 
             geom_text(data=All.Paralogs.Dataset.corr,x=-Inf,y=Inf,
                       aes(label=paste("R:",round(corr,digits = 2))),hjust=-0.1,vjust=1.5,color="red",size=3.5) +
             geom_text(data=All.Paralogs.Dataset.corr,x=+Inf,y=-Inf,
                       aes(label=paste("p.value",signif)),hjust=1.15,vjust=-1,color="black",size=3) + 
             geom_label(data=All.Paralogs.Dataset.corr.quadrant %>% filter(quadrant!="q3"),
                        aes(label=quadrant.count,x=x.pos,y=y.pos),size=3.5) + 
             xlab("Log2 Fold Change (Paralog1)") + ylab("Log2 Fold Change(Paralog2)") + 
             theme(legend.position = "none")

p03MSData
dev.off()

## png 
##   2

p03MSData

14.6 GO Enrichment Diff.Expressed Paralogs

14.7 Common Trace

Even thou in each specific dataset the GOEnrichment was highlighting specific terms was possible to extract some common GOTerm that were shared across organisms. We grep on GOterms name to highlight specific GOTerm and retrieve genes that are associated with specific

14.7.1 Transport and vesicles

We select factors with some key words and we show them in a classic barplot colored by dataset.

keywords <- paste(c("vesicle","synapse","transport"),collapse = "|")

#transport GOTERM
TERMS.transport <- lapply(GOTABLES,function(x)x[grep(keywords,x$Term),])
TERMS.transport <- do.call(rbind.data.frame,TERMS.transport)

#Sort Them
TERMS.transport <- TERMS.transport[order(TERMS.transport$p.adjust),]
TERMS.transport <- TERMS.transport %>% group_by(organism) %>% 
                   mutate(pos = 1:n()) %>% filter(pos <= 3) %>% as.data.frame()

TERMS.transport$condition <-gsub("\\.","/",TERMS.transport$condition)
TERMS.transport$condition <- gsub("0","0/",TERMS.transport$condition)              

#BARPLOT
p05MSData <- ggplot(data=TERMS.transport) + 
             geom_bar(aes(y=Term,x=-log10(p.adjust)),color=NA,stat = "identity",
                                                     fill='#B4CDE2') + 
            facet_wrap(.~organism,ncol=1,scales = "free_y") + theme_bw() + 
            geom_text(aes(y=Term,x=-log10(p.adjust)/2,label=condition)) + 
            geom_text(aes(y=Term,x=-log10(p.adjust)+0.25,label=Significant),color="red")

pdf("../out/plots/p05MSData.pdf",width = 7,height = 8)
p05MSData
dev.off()

## png 
##   2

p05MSData

14.7.2 DNA binding

We select factors with some key words and we show them in a classic barplot colored by dataset.

keywords <- paste(c("DNA","chromatin"),collapse = "|")

#dna GOTERM
TERMS.dna <- lapply(GOTABLES,function(x)x[grep(keywords,x$Term),])
TERMS.dna <- do.call(rbind.data.frame,TERMS.dna)

#Sort Them
TERMS.dna <- TERMS.dna[order(TERMS.dna$p.adjust),]
TERMS.dna <- TERMS.dna %>% group_by(organism) %>% 
                   mutate(pos = 1:n()) %>% filter(pos <= 3) %>% as.data.frame()

TERMS.dna$condition <-gsub("\\.","/",TERMS.dna$condition)
TERMS.dna$condition <- gsub("3","3/",TERMS.dna$condition)              

#BARPLOT
pdf("../out/plots/p06MSData.pdf",width = 7,height = 8)

p06MSData <- ggplot(data=TERMS.dna) + 
  geom_bar(aes(y=Term,x=-log10(p.adjust)),fill='#F9B4AF',
           color=NA,stat = "identity") + 
              facet_wrap(.~organism,ncol=1,scales = "free_y") + theme_bw() + 
              scale_fill_brewer(palette = "Pastel1") + 
              geom_text(aes(y=Term,x=-log10(p.adjust)/2,label=condition)) + 
              geom_text(aes(y=Term,x=-log10(p.adjust)+0.25,label=Significant),color="red")

p06MSData
dev.off()

## png 
##   2

p06MSData

14.8 GO pie plot

In this sections we represent the different composition in enrichment as a pie plot first we load all the genes related to the terms enriched in the data. We also need to change the character to lower or to title for some specific organism to match the GOannotation.

Opposite_p <-lapply(PARALOGS,function(x)unlist(x[x$dir=='opposite',c('p.x','p.y')]))
Opposite_p <-lapply(Opposite_p,unique)
names(Opposite_p) <- paste(DataInfo$condition,DataInfo$species,sep = '_')
#Change strtotitle for gene name only for some species
to_change <- Opposite_p[grep('Mm|Rn',names(Opposite_p))]
#Substitute in List
Opposite_p[grep('Mm|Rn',names(Opposite_p))] <- lapply(to_change,
                                                      stringr::str_to_title)
#For zebrafish genenames are alltolower
to_change <- Opposite_p[grep('Dr',names(Opposite_p))]
#Substitute in List
Opposite_p[grep('Dr',names(Opposite_p))] <- lapply(to_change,tolower)

We then extract the GOAnnotation for the enrichmed GOTerm and create a dataframe called LIST_l that contains all the information

#Extract gene in each go term
LIST_l <- lapply(seq_along(Opposite_p),function(x)
  {
    df <- unlist(spatialR::ShowGenesGOTerms(GOENRICH[[x]],Opposite_p[[x]]))
    df <- df %>% as.data.frame(row.names = names(df))
    df$GO <- gsub('\\d+$','',rownames(df))
    df$cond <- DataInfo$condition[x];df$species <- DataInfo$species[x]
    colnames(df) <- c('gene','GO','cond','species')
    return(df)
});names(LIST_l) <- paste(DataInfo$condition,DataInfo$species,sep = '_')
LIST_l <- do.call(rbind.data.frame,LIST_l)

We grep on keyword related to DNA and chromatin.

keywords_d <- paste(c("DNA","chromatin"),collapse = "|")
keywords_t <- paste(c("vesicle","synapse","transport"),collapse = "|")

transport_g <- LIST_l[grep(keywords_t,LIST_l$GO),'gene']
dna_g <- LIST_l[grep(keywords_d,LIST_l$GO),'gene']

LIST_l$subset <- 'others'
LIST_l$subset[LIST_l$gene %in% transport_g] <- 'Transport,synapse,vesicles'
LIST_l$subset[LIST_l$gene %in% dna_g] <- 'DNA and chromatin'
LIST_l <- unique(LIST_l[,c('gene','cond','species','subset')])

We change the order of some factors

#Summarize data
Order_l <- c('Transport,synapse,vesicles','DNA and chromatin','others')

LIST_l_s <- LIST_l %>% group_by(subset,cond,species) %>% mutate(count=n()) %>%
            dplyr::select(cond,species,subset,count) %>% as.data.frame() %>%
            unique() %>% 
            arrange(factor(subset,levels = rev(Order_l))) %>%
            group_by(cond,species) %>% 
            mutate(perc = count/sum(count),
                   ypos = cumsum(perc)-(perc/2)) %>%
            mutate(lab = paste(format(perc*100,digits=1),'%',sep = ''))

#Order factors
Order <- c('NPC.iPS','Neu.IPS','Neu.NPC','DIV3.DIV0','DIV10.DIV0','DIV10.DIV3','DIV5.DIV1',
           'DIV14.DIV1','Neur.Stem')

LIST_l_s$cond <- factor(LIST_l_s$cond,levels = Order)
LIST_l_s$subset <- factor(LIST_l_s$subset,levels = Order_l)

We plot the pie plot.

cp <- coord_polar(theta = "y",start = 0,direction = 1)
cp$is_free <- function() TRUE

pdf(paste('../out/figures/FigSupp4/GOPie_',Sys.Date(),'.pdf',sep = ''),
    width = 6.75,height = 4.41)

go_pie <- ggplot(LIST_l_s, aes(x="", y=perc, fill=subset)) +
          geom_bar(stat="identity", width=1) +
          cp + theme_void() + 
          facet_wrap(.~cond,scales = 'free',nrow = 1) + 
          geom_text(aes(y=ypos,label=lab),size=3) + 
          theme(aspect.ratio = 1,legend.position = 'top') + xlab('') + ylab('') + 
          scale_fill_manual(values = c('#B4CDE2','#F9B4AF','#DDDDDD'))
go_pie
dev.off()

## png 
##   2

go_pie

14.9 FigSupp5

pdf(paste("../out/figures/FigSupp5/SuppFig5_",Sys.Date(),".pdf",sep = ''),
    width = 9,height = 6)
Supp.Fig5 <- p02MSData
p02MSData

## Warning: Removed 1703 rows containing non-finite values (stat_boxplot).

dev.off()

## png 
##   2

Supp.Fig5

## Warning: Removed 1703 rows containing non-finite values (stat_boxplot).

14.10 FigSupp6

pdf(paste("../out/figures/FigSupp6/ParalogScatter_",Sys.Date(),".pdf",sep = ''),
    width = 7.5,height = 6.5)
Supp.Fig6 <- p03MSData
Supp.Fig6
dev.off()

## png 
##   2

Supp.Fig6

14.11 SuppFig6

pdf(paste("../out/figures/FigSupp6/GObarplot_",Sys.Date(),".pdf",sep = ''),
    width = 12,height = 5.26)
GObarplot <- plot_grid(p05MSData,p06MSData,ncol = 2)
GObarplot
dev.off()

## png 
##   2

GObarplot

save.image("../Markdown/workspaces/05-MSProteomicsData_ParalogsCompleteWorkspace.RData")

14.12 Supp Figure 4

pdf(paste('../out/figures/FigSupp4/SuppFig4_Updated',Sys.Date(),'.pdf',sep=''),
    width = 9,height = 15)
SuppFig4 <- plot_grid(p03MSData,go_pie,GObarplot,nrow = 3,
                      rel_heights = c(1,0.3,0.8),labels = c('A','B'))
SuppFig4
dev.off()

## png 
##   2

14.13 Write Tables

14.13.1 Table S5

l <- list('Paralog Pairs' = All.Paralogs.Dataset[,-c(5,7,11)],
          'GOEnrichment (ORA)' = do.call(rbind.data.frame,GOTABLES))

#Write table
openxlsx::write.xlsx(l,'../out/tables/Table_S5.xlsx')