Section 14 MS Proteomics Data Paralog Analysis
The huge chunck of code below, carries on the analysis on Paralogs Genes accross all the MS dataset that we have. The dataset information are read from a DataInfo files that contains important information on column names and where to get all the information in each file. Then it iterates throught each files carrying on this analysis in order.
- Annotate each dataset with Paralogs definition
- Annotate protein complexes
- Calculate the proportion of differentially expressed proteins that have paralogs and those that have not
- Plot for every dataset a volcano plot and a density plot with paralogs and not paralogs.
- Calculates paralogs pairs that are regulated in opposite ways.
- Carries on a GOEnrichment of those Opposite regulated paralogs.
- Returns plots and files inside the
Out/AnalyzeData/folder
Clik here to show code
setwd("../ComplexScript/")
source("complexes_function.R")
library(fdrtool)
library(ggpubr)
library(rstatix)
library(dplyr)
library(paralogstools)
library(proteomicstools)
library(spatialR)
library(ComplexHeatmap)
library(MASS)
library(ggplot2)
library(topGO)
library(gridExtra)
library(biomaRt)
#Load data info for each dataset < ----
DataInfo <- read.table("../Data/Dataset/DataInfo.txt",sep = "\t",header = T,stringsAsFactors = F)
#FCthr and fdrTHR
FCthr <- 0.58
fdrTHR <- 0.05
#Paralogs.Expressions and GOterms list
PARALOGS <- vector(mode = "list",length = nrow(DataInfo))
DATASETS <- vector(mode = "list",length = nrow(DataInfo))
GOTABLES <- vector(mode = "list",length = nrow(DataInfo))
GOTABLES.Summ <- vector(mode = "list",length = nrow(DataInfo))
GOENRICH <- vector(mode = "list",length = nrow(DataInfo))
#Paralogs Palette
ParalogsColor <- c("#e6d5b8","#0278ae")
#Run Plots for all the datasets.
for (N in c(1:nrow(DataInfo)))
{
#GetData information form DataInfo File
filename <- DataInfo[N,"filename"]
Id.col <- DataInfo[N,"Id.col"]
fold.change <- DataInfo[N,"fold.change"]
fdr.col <- DataInfo[N,"fdr.col"]
condition.col <- DataInfo[N,"condition.col"]
organism <- DataInfo[N,"organism"]
out.label <- DataInfo[N,"out.label"]
sep <- DataInfo[N,"sep"];if(sep=="\\t"){sep<-"\t"}
complex.name <- DataInfo[N,"complex.name"]
ID.type <- DataInfo[N,"ID.type"]
OutName <- DataInfo[N,"out.label"]
PlotDir <- paste("Plots/",strsplit(OutName,"/")[[1]][2],sep = "")
ExpName <- strsplit(OutName,"/")[[1]][2]
ParalogsFileName <- paste("../Data/Paralogs/",organism,"_",ID.type,"_paralogs_v102.txt",sep = "")
species <- DataInfo[N,"species"]
#Load Paralogs, get the Already Cleaned Paralogs from File with filename=""
Paralogs <- ParalogsSet(organism = organism,identifier = ID.type,filename = ParalogsFileName)
#Load Protein Complexes
Complexes <- ComplexSet(organism = organism,identifier = ID.type)
#Differential Expressions Analysis ####
Data <- read.delim(filename,sep = sep,header = T,stringsAsFactors = F)
Data <- SplitDatasetByID(Data,Id.col,sep = ";")
Condition <- unique(Data[,condition.col])
#Differentially expressed proteins Proportions
DiffProteins <- nrow(Data[(Data[,fold.change] > FCthr | Data[,fold.change] < -FCthr) & Data[,fdr.col]<fdrTHR,])
Percentage <- DiffProteins/nrow(Data)
##Get Paralogs ####
has.Paralogs <- hasParalogs(Paralogs,Data[,Id.col])
Data$has.Paralogs <- has.Paralogs
Data$species <- organism
Data$cond <- Condition
#Add to List
DATASETS[[N]] <- Data[,c(Id.col,fold.change,fdr.col,condition.col,"has.Paralogs","cond","species")]
#DiffExpressed <- referring to protein groups
Diff.Expressed <- Data[(Data[,fold.change] > FCthr | Data[,fold.change] < -FCthr),]
Diff.Expressed <- Diff.Expressed[!is.na(Diff.Expressed[,fold.change]),]
Proportions <- t(rbind(table(Diff.Expressed$has.Paralogs),table(Data$has.Paralogs)))
rownames(Proportions) <- c("NoParalogs","HasParalogs"); colnames(Proportions) <- c("DiffExpr","Others")
Proportions[,2] <- Proportions[,2] - Proportions[,1] ; Proportions <- t(Proportions)
#Fisher
FISHER <- fisher.test(Proportions)
WILCOX <- wilcox.test(Data[Data$has.Paralogs==T,fold.change],Data[Data$has.Paralogs==F,fold.change])
#Substitute Condition
Condition <- gsub("\\/","_",Condition)
#Volcano Plots + Density Plots
pdf(paste(PlotDir,"_VolcanoDensity",Condition,".pdf",sep = ""),height = 3.92, width = 5.15)
#Volcano
p1 <- ggplot() + geom_point(data = Data[Data$has.Paralogs==T,],aes(x=eval(parse(text = fold.change)),y=log10(eval(parse(text = fdr.col)))*(-1),color=has.Paralogs),size=0.7) +
theme_minimal() +
ylab("log10(pval)") +
geom_point(data = Data[Data$has.Paralogs==F,],aes(x=eval(parse(text = fold.change)),y=log10(eval(parse(text = fdr.col)))*(-1),color=has.Paralogs),size=0.7) +
scale_color_manual(values = c("#e6d5b8","#0278ae")) +
theme_bw() + ggtitle("Paralogs Diff.Expression")
#Density Plot
p2 <- ggplot(data=Data) + geom_density(aes(x=eval(parse(text = fold.change)),fill=has.Paralogs,color=has.Paralogs),alpha=0.5) + theme_minimal() +
ggtitle(ExpName,subtitle = paste(Condition," Wilcox.pvalue:",format(WILCOX$p.value,digits = 3))) + xlab(fold.change)
print(p1)
print(p2)
dev.off()
print(p1)
print(p2)
#Paralogs Regulated in Opposite Way
SUBUNITS <- subunits(Complexes)
Paralogs.Expressions <- getParalogs(Paralogs,Diff.Expressed[,Id.col])
Paralogs.Expressions <- Paralogs.Expressions[Paralogs.Expressions %in% Diff.Expressed[,Id.col]]
Paralogs.Expressions <- data.frame(p.x = Paralogs.Expressions,p.y=names(Paralogs.Expressions))
x.exp <- sapply(Paralogs.Expressions$p.x,function(x){(Diff.Expressed[Diff.Expressed[,Id.col]==x,fold.change])})
y.exp <- sapply(Paralogs.Expressions$p.y,function(x){(Diff.Expressed[Diff.Expressed[,Id.col]==x,fold.change])})
#Create a dataframe
Paralogs.Expressions <- force.cbind(x.exp,y.exp)
colnames(Paralogs.Expressions) <- c("p.x","p.y","x.exp","y.exp")
Paralogs.Expressions$dens = get_density(Paralogs.Expressions$x.exp,Paralogs.Expressions$y.exp,n = 100)
Paralogs.Expressions$dir <- apply(Paralogs.Expressions[,c(3,4)],1,function(x){if(x[1]*x[2]>0){return("same")}else{return("opposite")}})
Paralogs.Expressions$subunits <- apply(Paralogs.Expressions[,c(1,2)],1,function(x){if(sum(x %in% SUBUNITS)==2){return("subunits")}else{return("others")}})
Paralogs.Expressions$organism <- organism
Paralogs.Expressions$condition <- Condition
Paralogs.Expressions$species <- species
Paralogs.Expressions <- unique(Paralogs.Expressions)
Paralogs.Expressions <- Paralogs.Expressions %>% filter(p.x != p.y)
#Add to list
PARALOGS[[N]] <- Paralogs.Expressions
Rs <- cor(Paralogs.Expressions$x.exp,Paralogs.Expressions$y.exp)
#GGplot
pdf(paste(PlotDir,"_ParalogsFoldChanges",Condition,".pdf",sep = ""),height = 3.45, width = 3.40)
p3 <- ggplot(data = Paralogs.Expressions) + geom_point(aes(x=x.exp,y=y.exp),size=0.8) + theme_minimal() +
xlab(paste("Paralog1",fold.change)) + ylab(paste("Paralog2",fold.change)) +
ggtitle(paste(ExpName," | ",Condition,sep = ""),subtitle=paste("R: ",format(round(Rs,3)))) + theme(legend.position = "none")
print(p3)
dev.off()
print(p3)
#Percentage Of Paralogs in Opposite Directions
Paralogs.Proportions <- table(Paralogs.Expressions$dir)
#GOEnrichment Analysis on Paralogs ####
AllParalogs <- unique(as.character(Paralogs.Expressions$p.x,Paralogs.Expressions$p.y))
Opposite <- unique(as.character(unlist(Paralogs.Expressions[Paralogs.Expressions$dir=="opposite",c("p.x","p.y")])))
Same <- unique(as.character(unlist(Paralogs.Expressions[Paralogs.Expressions$dir=="same",c("p.x","p.y")])))
#if the organism is drerio put everything to lowercase
if(species=="Dr")
{
geneList <- tolower(AllParalogs)
interesting.x <- tolower(Opposite)
interesting.y <- tolower(Same)
}else if(species=="Hs")
{
geneList <- toupper(AllParalogs)
interesting.x <- toupper(Opposite)
interesting.y <- toupper(Same)
}else{
geneList <- stringr::str_to_title(AllParalogs)
interesting.x <- stringr::str_to_title(Opposite)
interesting.y <- stringr::str_to_title(Same)
}
#Annotation with spatialR GOEnrichment
EnrichmentGO.Opposite <- spatialR::GOEnrichment(geneList = (geneList),interesting = (interesting.x),species = species,ontology = "BP")
EnrichmentGO.Same <- spatialR::GOEnrichment(geneList = geneList,interesting =interesting.y,species = species,ontology = "BP")
#Add info column
EnrichmentGO.Opposite$table$organism <- organism
EnrichmentGO.Opposite$table$condition <- Condition
#Summarized
GOSumm <- SummarizeGO(EnrichmentGO.Opposite,plot = "scatter",)
GOTABLES[[N]] <- EnrichmentGO.Opposite$table
GOTABLES.Summ[[N]] <- GOSumm$reducedTerms
GOENRICH[[N]] <- EnrichmentGO.Opposite
#PlotGOEnrichment
pdf(paste(PlotDir,"_ParalogsGOEnrichment_",Condition,".pdf",sep = ""),height = 7, width = 7)
p2 <- ggplot(data=EnrichmentGO.Same$table[1:20,]) + geom_bar(aes(x=Term,y=log10(classic)*(-1)),stat="identity",alpha=0.3,color="black") +
theme(axis.text.x = element_text(angle = 60,hjust = 1)) + coord_flip() + theme_minimal()+
ggtitle("Paralogs Same dir.",subtitle = "Top20 shown, Same Paralogs vs All paralogs")+
geom_hline(yintercept = -log10(0.05),lwd=0.5,color="red",lty=2) +
ylab("-log10(p.value)")
p3 <- ggplot(data=unique(EnrichmentGO.Opposite$table[1:20,])) + geom_bar(aes(x=Term,y=log10(classic)*(-1)),stat="identity",alpha=0.3,color="black") +
theme(axis.text.x = element_text(angle = 60,hjust = 1)) + coord_flip() + theme_minimal()+
ggtitle("Paralogs Same dir.",subtitle = "Top20 shown, Opp. Paralogs vs All paralogs") +
geom_hline(yintercept = -log10(0.05),lwd=0.5,color="red",lty=2) +
ylab("-log10(p.value)")
print(p1)
print(p2)
print(p3)
dev.off()
print(p1)
print(p2)
print(p3)
#Write Table for Analize Data
write.table(Paralogs.Expressions,paste("Out/AnalizeData/",organism,Condition,"ParalogsExpression.txt",sep = ""),sep="\t",row.names = F)
write.table(EnrichmentGO.Opposite$Table,paste("Out/AnalizeData/GOEnrichment/",organism,Condition,"_BP_","OppositeParalogs.txt",sep = ""),sep = "\t",row.names = F)
write.table(EnrichmentGO.Same$Table,paste("Out/AnalizeData/GOEnrichment/",organism,Condition,"_BP_","SameParalogs.txt",sep = ""),sep = "\t",row.names = F)
}## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 562 rows containing non-finite values (stat_density).
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 562 rows containing non-finite values (stat_density).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 578 rows containing non-finite values (stat_density).
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 578 rows containing non-finite values (stat_density).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: Removed 563 rows containing non-finite values (stat_density).
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: Removed 563 rows containing non-finite values (stat_density).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
14.1 save workspace
We save the information coming out from this analysis in a workspace.
save.image("../Markdown/workspaces/05-MSProteomicsData_Paralogs01.RData")14.2 Add Information to Datasets
Add Species information to paralogs datasets. The PARALOGS datasets contains the paralog pairs in all the MS dataset considered. We cycle through each file and we add a column called species in order to annotate each dataset. We then
PARALOGS <- lapply(seq_along(PARALOGS),function(x){PARALOGS[[x]]$species<-DataInfo[x,"species"];return(PARALOGS[[x]])})
PARALOGS <- lapply(PARALOGS,function(x)unique(x))We also declare again the color code to use for plotting paralog genes.
ParalogsColor <- c("#e6d5b8","#0278ae")14.3 Paralogs more diff.expressed
Now we need to carry on some data manipulation. From each dataset, we update the column name to be consistent across all datasets. We bind them together, and calculate the absolute fold change values for each proteins.
DATASETS <- lapply(DATASETS, function(x){colnames(x)<-c("id","fold.change","fdr","cond","has.Paralogs","comparison","species");return(x)})
Dataset.combined <- do.call(rbind.data.frame,DATASETS)
Dataset.combined$abs.fold.change <- abs(Dataset.combined$fold.change)After we use the rstatix package to run a wilcoxon test between genes that have paralogs and genes that do not have paralogs.
#Add test information
Datasets.test <- Dataset.combined %>% group_by(cond,species) %>%
wilcox_test(abs.fold.change ~ has.Paralogs) %>%
add_significance("p") %>% add_xy_position(x = "cond") %>%
group_by(species) %>% mutate(x.pos = 1:n()) %>% ungroup()And we assemble now all the data in a boxplot
pdf("../out/plots/p02MSData.pdf",height = 5,width = 5)
p02MSData <- ggplot(data=Dataset.combined) +
geom_boxplot(aes(x=cond,y=abs(fold.change),fill=has.Paralogs),
alpha=0.4) +
scale_fill_manual(values = ParalogsColor) +
stat_pvalue_manual(Datasets.test,x = 'x.pos',
tip.length = 0.5,position = position_dodge(0.8),
coord.flip = T) +
facet_wrap(.~species,nrow = 2,scales = "free") + coord_flip() +
theme_bw() +
xlab("") + ylab("absolute(Log2(Fold Change))")
p02MSData
dev.off()## png
## 2
p02MSData14.4 Proportion of Diff. Expressed Paralogs
Here we count paralog pairs that are regulated in a concerted or opposite manner regarding their fold change. For this reason we bind the PARALOGS data together (this dataset contains the direct comparison of paralog pairs during neuronal differentiation in all the dataset). We defining only unique entries, avoiding duplicated instances that may arise. After using the variable dir that store if a specific paralg pairs is regulated in the same way, we count the entries. We need to divide by 2 to take into account just for unique pairs. Since both (Paralog1 , Paralog2 and Paralog2, Paralog1) are now present in this specific datasets.
All.Paralogs.Dataset <- do.call(rbind.data.frame,PARALOGS)
All.Paralogs.Dataset <- unique(All.Paralogs.Dataset)
#We need to divide by 2 because we need to consider unique pairs
All.Paralogs.Dataset.count <- All.Paralogs.Dataset %>%
group_by(dir,condition,organism) %>%
summarise(n=n()/2)
All.Paralogs.Dataset.count <- as.data.frame(All.Paralogs.Dataset.count)We assemble this counting in a barplot format, facetting for the different organisms.
pdf("../out/plots/p01MSData.pdf",height = 5,width = 5)
p01MSData <- ggplot(data = All.Paralogs.Dataset.count) +
geom_bar(aes(y=n,x=condition,fill=dir),
stat = "identity",color="black") + coord_flip() +
facet_wrap(.~organism,scales = "free") +
theme_bw() +
scale_fill_brewer(palette = "BrBG",
name = "Paralogs protein Fold Change",
labels = c("Opposite sign", "Same sign")) +
ylab("# of unique diff. expressed paralog pairs") + xlab("") +
theme(legend.position = "top")
p01MSData
dev.off()## png
## 2
p01MSData14.5 Coregulation of Paralog Pairs
We subset the paralog pairs and create the function GetQuadrant for getting the quadrant for each observation.
All.Paralogs.Dataset <- All.Paralogs.Dataset %>% filter(p.x!=p.y)
All.Paralogs.Dataset <- All.Paralogs.Dataset %>% group_by(species,condition) %>% mutate(corr=cor(x.exp,y.exp))
GetQuadrant <- function(x,y)
{
v <- character(length = length(x))
v[x<0 & y>0] <- "q1"
v[x>0 & y>0] <- "q2"
v[x>0 & y<0] <- "q3"
v[x<0 & y<0] <- "q4"
return(v)
}Prepare dataset informations to be shown on plot, for example here we arrange the pvalues for the correlations, the number of proteins present in each quadrants and the coordinate where to plot each label. First we group the dataset by organism and condition. Then for each of these, we retrieve the location of every protein (in which quadrant of the plot) they reside, using the GetQuadrant function, we then group also by the quadrant and calculate with quadrant.count the number of entries. We then run a correlation test on each group storing information from the correlation score and the pvalues. We add further information of formatting the pvalues, and position that will be useful for plotting purposes.
All.Paralogs.Dataset.corr.quadrant <- All.Paralogs.Dataset %>%
group_by(organism,condition) %>%
mutate(quadrant = GetQuadrant(x.exp,y.exp)) %>%
group_by(organism,condition,quadrant) %>%
mutate(quadrant.count= n()) %>%
group_by(organism,condition) %>%
summarize(corr=cor.test(x.exp,y.exp)$estimate,
p.value = cor.test(x.exp,y.exp)$p.value,quadrant,quadrant.count,x.exp,y.exp) %>%
as.data.frame() %>%
mutate(signif= ifelse(p.value< 2.2e-16,"< 2.2e-16",paste("=",formatC(p.value,format = "e",digits = 2)))) %>%
group_by(organism,condition,quadrant,add = T) %>%
mutate(x.pos = mean(c(min(x.exp),max(x.exp))),
y.pos = mean(c(min(y.exp),max(y.exp)))) %>%
dplyr::select(organism,condition,corr,p.value,quadrant.count,signif,x.pos,y.pos) %>%
unique() %>% as.data.frame()## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
#Only unique paralogs in each quadrant
All.Paralogs.Dataset.corr.quadrant$quadrant.count <- ifelse(All.Paralogs.Dataset.corr.quadrant$quadrant %in% c("q2","q4"),All.Paralogs.Dataset.corr.quadrant$quadrant.count/2,
All.Paralogs.Dataset.corr.quadrant$quadrant.count)
All.Paralogs.Dataset.corr <- unique(All.Paralogs.Dataset.corr.quadrant[,c(2:5,7)])We can plot the totality of the dataset, that represents the paralog co-regulation across all species and condition, plus informations on the number of unique pairs present in each quadrant and correlations score.
pdf("../out/plots/p01MSData.pdf",height = 5,width = 5)
p03MSData <- ggplot(data=All.Paralogs.Dataset) +
geom_point(aes(x.exp,y.exp,color=dir),size=0.7) +
scale_color_manual(values=c("#DFC27D","lightgrey")) +
facet_wrap(organism~condition,scales = "free") + theme_bw() +
geom_smooth(aes(x.exp,y.exp),color="orange",method = "lm") +
geom_text(data=All.Paralogs.Dataset.corr,x=-Inf,y=Inf,
aes(label=paste("R:",round(corr,digits = 2))),hjust=-0.1,vjust=1.5,color="red",size=3.5) +
geom_text(data=All.Paralogs.Dataset.corr,x=+Inf,y=-Inf,
aes(label=paste("p.value",signif)),hjust=1.15,vjust=-1,color="black",size=3) +
geom_label(data=All.Paralogs.Dataset.corr.quadrant %>% filter(quadrant!="q3"),
aes(label=quadrant.count,x=x.pos,y=y.pos),size=3.5) +
xlab("Log2 Fold Change (Paralog1)") + ylab("Log2 Fold Change(Paralog2)") +
theme(legend.position = "none")
p03MSData
dev.off()## png
## 2
p03MSData14.6 GO Enrichment Diff.Expressed Paralogs
14.7 Common Trace
Even thou in each specific dataset the GOEnrichment was highlighting specific terms was possible to extract some common GOTerm that were shared across organisms. We grep on GOterms name to highlight specific GOTerm and retrieve genes that are associated with specific
14.7.1 Transport and vesicles
We select factors with some key words and we show them in a classic barplot colored by dataset.
keywords <- paste(c("vesicle","synapse","transport"),collapse = "|")
#transport GOTERM
TERMS.transport <- lapply(GOTABLES,function(x)x[grep(keywords,x$Term),])
TERMS.transport <- do.call(rbind.data.frame,TERMS.transport)
#Sort Them
TERMS.transport <- TERMS.transport[order(TERMS.transport$p.adjust),]
TERMS.transport <- TERMS.transport %>% group_by(organism) %>%
mutate(pos = 1:n()) %>% filter(pos <= 3) %>% as.data.frame()
TERMS.transport$condition <-gsub("\\.","/",TERMS.transport$condition)
TERMS.transport$condition <- gsub("0","0/",TERMS.transport$condition)
#BARPLOT
p05MSData <- ggplot(data=TERMS.transport) +
geom_bar(aes(y=Term,x=-log10(p.adjust)),color=NA,stat = "identity",
fill='#B4CDE2') +
facet_wrap(.~organism,ncol=1,scales = "free_y") + theme_bw() +
geom_text(aes(y=Term,x=-log10(p.adjust)/2,label=condition)) +
geom_text(aes(y=Term,x=-log10(p.adjust)+0.25,label=Significant),color="red")
pdf("../out/plots/p05MSData.pdf",width = 7,height = 8)
p05MSData
dev.off()## png
## 2
p05MSData14.7.2 DNA binding
We select factors with some key words and we show them in a classic barplot colored by dataset.
keywords <- paste(c("DNA","chromatin"),collapse = "|")
#dna GOTERM
TERMS.dna <- lapply(GOTABLES,function(x)x[grep(keywords,x$Term),])
TERMS.dna <- do.call(rbind.data.frame,TERMS.dna)
#Sort Them
TERMS.dna <- TERMS.dna[order(TERMS.dna$p.adjust),]
TERMS.dna <- TERMS.dna %>% group_by(organism) %>%
mutate(pos = 1:n()) %>% filter(pos <= 3) %>% as.data.frame()
TERMS.dna$condition <-gsub("\\.","/",TERMS.dna$condition)
TERMS.dna$condition <- gsub("3","3/",TERMS.dna$condition)
#BARPLOT
pdf("../out/plots/p06MSData.pdf",width = 7,height = 8)
p06MSData <- ggplot(data=TERMS.dna) +
geom_bar(aes(y=Term,x=-log10(p.adjust)),fill='#F9B4AF',
color=NA,stat = "identity") +
facet_wrap(.~organism,ncol=1,scales = "free_y") + theme_bw() +
scale_fill_brewer(palette = "Pastel1") +
geom_text(aes(y=Term,x=-log10(p.adjust)/2,label=condition)) +
geom_text(aes(y=Term,x=-log10(p.adjust)+0.25,label=Significant),color="red")
p06MSData
dev.off()## png
## 2
p06MSData14.8 GO pie plot
In this sections we represent the different composition in enrichment as a pie plot first we load all the genes related to the terms enriched in the data. We also need to change the character to lower or to title for some specific organism to match the GOannotation.
Opposite_p <-lapply(PARALOGS,function(x)unlist(x[x$dir=='opposite',c('p.x','p.y')]))
Opposite_p <-lapply(Opposite_p,unique)
names(Opposite_p) <- paste(DataInfo$condition,DataInfo$species,sep = '_')
#Change strtotitle for gene name only for some species
to_change <- Opposite_p[grep('Mm|Rn',names(Opposite_p))]
#Substitute in List
Opposite_p[grep('Mm|Rn',names(Opposite_p))] <- lapply(to_change,
stringr::str_to_title)
#For zebrafish genenames are alltolower
to_change <- Opposite_p[grep('Dr',names(Opposite_p))]
#Substitute in List
Opposite_p[grep('Dr',names(Opposite_p))] <- lapply(to_change,tolower)We then extract the GOAnnotation for the enrichmed GOTerm and create a
dataframe called LIST_l that contains all the information
#Extract gene in each go term
LIST_l <- lapply(seq_along(Opposite_p),function(x)
{
df <- unlist(spatialR::ShowGenesGOTerms(GOENRICH[[x]],Opposite_p[[x]]))
df <- df %>% as.data.frame(row.names = names(df))
df$GO <- gsub('\\d+$','',rownames(df))
df$cond <- DataInfo$condition[x];df$species <- DataInfo$species[x]
colnames(df) <- c('gene','GO','cond','species')
return(df)
});names(LIST_l) <- paste(DataInfo$condition,DataInfo$species,sep = '_')
LIST_l <- do.call(rbind.data.frame,LIST_l)We grep on keyword related to DNA and chromatin.
keywords_d <- paste(c("DNA","chromatin"),collapse = "|")
keywords_t <- paste(c("vesicle","synapse","transport"),collapse = "|")
transport_g <- LIST_l[grep(keywords_t,LIST_l$GO),'gene']
dna_g <- LIST_l[grep(keywords_d,LIST_l$GO),'gene']
LIST_l$subset <- 'others'
LIST_l$subset[LIST_l$gene %in% transport_g] <- 'Transport,synapse,vesicles'
LIST_l$subset[LIST_l$gene %in% dna_g] <- 'DNA and chromatin'
LIST_l <- unique(LIST_l[,c('gene','cond','species','subset')])We change the order of some factors
#Summarize data
Order_l <- c('Transport,synapse,vesicles','DNA and chromatin','others')
LIST_l_s <- LIST_l %>% group_by(subset,cond,species) %>% mutate(count=n()) %>%
dplyr::select(cond,species,subset,count) %>% as.data.frame() %>%
unique() %>%
arrange(factor(subset,levels = rev(Order_l))) %>%
group_by(cond,species) %>%
mutate(perc = count/sum(count),
ypos = cumsum(perc)-(perc/2)) %>%
mutate(lab = paste(format(perc*100,digits=1),'%',sep = ''))
#Order factors
Order <- c('NPC.iPS','Neu.IPS','Neu.NPC','DIV3.DIV0','DIV10.DIV0','DIV10.DIV3','DIV5.DIV1',
'DIV14.DIV1','Neur.Stem')
LIST_l_s$cond <- factor(LIST_l_s$cond,levels = Order)
LIST_l_s$subset <- factor(LIST_l_s$subset,levels = Order_l)We plot the pie plot.
cp <- coord_polar(theta = "y",start = 0,direction = 1)
cp$is_free <- function() TRUE
pdf(paste('../out/figures/FigSupp4/GOPie_',Sys.Date(),'.pdf',sep = ''),
width = 6.75,height = 4.41)
go_pie <- ggplot(LIST_l_s, aes(x="", y=perc, fill=subset)) +
geom_bar(stat="identity", width=1) +
cp + theme_void() +
facet_wrap(.~cond,scales = 'free',nrow = 1) +
geom_text(aes(y=ypos,label=lab),size=3) +
theme(aspect.ratio = 1,legend.position = 'top') + xlab('') + ylab('') +
scale_fill_manual(values = c('#B4CDE2','#F9B4AF','#DDDDDD'))
go_pie
dev.off()## png
## 2
go_pie14.9 FigSupp5
pdf(paste("../out/figures/FigSupp5/SuppFig5_",Sys.Date(),".pdf",sep = ''),
width = 9,height = 6)
Supp.Fig5 <- p02MSData
p02MSData## Warning: Removed 1703 rows containing non-finite values (stat_boxplot).
dev.off()## png
## 2
Supp.Fig5## Warning: Removed 1703 rows containing non-finite values (stat_boxplot).
14.10 FigSupp6
pdf(paste("../out/figures/FigSupp6/ParalogScatter_",Sys.Date(),".pdf",sep = ''),
width = 7.5,height = 6.5)
Supp.Fig6 <- p03MSData
Supp.Fig6
dev.off()## png
## 2
Supp.Fig614.11 SuppFig6
pdf(paste("../out/figures/FigSupp6/GObarplot_",Sys.Date(),".pdf",sep = ''),
width = 12,height = 5.26)
GObarplot <- plot_grid(p05MSData,p06MSData,ncol = 2)
GObarplot
dev.off()## png
## 2
GObarplotsave.image("../Markdown/workspaces/05-MSProteomicsData_ParalogsCompleteWorkspace.RData")14.12 Supp Figure 4
pdf(paste('../out/figures/FigSupp4/SuppFig4_Updated',Sys.Date(),'.pdf',sep=''),
width = 9,height = 15)
SuppFig4 <- plot_grid(p03MSData,go_pie,GObarplot,nrow = 3,
rel_heights = c(1,0.3,0.8),labels = c('A','B'))
SuppFig4
dev.off()## png
## 2
14.13 Write Tables
14.13.1 Table S5
l <- list('Paralog Pairs' = All.Paralogs.Dataset[,-c(5,7,11)],
'GOEnrichment (ORA)' = do.call(rbind.data.frame,GOTABLES))
#Write table
openxlsx::write.xlsx(l,'../out/tables/Table_S5.xlsx')