Section 14 MS Proteomics Data Paralog Analysis
The huge chunck of code below, carries on the analysis on Paralogs Genes accross all the MS dataset that we have. The dataset information are read from a DataInfo
files that contains important information on column names and where to get all the information in each file. Then it iterates throught each files carrying on this analysis in order.
- Annotate each dataset with Paralogs definition
- Annotate protein complexes
- Calculate the proportion of differentially expressed proteins that have paralogs and those that have not
- Plot for every dataset a volcano plot and a density plot with paralogs and not paralogs.
- Calculates paralogs pairs that are regulated in opposite ways.
- Carries on a GOEnrichment of those Opposite regulated paralogs.
- Returns plots and files inside the
Out/AnalyzeData/
folder
Clik here to show code
setwd("../ComplexScript/")
source("complexes_function.R")
library(fdrtool)
library(ggpubr)
library(rstatix)
library(dplyr)
library(paralogstools)
library(proteomicstools)
library(spatialR)
library(ComplexHeatmap)
library(MASS)
library(ggplot2)
library(topGO)
library(gridExtra)
library(biomaRt)
#Load data info for each dataset < ----
<- read.table("../Data/Dataset/DataInfo.txt",sep = "\t",header = T,stringsAsFactors = F)
DataInfo
#FCthr and fdrTHR
<- 0.58
FCthr <- 0.05
fdrTHR
#Paralogs.Expressions and GOterms list
<- vector(mode = "list",length = nrow(DataInfo))
PARALOGS <- vector(mode = "list",length = nrow(DataInfo))
DATASETS <- vector(mode = "list",length = nrow(DataInfo))
GOTABLES <- vector(mode = "list",length = nrow(DataInfo))
GOTABLES.Summ <- vector(mode = "list",length = nrow(DataInfo))
GOENRICH
#Paralogs Palette
<- c("#e6d5b8","#0278ae")
ParalogsColor
#Run Plots for all the datasets.
for (N in c(1:nrow(DataInfo)))
{#GetData information form DataInfo File
<- DataInfo[N,"filename"]
filename <- DataInfo[N,"Id.col"]
Id.col <- DataInfo[N,"fold.change"]
fold.change <- DataInfo[N,"fdr.col"]
fdr.col <- DataInfo[N,"condition.col"]
condition.col <- DataInfo[N,"organism"]
organism <- DataInfo[N,"out.label"]
out.label <- DataInfo[N,"sep"];if(sep=="\\t"){sep<-"\t"}
sep <- DataInfo[N,"complex.name"]
complex.name <- DataInfo[N,"ID.type"]
ID.type <- DataInfo[N,"out.label"]
OutName <- paste("Plots/",strsplit(OutName,"/")[[1]][2],sep = "")
PlotDir <- strsplit(OutName,"/")[[1]][2]
ExpName <- paste("../Data/Paralogs/",organism,"_",ID.type,"_paralogs_v102.txt",sep = "")
ParalogsFileName <- DataInfo[N,"species"]
species
#Load Paralogs, get the Already Cleaned Paralogs from File with filename=""
<- ParalogsSet(organism = organism,identifier = ID.type,filename = ParalogsFileName)
Paralogs
#Load Protein Complexes
<- ComplexSet(organism = organism,identifier = ID.type)
Complexes
#Differential Expressions Analysis ####
<- read.delim(filename,sep = sep,header = T,stringsAsFactors = F)
Data <- SplitDatasetByID(Data,Id.col,sep = ";")
Data <- unique(Data[,condition.col])
Condition
#Differentially expressed proteins Proportions
<- nrow(Data[(Data[,fold.change] > FCthr | Data[,fold.change] < -FCthr) & Data[,fdr.col]<fdrTHR,])
DiffProteins <- DiffProteins/nrow(Data)
Percentage
##Get Paralogs ####
<- hasParalogs(Paralogs,Data[,Id.col])
has.Paralogs $has.Paralogs <- has.Paralogs
Data$species <- organism
Data$cond <- Condition
Data
#Add to List
<- Data[,c(Id.col,fold.change,fdr.col,condition.col,"has.Paralogs","cond","species")]
DATASETS[[N]]
#DiffExpressed <- referring to protein groups
<- Data[(Data[,fold.change] > FCthr | Data[,fold.change] < -FCthr),]
Diff.Expressed <- Diff.Expressed[!is.na(Diff.Expressed[,fold.change]),]
Diff.Expressed
<- t(rbind(table(Diff.Expressed$has.Paralogs),table(Data$has.Paralogs)))
Proportions rownames(Proportions) <- c("NoParalogs","HasParalogs"); colnames(Proportions) <- c("DiffExpr","Others")
2] <- Proportions[,2] - Proportions[,1] ; Proportions <- t(Proportions)
Proportions[,
#Fisher
<- fisher.test(Proportions)
FISHER <- wilcox.test(Data[Data$has.Paralogs==T,fold.change],Data[Data$has.Paralogs==F,fold.change])
WILCOX
#Substitute Condition
<- gsub("\\/","_",Condition)
Condition
#Volcano Plots + Density Plots
pdf(paste(PlotDir,"_VolcanoDensity",Condition,".pdf",sep = ""),height = 3.92, width = 5.15)
#Volcano
<- ggplot() + geom_point(data = Data[Data$has.Paralogs==T,],aes(x=eval(parse(text = fold.change)),y=log10(eval(parse(text = fdr.col)))*(-1),color=has.Paralogs),size=0.7) +
p1 theme_minimal() +
ylab("log10(pval)") +
geom_point(data = Data[Data$has.Paralogs==F,],aes(x=eval(parse(text = fold.change)),y=log10(eval(parse(text = fdr.col)))*(-1),color=has.Paralogs),size=0.7) +
scale_color_manual(values = c("#e6d5b8","#0278ae")) +
theme_bw() + ggtitle("Paralogs Diff.Expression")
#Density Plot
<- ggplot(data=Data) + geom_density(aes(x=eval(parse(text = fold.change)),fill=has.Paralogs,color=has.Paralogs),alpha=0.5) + theme_minimal() +
p2 ggtitle(ExpName,subtitle = paste(Condition," Wilcox.pvalue:",format(WILCOX$p.value,digits = 3))) + xlab(fold.change)
print(p1)
print(p2)
dev.off()
print(p1)
print(p2)
#Paralogs Regulated in Opposite Way
<- subunits(Complexes)
SUBUNITS <- getParalogs(Paralogs,Diff.Expressed[,Id.col])
Paralogs.Expressions <- Paralogs.Expressions[Paralogs.Expressions %in% Diff.Expressed[,Id.col]]
Paralogs.Expressions <- data.frame(p.x = Paralogs.Expressions,p.y=names(Paralogs.Expressions))
Paralogs.Expressions
<- sapply(Paralogs.Expressions$p.x,function(x){(Diff.Expressed[Diff.Expressed[,Id.col]==x,fold.change])})
x.exp <- sapply(Paralogs.Expressions$p.y,function(x){(Diff.Expressed[Diff.Expressed[,Id.col]==x,fold.change])})
y.exp
#Create a dataframe
<- force.cbind(x.exp,y.exp)
Paralogs.Expressions colnames(Paralogs.Expressions) <- c("p.x","p.y","x.exp","y.exp")
$dens = get_density(Paralogs.Expressions$x.exp,Paralogs.Expressions$y.exp,n = 100)
Paralogs.Expressions$dir <- apply(Paralogs.Expressions[,c(3,4)],1,function(x){if(x[1]*x[2]>0){return("same")}else{return("opposite")}})
Paralogs.Expressions$subunits <- apply(Paralogs.Expressions[,c(1,2)],1,function(x){if(sum(x %in% SUBUNITS)==2){return("subunits")}else{return("others")}})
Paralogs.Expressions$organism <- organism
Paralogs.Expressions$condition <- Condition
Paralogs.Expressions$species <- species
Paralogs.Expressions<- unique(Paralogs.Expressions)
Paralogs.Expressions <- Paralogs.Expressions %>% filter(p.x != p.y)
Paralogs.Expressions
#Add to list
<- Paralogs.Expressions
PARALOGS[[N]]
<- cor(Paralogs.Expressions$x.exp,Paralogs.Expressions$y.exp)
Rs
#GGplot
pdf(paste(PlotDir,"_ParalogsFoldChanges",Condition,".pdf",sep = ""),height = 3.45, width = 3.40)
<- ggplot(data = Paralogs.Expressions) + geom_point(aes(x=x.exp,y=y.exp),size=0.8) + theme_minimal() +
p3 xlab(paste("Paralog1",fold.change)) + ylab(paste("Paralog2",fold.change)) +
ggtitle(paste(ExpName," | ",Condition,sep = ""),subtitle=paste("R: ",format(round(Rs,3)))) + theme(legend.position = "none")
print(p3)
dev.off()
print(p3)
#Percentage Of Paralogs in Opposite Directions
<- table(Paralogs.Expressions$dir)
Paralogs.Proportions
#GOEnrichment Analysis on Paralogs ####
<- unique(as.character(Paralogs.Expressions$p.x,Paralogs.Expressions$p.y))
AllParalogs <- unique(as.character(unlist(Paralogs.Expressions[Paralogs.Expressions$dir=="opposite",c("p.x","p.y")])))
Opposite <- unique(as.character(unlist(Paralogs.Expressions[Paralogs.Expressions$dir=="same",c("p.x","p.y")])))
Same
#if the organism is drerio put everything to lowercase
if(species=="Dr")
{<- tolower(AllParalogs)
geneList <- tolower(Opposite)
interesting.x <- tolower(Same)
interesting.y else if(species=="Hs")
}
{<- toupper(AllParalogs)
geneList <- toupper(Opposite)
interesting.x <- toupper(Same)
interesting.y else{
}<- stringr::str_to_title(AllParalogs)
geneList <- stringr::str_to_title(Opposite)
interesting.x <- stringr::str_to_title(Same)
interesting.y
}
#Annotation with spatialR GOEnrichment
<- spatialR::GOEnrichment(geneList = (geneList),interesting = (interesting.x),species = species,ontology = "BP")
EnrichmentGO.Opposite <- spatialR::GOEnrichment(geneList = geneList,interesting =interesting.y,species = species,ontology = "BP")
EnrichmentGO.Same
#Add info column
$table$organism <- organism
EnrichmentGO.Opposite$table$condition <- Condition
EnrichmentGO.Opposite
#Summarized
<- SummarizeGO(EnrichmentGO.Opposite,plot = "scatter",)
GOSumm
<- EnrichmentGO.Opposite$table
GOTABLES[[N]] <- GOSumm$reducedTerms
GOTABLES.Summ[[N]] <- EnrichmentGO.Opposite
GOENRICH[[N]]
#PlotGOEnrichment
pdf(paste(PlotDir,"_ParalogsGOEnrichment_",Condition,".pdf",sep = ""),height = 7, width = 7)
<- ggplot(data=EnrichmentGO.Same$table[1:20,]) + geom_bar(aes(x=Term,y=log10(classic)*(-1)),stat="identity",alpha=0.3,color="black") +
p2 theme(axis.text.x = element_text(angle = 60,hjust = 1)) + coord_flip() + theme_minimal()+
ggtitle("Paralogs Same dir.",subtitle = "Top20 shown, Same Paralogs vs All paralogs")+
geom_hline(yintercept = -log10(0.05),lwd=0.5,color="red",lty=2) +
ylab("-log10(p.value)")
<- ggplot(data=unique(EnrichmentGO.Opposite$table[1:20,])) + geom_bar(aes(x=Term,y=log10(classic)*(-1)),stat="identity",alpha=0.3,color="black") +
p3 theme(axis.text.x = element_text(angle = 60,hjust = 1)) + coord_flip() + theme_minimal()+
ggtitle("Paralogs Same dir.",subtitle = "Top20 shown, Opp. Paralogs vs All paralogs") +
geom_hline(yintercept = -log10(0.05),lwd=0.5,color="red",lty=2) +
ylab("-log10(p.value)")
print(p1)
print(p2)
print(p3)
dev.off()
print(p1)
print(p2)
print(p3)
#Write Table for Analize Data
write.table(Paralogs.Expressions,paste("Out/AnalizeData/",organism,Condition,"ParalogsExpression.txt",sep = ""),sep="\t",row.names = F)
write.table(EnrichmentGO.Opposite$Table,paste("Out/AnalizeData/GOEnrichment/",organism,Condition,"_BP_","OppositeParalogs.txt",sep = ""),sep = "\t",row.names = F)
write.table(EnrichmentGO.Same$Table,paste("Out/AnalizeData/GOEnrichment/",organism,Condition,"_BP_","SameParalogs.txt",sep = ""),sep = "\t",row.names = F)
}
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 562 rows containing non-finite values (stat_density).
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 562 rows containing non-finite values (stat_density).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 427 rows containing missing values (geom_point).
## Warning: Removed 135 rows containing missing values (geom_point).
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 578 rows containing non-finite values (stat_density).
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 578 rows containing non-finite values (stat_density).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 456 rows containing missing values (geom_point).
## Warning: Removed 122 rows containing missing values (geom_point).
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: Removed 563 rows containing non-finite values (stat_density).
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: Removed 563 rows containing non-finite values (stat_density).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: Removed 416 rows containing missing values (geom_point).
## Warning: Removed 147 rows containing missing values (geom_point).
## Warning: `show_guide` has been deprecated. Please use `show.legend` instead.
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.
14.1 save workspace
We save the information coming out from this analysis in a workspace.
save.image("../Markdown/workspaces/05-MSProteomicsData_Paralogs01.RData")
14.2 Add Information to Datasets
Add Species information to paralogs datasets. The PARALOGS
datasets contains the paralog pairs in all the MS dataset considered. We cycle through each file and we add a column called species
in order to annotate each dataset. We then
<- lapply(seq_along(PARALOGS),function(x){PARALOGS[[x]]$species<-DataInfo[x,"species"];return(PARALOGS[[x]])})
PARALOGS <- lapply(PARALOGS,function(x)unique(x)) PARALOGS
We also declare again the color code to use for plotting paralog genes.
<- c("#e6d5b8","#0278ae") ParalogsColor
14.3 Paralogs more diff.expressed
Now we need to carry on some data manipulation. From each dataset, we update the column name to be consistent across all datasets. We bind them together, and calculate the absolute fold change values for each proteins.
<- lapply(DATASETS, function(x){colnames(x)<-c("id","fold.change","fdr","cond","has.Paralogs","comparison","species");return(x)})
DATASETS <- do.call(rbind.data.frame,DATASETS)
Dataset.combined $abs.fold.change <- abs(Dataset.combined$fold.change) Dataset.combined
After we use the rstatix
package to run a wilcoxon test between genes that have paralogs and genes that do not have paralogs.
#Add test information
<- Dataset.combined %>% group_by(cond,species) %>%
Datasets.test wilcox_test(abs.fold.change ~ has.Paralogs) %>%
add_significance("p") %>% add_xy_position(x = "cond") %>%
group_by(species) %>% mutate(x.pos = 1:n()) %>% ungroup()
And we assemble now all the data in a boxplot
pdf("../out/plots/p02MSData.pdf",height = 5,width = 5)
<- ggplot(data=Dataset.combined) +
p02MSData geom_boxplot(aes(x=cond,y=abs(fold.change),fill=has.Paralogs),
alpha=0.4) +
scale_fill_manual(values = ParalogsColor) +
stat_pvalue_manual(Datasets.test,x = 'x.pos',
tip.length = 0.5,position = position_dodge(0.8),
coord.flip = T) +
facet_wrap(.~species,nrow = 2,scales = "free") + coord_flip() +
theme_bw() +
xlab("") + ylab("absolute(Log2(Fold Change))")
p02MSDatadev.off()
## png
## 2
p02MSData
14.4 Proportion of Diff. Expressed Paralogs
Here we count paralog pairs that are regulated in a concerted or opposite manner regarding their fold change. For this reason we bind the PARALOGS
data together (this dataset contains the direct comparison of paralog pairs during neuronal differentiation in all the dataset). We defining only unique entries, avoiding duplicated instances that may arise. After using the variable dir
that store if a specific paralg pairs is regulated in the same way, we count the entries. We need to divide by 2 to take into account just for unique pairs. Since both (Paralog1 , Paralog2 and Paralog2, Paralog1) are now present in this specific datasets.
<- do.call(rbind.data.frame,PARALOGS)
All.Paralogs.Dataset <- unique(All.Paralogs.Dataset)
All.Paralogs.Dataset
#We need to divide by 2 because we need to consider unique pairs
<- All.Paralogs.Dataset %>%
All.Paralogs.Dataset.count group_by(dir,condition,organism) %>%
summarise(n=n()/2)
<- as.data.frame(All.Paralogs.Dataset.count) All.Paralogs.Dataset.count
We assemble this counting in a barplot format, facetting for the different organisms.
pdf("../out/plots/p01MSData.pdf",height = 5,width = 5)
<- ggplot(data = All.Paralogs.Dataset.count) +
p01MSData geom_bar(aes(y=n,x=condition,fill=dir),
stat = "identity",color="black") + coord_flip() +
facet_wrap(.~organism,scales = "free") +
theme_bw() +
scale_fill_brewer(palette = "BrBG",
name = "Paralogs protein Fold Change",
labels = c("Opposite sign", "Same sign")) +
ylab("# of unique diff. expressed paralog pairs") + xlab("") +
theme(legend.position = "top")
p01MSDatadev.off()
## png
## 2
p01MSData
14.5 Coregulation of Paralog Pairs
We subset the paralog pairs and create the function GetQuadrant
for getting the quadrant for each observation.
<- All.Paralogs.Dataset %>% filter(p.x!=p.y)
All.Paralogs.Dataset <- All.Paralogs.Dataset %>% group_by(species,condition) %>% mutate(corr=cor(x.exp,y.exp))
All.Paralogs.Dataset
<- function(x,y)
GetQuadrant
{<- character(length = length(x))
v <0 & y>0] <- "q1"
v[x>0 & y>0] <- "q2"
v[x>0 & y<0] <- "q3"
v[x<0 & y<0] <- "q4"
v[xreturn(v)
}
Prepare dataset informations to be shown on plot, for example here we arrange the pvalues for the correlations, the number of proteins present in each quadrants and the coordinate where to plot each label. First we group the dataset by organism
and condition
. Then for each of these, we retrieve the location of every protein (in which quadrant of the plot) they reside, using the GetQuadrant
function, we then group also by the quadrant and calculate with quadrant.count
the number of entries. We then run a correlation test on each group storing information from the correlation score and the pvalues. We add further information of formatting the pvalues, and position that will be useful for plotting purposes.
<- All.Paralogs.Dataset %>%
All.Paralogs.Dataset.corr.quadrant group_by(organism,condition) %>%
mutate(quadrant = GetQuadrant(x.exp,y.exp)) %>%
group_by(organism,condition,quadrant) %>%
mutate(quadrant.count= n()) %>%
group_by(organism,condition) %>%
summarize(corr=cor.test(x.exp,y.exp)$estimate,
p.value = cor.test(x.exp,y.exp)$p.value,quadrant,quadrant.count,x.exp,y.exp) %>%
as.data.frame() %>%
mutate(signif= ifelse(p.value< 2.2e-16,"< 2.2e-16",paste("=",formatC(p.value,format = "e",digits = 2)))) %>%
group_by(organism,condition,quadrant,add = T) %>%
mutate(x.pos = mean(c(min(x.exp),max(x.exp))),
y.pos = mean(c(min(y.exp),max(y.exp)))) %>%
::select(organism,condition,corr,p.value,quadrant.count,signif,x.pos,y.pos) %>%
dplyrunique() %>% as.data.frame()
## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.
#Only unique paralogs in each quadrant
$quadrant.count <- ifelse(All.Paralogs.Dataset.corr.quadrant$quadrant %in% c("q2","q4"),All.Paralogs.Dataset.corr.quadrant$quadrant.count/2,
All.Paralogs.Dataset.corr.quadrant$quadrant.count)
All.Paralogs.Dataset.corr.quadrant<- unique(All.Paralogs.Dataset.corr.quadrant[,c(2:5,7)]) All.Paralogs.Dataset.corr
We can plot the totality of the dataset, that represents the paralog co-regulation across all species and condition, plus informations on the number of unique pairs present in each quadrant and correlations score.
pdf("../out/plots/p01MSData.pdf",height = 5,width = 5)
<- ggplot(data=All.Paralogs.Dataset) +
p03MSData geom_point(aes(x.exp,y.exp,color=dir),size=0.7) +
scale_color_manual(values=c("#DFC27D","lightgrey")) +
facet_wrap(organism~condition,scales = "free") + theme_bw() +
geom_smooth(aes(x.exp,y.exp),color="orange",method = "lm") +
geom_text(data=All.Paralogs.Dataset.corr,x=-Inf,y=Inf,
aes(label=paste("R:",round(corr,digits = 2))),hjust=-0.1,vjust=1.5,color="red",size=3.5) +
geom_text(data=All.Paralogs.Dataset.corr,x=+Inf,y=-Inf,
aes(label=paste("p.value",signif)),hjust=1.15,vjust=-1,color="black",size=3) +
geom_label(data=All.Paralogs.Dataset.corr.quadrant %>% filter(quadrant!="q3"),
aes(label=quadrant.count,x=x.pos,y=y.pos),size=3.5) +
xlab("Log2 Fold Change (Paralog1)") + ylab("Log2 Fold Change(Paralog2)") +
theme(legend.position = "none")
p03MSDatadev.off()
## png
## 2
p03MSData
14.6 GO Enrichment Diff.Expressed Paralogs
14.7 Common Trace
Even thou in each specific dataset the GOEnrichment was highlighting specific terms was possible to extract some common GOTerm that were shared across organisms. We grep on GOterms name to highlight specific GOTerm and retrieve genes that are associated with specific
14.7.1 Transport and vesicles
We select factors with some key words and we show them in a classic barplot colored by dataset.
<- paste(c("vesicle","synapse","transport"),collapse = "|")
keywords
#transport GOTERM
<- lapply(GOTABLES,function(x)x[grep(keywords,x$Term),])
TERMS.transport <- do.call(rbind.data.frame,TERMS.transport)
TERMS.transport
#Sort Them
<- TERMS.transport[order(TERMS.transport$p.adjust),]
TERMS.transport <- TERMS.transport %>% group_by(organism) %>%
TERMS.transport mutate(pos = 1:n()) %>% filter(pos <= 3) %>% as.data.frame()
$condition <-gsub("\\.","/",TERMS.transport$condition)
TERMS.transport$condition <- gsub("0","0/",TERMS.transport$condition)
TERMS.transport
#BARPLOT
<- ggplot(data=TERMS.transport) +
p05MSData geom_bar(aes(y=Term,x=-log10(p.adjust)),color=NA,stat = "identity",
fill='#B4CDE2') +
facet_wrap(.~organism,ncol=1,scales = "free_y") + theme_bw() +
geom_text(aes(y=Term,x=-log10(p.adjust)/2,label=condition)) +
geom_text(aes(y=Term,x=-log10(p.adjust)+0.25,label=Significant),color="red")
pdf("../out/plots/p05MSData.pdf",width = 7,height = 8)
p05MSDatadev.off()
## png
## 2
p05MSData
14.7.2 DNA binding
We select factors with some key words and we show them in a classic barplot colored by dataset.
<- paste(c("DNA","chromatin"),collapse = "|")
keywords
#dna GOTERM
<- lapply(GOTABLES,function(x)x[grep(keywords,x$Term),])
TERMS.dna <- do.call(rbind.data.frame,TERMS.dna)
TERMS.dna
#Sort Them
<- TERMS.dna[order(TERMS.dna$p.adjust),]
TERMS.dna <- TERMS.dna %>% group_by(organism) %>%
TERMS.dna mutate(pos = 1:n()) %>% filter(pos <= 3) %>% as.data.frame()
$condition <-gsub("\\.","/",TERMS.dna$condition)
TERMS.dna$condition <- gsub("3","3/",TERMS.dna$condition)
TERMS.dna
#BARPLOT
pdf("../out/plots/p06MSData.pdf",width = 7,height = 8)
<- ggplot(data=TERMS.dna) +
p06MSData geom_bar(aes(y=Term,x=-log10(p.adjust)),fill='#F9B4AF',
color=NA,stat = "identity") +
facet_wrap(.~organism,ncol=1,scales = "free_y") + theme_bw() +
scale_fill_brewer(palette = "Pastel1") +
geom_text(aes(y=Term,x=-log10(p.adjust)/2,label=condition)) +
geom_text(aes(y=Term,x=-log10(p.adjust)+0.25,label=Significant),color="red")
p06MSDatadev.off()
## png
## 2
p06MSData
14.8 GO pie plot
In this sections we represent the different composition in enrichment as a pie plot first we load all the genes related to the terms enriched in the data. We also need to change the character to lower or to title for some specific organism to match the GOannotation.
<-lapply(PARALOGS,function(x)unlist(x[x$dir=='opposite',c('p.x','p.y')]))
Opposite_p <-lapply(Opposite_p,unique)
Opposite_p names(Opposite_p) <- paste(DataInfo$condition,DataInfo$species,sep = '_')
#Change strtotitle for gene name only for some species
<- Opposite_p[grep('Mm|Rn',names(Opposite_p))]
to_change #Substitute in List
grep('Mm|Rn',names(Opposite_p))] <- lapply(to_change,
Opposite_p[::str_to_title)
stringr#For zebrafish genenames are alltolower
<- Opposite_p[grep('Dr',names(Opposite_p))]
to_change #Substitute in List
grep('Dr',names(Opposite_p))] <- lapply(to_change,tolower) Opposite_p[
We then extract the GOAnnotation for the enrichmed GOTerm and create a
dataframe called LIST_l
that contains all the information
#Extract gene in each go term
<- lapply(seq_along(Opposite_p),function(x)
LIST_l
{<- unlist(spatialR::ShowGenesGOTerms(GOENRICH[[x]],Opposite_p[[x]]))
df <- df %>% as.data.frame(row.names = names(df))
df $GO <- gsub('\\d+$','',rownames(df))
df$cond <- DataInfo$condition[x];df$species <- DataInfo$species[x]
dfcolnames(df) <- c('gene','GO','cond','species')
return(df)
names(LIST_l) <- paste(DataInfo$condition,DataInfo$species,sep = '_')
});<- do.call(rbind.data.frame,LIST_l) LIST_l
We grep on keyword related to DNA and chromatin.
<- paste(c("DNA","chromatin"),collapse = "|")
keywords_d <- paste(c("vesicle","synapse","transport"),collapse = "|")
keywords_t
<- LIST_l[grep(keywords_t,LIST_l$GO),'gene']
transport_g <- LIST_l[grep(keywords_d,LIST_l$GO),'gene']
dna_g
$subset <- 'others'
LIST_l$subset[LIST_l$gene %in% transport_g] <- 'Transport,synapse,vesicles'
LIST_l$subset[LIST_l$gene %in% dna_g] <- 'DNA and chromatin'
LIST_l<- unique(LIST_l[,c('gene','cond','species','subset')]) LIST_l
We change the order of some factors
#Summarize data
<- c('Transport,synapse,vesicles','DNA and chromatin','others')
Order_l
<- LIST_l %>% group_by(subset,cond,species) %>% mutate(count=n()) %>%
LIST_l_s ::select(cond,species,subset,count) %>% as.data.frame() %>%
dplyrunique() %>%
arrange(factor(subset,levels = rev(Order_l))) %>%
group_by(cond,species) %>%
mutate(perc = count/sum(count),
ypos = cumsum(perc)-(perc/2)) %>%
mutate(lab = paste(format(perc*100,digits=1),'%',sep = ''))
#Order factors
<- c('NPC.iPS','Neu.IPS','Neu.NPC','DIV3.DIV0','DIV10.DIV0','DIV10.DIV3','DIV5.DIV1',
Order 'DIV14.DIV1','Neur.Stem')
$cond <- factor(LIST_l_s$cond,levels = Order)
LIST_l_s$subset <- factor(LIST_l_s$subset,levels = Order_l) LIST_l_s
We plot the pie plot.
<- coord_polar(theta = "y",start = 0,direction = 1)
cp $is_free <- function() TRUE
cp
pdf(paste('../out/figures/FigSupp4/GOPie_',Sys.Date(),'.pdf',sep = ''),
width = 6.75,height = 4.41)
<- ggplot(LIST_l_s, aes(x="", y=perc, fill=subset)) +
go_pie geom_bar(stat="identity", width=1) +
+ theme_void() +
cp facet_wrap(.~cond,scales = 'free',nrow = 1) +
geom_text(aes(y=ypos,label=lab),size=3) +
theme(aspect.ratio = 1,legend.position = 'top') + xlab('') + ylab('') +
scale_fill_manual(values = c('#B4CDE2','#F9B4AF','#DDDDDD'))
go_piedev.off()
## png
## 2
go_pie
14.9 FigSupp5
pdf(paste("../out/figures/FigSupp5/SuppFig5_",Sys.Date(),".pdf",sep = ''),
width = 9,height = 6)
<- p02MSData
Supp.Fig5 p02MSData
## Warning: Removed 1703 rows containing non-finite values (stat_boxplot).
dev.off()
## png
## 2
Supp.Fig5
## Warning: Removed 1703 rows containing non-finite values (stat_boxplot).
14.10 FigSupp6
pdf(paste("../out/figures/FigSupp6/ParalogScatter_",Sys.Date(),".pdf",sep = ''),
width = 7.5,height = 6.5)
<- p03MSData
Supp.Fig6
Supp.Fig6dev.off()
## png
## 2
Supp.Fig6
14.11 SuppFig6
pdf(paste("../out/figures/FigSupp6/GObarplot_",Sys.Date(),".pdf",sep = ''),
width = 12,height = 5.26)
<- plot_grid(p05MSData,p06MSData,ncol = 2)
GObarplot
GObarplotdev.off()
## png
## 2
GObarplot
save.image("../Markdown/workspaces/05-MSProteomicsData_ParalogsCompleteWorkspace.RData")
14.12 Supp Figure 4
pdf(paste('../out/figures/FigSupp4/SuppFig4_Updated',Sys.Date(),'.pdf',sep=''),
width = 9,height = 15)
<- plot_grid(p03MSData,go_pie,GObarplot,nrow = 3,
SuppFig4 rel_heights = c(1,0.3,0.8),labels = c('A','B'))
SuppFig4dev.off()
## png
## 2
14.13 Write Tables
14.13.1 Table S5
<- list('Paralog Pairs' = All.Paralogs.Dataset[,-c(5,7,11)],
l 'GOEnrichment (ORA)' = do.call(rbind.data.frame,GOTABLES))
#Write table
::write.xlsx(l,'../out/tables/Table_S5.xlsx') openxlsx