Section 11 Process Freese et al. 2017
Here we reprocess and calculate differential expression analysis for the dataset of Frese et al 2017. The data were analized again in MaxQuant.
library(org.Rn.eg.db)
library(limma)
#Load the MaxQuant quantification
<- read.delim("../Data/Dataset/frese_et_al/combined/txt/proteinGroups.txt",
frese sep = "\t",header = T)
11.1 Process Ratios
In this chunk of code we read the file coming from the MaxQuant analysis and we extract the H,M,L ratios for the different replicates, and we convert them in log2, creating a matrix of ratios.
<- frese[,c(2,grep("Ratio\\.\\w.\\w.normalized.rep",colnames(frese)))]
frese.m <- frese.m[,-1] %>% as.data.frame()
frese.m
#log2
<- log2(frese.m)
frese.m
#create a matrix
rownames(frese.m) <- frese$Majority.protein.IDs
<- rownames(frese.m) ROWS
head(frese.m)
## Ratio.M.L.normalized.rep1 Ratio.H.L.normalized.rep1
## A0A059NZR0;Q6AXU6 -0.2657608 -1.2261724
## A0A059NZV6;Q5BK20 NaN NaN
## A0A068FP44;Q4AE70 -0.3324621 -0.5402117
## A0A096MIS3;Q63186 0.7085846 0.2417182
## A0A096MIS7;Q66HC1 -0.6612961 -1.5155215
## A0A096MIT7;O70441 0.1054099 0.3747335
## Ratio.H.M.normalized.rep1 Ratio.M.L.normalized.rep2
## A0A059NZR0;Q6AXU6 -0.9985004 1.3977482
## A0A059NZV6;Q5BK20 NaN NaN
## A0A068FP44;Q4AE70 -0.2037013 0.8927407
## A0A096MIS3;Q63186 -0.3656855 -0.3916260
## A0A096MIS7;Q66HC1 -0.8285882 0.9987010
## A0A096MIT7;O70441 0.2879450 -1.0583528
## Ratio.H.L.normalized.rep2 Ratio.H.M.normalized.rep2
## A0A059NZR0;Q6AXU6 0.986592967 -0.4531031
## A0A059NZV6;Q5BK20 NaN NaN
## A0A068FP44;Q4AE70 0.390447578 -0.3907935
## A0A096MIS3;Q63186 0.410992287 0.8835426
## A0A096MIS7;Q66HC1 0.656816680 -0.1274312
## A0A096MIT7;O70441 0.003890027 0.9553872
After we change column names accoring to the different conditions that we have in the dataset, we also need to adjust to the correct ratios, that we want to use, for this we will need to change sign to the log2 ratio in order to have the correct comparison of interest. Finally we remove the contaminants.
#Change names of the ratios
colnames(frese.m) <- c("Log.DIV5.DIV1.rep1","Log.DIV14.DIV1.rep1","Log.DIV14.DIV5.rep1",
"Log.DIV14.DIV1.rep2","Log.DIV14.DIV5.rep2","Log.DIV5.DIV1.rep2")
#Change sign of the ratios, since some of the label are swaped to get the correct
#sign we need to invert the ratios
c(4,5)] <- frese.m[,c(4,5)]*(-1)
frese.m[,#Convert in Numeric
<- apply(frese.m,2,as.numeric)
frese.m #add ID
rownames(frese.m) <- ROWS
#Remove Contaminants
<- frese.m[-grep("CON",rownames(frese.m)),] frese.m
After having our matrix we calculate the mean Log between replicates, we create a new annotation column called id
in order to run the spatialR
package function Annotate
that is used for adding the genename information to the datasets.
<- c("Log.DIV5.DIV1","Log.DIV14.DIV1","Log.DIV14.DIV5")
Conds #Calculate average reps
<- sapply(Conds,function(x)rowMeans(frese.m[,grep(x,colnames(frese.m))],na.rm = T))
frese.m <- as.data.frame(frese.m)
frese.m #Add Annotations
$id <- rownames(frese.m)
frese.m## Add Annotation
<- spatialR::Annotate(frese.m,organism = "Rn",idcol = "id",annot = "SYMBOL")
frese.m $SYMBOL <- toupper(frese.m$SYMBOL) frese.m
To check that the comparison are comparable to the one obtained by the authors, we compare to the original data.
<- "../Data/Dataset/Frese_et_al.csv"
frese.ori <- read.delim(frese.ori,sep = ",",header = T)
frese.ori #Merge with previous to see if is similar
<- merge(frese.m,frese.ori,by.x="SYMBOL",by.y="gene.name") frese.merged
To check that the comparisons are done correctly. The column marked with average
represents the original values from Frese et al. and the other ones represents the values that we obtained with our analysis.
library(GGally)
ggpairs(frese.merged[,c(15:16,2:4)]) + theme_bw()
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 218 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 217 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 218 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 217 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values
## Warning: Removed 218 rows containing missing values (geom_point).
## Warning: Removed 218 rows containing missing values (geom_point).
## Warning: Removed 218 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 219 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values
## Warning: Removed 217 rows containing missing values (geom_point).
## Warning: Removed 217 rows containing missing values (geom_point).
## Warning: Removed 219 rows containing missing values (geom_point).
## Warning: Removed 217 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values
## Warning: Removed 220 rows containing missing values (geom_point).
## Warning: Removed 220 rows containing missing values (geom_point).
## Warning: Removed 220 rows containing missing values (geom_point).
## Warning: Removed 220 rows containing missing values (geom_point).
## Warning: Removed 220 rows containing non-finite values (stat_density).
11.1.1 fdrtools
Finally we calculate false discovery rate statistics with the fdrtool
package.
library(fdrtool)
#DATA
<- frese.m[complete.cases(frese.m),]
frese.m <- apply(frese.m[,c(1:3)],2,function(x)fdrtool(x, statistic = "normal")) fdrtool.res
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting
##
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting
#add information to dataset
<- lapply(fdrtool.res,function(x)x$pval)
pvals <- lapply(fdrtool.res,function(x)x$qval)
qvals
<- do.call(cbind.data.frame,pvals);
pvals colnames(pvals) <- paste(colnames(pvals),"pvalue",sep = ".")
<- do.call(cbind.data.frame,qvals);
qvals colnames(qvals) <- paste(colnames(qvals),"qvalue",sep = ".")
<- cbind.data.frame(frese.m,pvals,qvals) frese.m
11.1.2 Add iBAQ Intensity Values
Then we take only the iBAQ values L, M, and H, we convert in Log2 and create an intensity matrix for all the proteins in the data. We set 0 values to NA. We then remove some excess columns and convert everything in Log2. We end up with a matrix.
#grep only intensity
<- frese[,c(2,grep("iBAQ.",colnames(frese)))]
frese <- frese[,-c(grep("iBAQ.rep",colnames(frese)))]
frese <- frese[-c(2:4)]
frese #Set 0 to NA
==0] <- NA
frese[frese
#create a matrix
.2 <- log2(frese[,-1])
frese.mrownames(frese.m.2) <- frese$Majority.protein.IDs
Finally we change column names, according to the different conditions that we have.
#change intensity to condition names
colnames(frese.m.2)[grepl("iBAQ.L.rep1|M.rep2",colnames(frese.m.2))] <-
c("DIV1.rep1","DIV1.rep2")
#change intensity to condition names
colnames(frese.m.2)[grepl(".M.rep1|H.rep2",colnames(frese.m.2))] <-
c("DIV5.rep1","DIV5.rep2")
#change intensity to condition names
colnames(frese.m.2)[grepl(".H.rep1|L.rep2",colnames(frese.m.2))] <-
c("DIV14.rep1","DIV14.rep2")
merge the datasets
#Add ID
.2$id <- rownames(frese.m.2)
frese.m#merge them together
<- merge(frese.m.2,frese.m,by="id") frese.m
Add Condition columns, that will be needed for further analysis.
$condition1 <- "DIV5.DIV1"
frese.m$condition2 <- "DIV14.DIV1" frese.m
head(frese.m)
## id DIV1.rep1 DIV5.rep1 DIV14.rep1 DIV14.rep2 DIV1.rep2
## 1 A0A059NZR0;Q6AXU6 28.55237 28.18754 27.40649 25.37855 27.09929
## 2 A0A068FP44;Q4AE70 23.25130 22.13474 22.27634 21.45448 22.65535
## 3 A0A096MIS3;Q63186 21.79658 22.13837 21.93398 19.94407 19.21312
## 4 A0A096MIS7;Q66HC1 20.47635 19.14044 18.70022 17.15551 19.16999
## 5 A0A096MIT7;O70441 23.32704 23.29151 23.76517 21.44071 20.58848
## 6 A0A096MIV5;A2VD14 22.52246 21.05993 20.50833 19.52755 19.60364
## DIV5.rep2 Log.DIV5.DIV1 Log.DIV14.DIV1 Log.DIV14.DIV5 SYMBOL
## 1 26.28588 -0.3594319 -1.3119603 -0.9925467 JPT1
## 2 21.65714 -0.3616278 -0.7164762 -0.2970744 NA;CARM1
## 3 20.24609 0.7960636 0.3166721 -0.3883389 NA;EIF2B4
## 4 18.32562 -0.3943637 -1.2571113 -0.7427024 NA;TDRD3
## 5 21.47234 0.5303985 0.7165431 0.1420275 NA;SYN3
## 6 19.20381 -0.4328050 -0.1109626 0.1784916 ABCF2
## Log.DIV5.DIV1.pvalue Log.DIV14.DIV1.pvalue Log.DIV14.DIV5.pvalue
## 1 0.5291643 0.1870108 0.09136559
## 2 0.5266509 0.4711684 0.61334924
## 3 0.1634012 0.7501166 0.50891275
## 4 0.4899178 0.2061186 0.20649076
## 5 0.3530937 0.4711270 0.80910814
## 6 0.4486057 0.9111421 0.76143339
## Log.DIV5.DIV1.qvalue Log.DIV14.DIV1.qvalue Log.DIV14.DIV5.qvalue condition1
## 1 0.9578073 0.8653271 0.7923482 DIV5.DIV1
## 2 0.9576145 0.9418219 0.9624281 DIV5.DIV1
## 3 0.8751529 0.9626487 0.9550643 DIV5.DIV1
## 4 0.9545808 0.8762669 0.8960909 DIV5.DIV1
## 5 0.9380708 0.9418171 0.9712571 DIV5.DIV1
## 6 0.9506048 0.9690454 0.9695124 DIV5.DIV1
## condition2
## 1 DIV14.DIV1
## 2 DIV14.DIV1
## 3 DIV14.DIV1
## 4 DIV14.DIV1
## 5 DIV14.DIV1
## 6 DIV14.DIV1
Clean NAs from gene Symbol
$SYMBOL <- gsub("NA;|NA|;NA","",frese.m$SYMBOL) frese.m
We finally create a dataframe, that we save as Frese_et_al_2017_processed.csv
and proceed to save the workspace.
write.csv(frese.m,"../Data/Dataset/processed/Frese_et_al_2017_processed.csv",row.names = F)
And create a specific workspace for this markdown.
save.image("../Markdown/workspaces/03-MS-ZebrafishData_CompleteWorkspace.RData")