Section 11 Process Freese et al. 2017

Here we reprocess and calculate differential expression analysis for the dataset of Frese et al 2017. The data were analized again in MaxQuant.

library(org.Rn.eg.db)
library(limma)

#Load the MaxQuant quantification
frese <- read.delim("../Data/Dataset/frese_et_al/combined/txt/proteinGroups.txt",
                    sep = "\t",header = T)

11.1 Process Ratios

In this chunk of code we read the file coming from the MaxQuant analysis and we extract the H,M,L ratios for the different replicates, and we convert them in log2, creating a matrix of ratios.

frese.m <- frese[,c(2,grep("Ratio\\.\\w.\\w.normalized.rep",colnames(frese)))]
frese.m <- frese.m[,-1] %>% as.data.frame()

#log2
frese.m <- log2(frese.m)

#create a matrix
rownames(frese.m) <- frese$Majority.protein.IDs
ROWS <- rownames(frese.m)

head(frese.m)

##                   Ratio.M.L.normalized.rep1 Ratio.H.L.normalized.rep1
## A0A059NZR0;Q6AXU6                -0.2657608                -1.2261724
## A0A059NZV6;Q5BK20                       NaN                       NaN
## A0A068FP44;Q4AE70                -0.3324621                -0.5402117
## A0A096MIS3;Q63186                 0.7085846                 0.2417182
## A0A096MIS7;Q66HC1                -0.6612961                -1.5155215
## A0A096MIT7;O70441                 0.1054099                 0.3747335
##                   Ratio.H.M.normalized.rep1 Ratio.M.L.normalized.rep2
## A0A059NZR0;Q6AXU6                -0.9985004                 1.3977482
## A0A059NZV6;Q5BK20                       NaN                       NaN
## A0A068FP44;Q4AE70                -0.2037013                 0.8927407
## A0A096MIS3;Q63186                -0.3656855                -0.3916260
## A0A096MIS7;Q66HC1                -0.8285882                 0.9987010
## A0A096MIT7;O70441                 0.2879450                -1.0583528
##                   Ratio.H.L.normalized.rep2 Ratio.H.M.normalized.rep2
## A0A059NZR0;Q6AXU6               0.986592967                -0.4531031
## A0A059NZV6;Q5BK20                       NaN                       NaN
## A0A068FP44;Q4AE70               0.390447578                -0.3907935
## A0A096MIS3;Q63186               0.410992287                 0.8835426
## A0A096MIS7;Q66HC1               0.656816680                -0.1274312
## A0A096MIT7;O70441               0.003890027                 0.9553872

After we change column names accoring to the different conditions that we have in the dataset, we also need to adjust to the correct ratios, that we want to use, for this we will need to change sign to the log2 ratio in order to have the correct comparison of interest. Finally we remove the contaminants.

#Change names of the ratios
colnames(frese.m) <- c("Log.DIV5.DIV1.rep1","Log.DIV14.DIV1.rep1","Log.DIV14.DIV5.rep1",
                       "Log.DIV14.DIV1.rep2","Log.DIV14.DIV5.rep2","Log.DIV5.DIV1.rep2")

#Change sign of the ratios, since some of the label are swaped to get the correct
#sign we need to invert the ratios
frese.m[,c(4,5)] <- frese.m[,c(4,5)]*(-1)
#Convert in Numeric
frese.m <- apply(frese.m,2,as.numeric)
#add ID
rownames(frese.m) <- ROWS
#Remove Contaminants
frese.m <- frese.m[-grep("CON",rownames(frese.m)),]

After having our matrix we calculate the mean Log between replicates, we create a new annotation column called id in order to run the spatialR package function Annotate that is used for adding the genename information to the datasets.

Conds <- c("Log.DIV5.DIV1","Log.DIV14.DIV1","Log.DIV14.DIV5")
#Calculate average reps
frese.m <- sapply(Conds,function(x)rowMeans(frese.m[,grep(x,colnames(frese.m))],na.rm = T))
frese.m <- as.data.frame(frese.m)
#Add Annotations
frese.m$id <- rownames(frese.m)
## Add Annotation
frese.m <- spatialR::Annotate(frese.m,organism = "Rn",idcol = "id",annot = "SYMBOL")
frese.m$SYMBOL <- toupper(frese.m$SYMBOL)

To check that the comparison are comparable to the one obtained by the authors, we compare to the original data.

frese.ori <- "../Data/Dataset/Frese_et_al.csv"
frese.ori <- read.delim(frese.ori,sep = ",",header = T)
#Merge with previous to see if is similar
frese.merged <- merge(frese.m,frese.ori,by.x="SYMBOL",by.y="gene.name")

To check that the comparisons are done correctly. The column marked with average represents the original values from Frese et al. and the other ones represents the values that we obtained with our analysis.

library(GGally)
ggpairs(frese.merged[,c(15:16,2:4)]) + theme_bw()

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 218 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 217 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 218 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 217 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values

## Warning: Removed 218 rows containing missing values (geom_point).

## Warning: Removed 218 rows containing missing values (geom_point).

## Warning: Removed 218 rows containing non-finite values (stat_density).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 219 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values

## Warning: Removed 217 rows containing missing values (geom_point).

## Warning: Removed 217 rows containing missing values (geom_point).

## Warning: Removed 219 rows containing missing values (geom_point).

## Warning: Removed 217 rows containing non-finite values (stat_density).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 220 rows containing missing values

## Warning: Removed 220 rows containing missing values (geom_point).

## Warning: Removed 220 rows containing missing values (geom_point).

## Warning: Removed 220 rows containing missing values (geom_point).

## Warning: Removed 220 rows containing missing values (geom_point).

## Warning: Removed 220 rows containing non-finite values (stat_density).

11.1.1 fdrtools

Finally we calculate false discovery rate statistics with the fdrtool package.

library(fdrtool)
#DATA
frese.m <- frese.m[complete.cases(frese.m),]
fdrtool.res <- apply(frese.m[,c(1:3)],2,function(x)fdrtool(x, statistic = "normal"))

## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting

## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting

## 
## Step 1... determine cutoff point
## Step 2... estimate parameters of null distribution and eta0
## Step 3... compute p-values and estimate empirical PDF/CDF
## Step 4... compute q-values and local fdr
## Step 5... prepare for plotting

#add information to dataset
pvals <- lapply(fdrtool.res,function(x)x$pval)
qvals <- lapply(fdrtool.res,function(x)x$qval)

pvals <- do.call(cbind.data.frame,pvals);
colnames(pvals) <- paste(colnames(pvals),"pvalue",sep = ".")
qvals <- do.call(cbind.data.frame,qvals);
colnames(qvals) <- paste(colnames(qvals),"qvalue",sep = ".")

frese.m <- cbind.data.frame(frese.m,pvals,qvals)

11.1.2 Add iBAQ Intensity Values

Then we take only the iBAQ values L, M, and H, we convert in Log2 and create an intensity matrix for all the proteins in the data. We set 0 values to NA. We then remove some excess columns and convert everything in Log2. We end up with a matrix.

#grep only intensity
frese <- frese[,c(2,grep("iBAQ.",colnames(frese)))]
frese <- frese[,-c(grep("iBAQ.rep",colnames(frese)))]
frese <- frese[-c(2:4)]
#Set 0 to NA
frese[frese==0] <- NA

#create a matrix
frese.m.2 <- log2(frese[,-1])
rownames(frese.m.2) <- frese$Majority.protein.IDs

Finally we change column names, according to the different conditions that we have.

#change intensity to condition names
colnames(frese.m.2)[grepl("iBAQ.L.rep1|M.rep2",colnames(frese.m.2))] <-
  c("DIV1.rep1","DIV1.rep2")
#change intensity to condition names
colnames(frese.m.2)[grepl(".M.rep1|H.rep2",colnames(frese.m.2))] <-
  c("DIV5.rep1","DIV5.rep2")
#change intensity to condition names
colnames(frese.m.2)[grepl(".H.rep1|L.rep2",colnames(frese.m.2))] <-
  c("DIV14.rep1","DIV14.rep2")

merge the datasets

#Add ID
frese.m.2$id <- rownames(frese.m.2)
#merge them together
frese.m <- merge(frese.m.2,frese.m,by="id")

Add Condition columns, that will be needed for further analysis.

frese.m$condition1 <- "DIV5.DIV1"
frese.m$condition2 <- "DIV14.DIV1"

head(frese.m)

##                  id DIV1.rep1 DIV5.rep1 DIV14.rep1 DIV14.rep2 DIV1.rep2
## 1 A0A059NZR0;Q6AXU6  28.55237  28.18754   27.40649   25.37855  27.09929
## 2 A0A068FP44;Q4AE70  23.25130  22.13474   22.27634   21.45448  22.65535
## 3 A0A096MIS3;Q63186  21.79658  22.13837   21.93398   19.94407  19.21312
## 4 A0A096MIS7;Q66HC1  20.47635  19.14044   18.70022   17.15551  19.16999
## 5 A0A096MIT7;O70441  23.32704  23.29151   23.76517   21.44071  20.58848
## 6 A0A096MIV5;A2VD14  22.52246  21.05993   20.50833   19.52755  19.60364
##   DIV5.rep2 Log.DIV5.DIV1 Log.DIV14.DIV1 Log.DIV14.DIV5    SYMBOL
## 1  26.28588    -0.3594319     -1.3119603     -0.9925467      JPT1
## 2  21.65714    -0.3616278     -0.7164762     -0.2970744  NA;CARM1
## 3  20.24609     0.7960636      0.3166721     -0.3883389 NA;EIF2B4
## 4  18.32562    -0.3943637     -1.2571113     -0.7427024  NA;TDRD3
## 5  21.47234     0.5303985      0.7165431      0.1420275   NA;SYN3
## 6  19.20381    -0.4328050     -0.1109626      0.1784916     ABCF2
##   Log.DIV5.DIV1.pvalue Log.DIV14.DIV1.pvalue Log.DIV14.DIV5.pvalue
## 1            0.5291643             0.1870108            0.09136559
## 2            0.5266509             0.4711684            0.61334924
## 3            0.1634012             0.7501166            0.50891275
## 4            0.4899178             0.2061186            0.20649076
## 5            0.3530937             0.4711270            0.80910814
## 6            0.4486057             0.9111421            0.76143339
##   Log.DIV5.DIV1.qvalue Log.DIV14.DIV1.qvalue Log.DIV14.DIV5.qvalue condition1
## 1            0.9578073             0.8653271             0.7923482  DIV5.DIV1
## 2            0.9576145             0.9418219             0.9624281  DIV5.DIV1
## 3            0.8751529             0.9626487             0.9550643  DIV5.DIV1
## 4            0.9545808             0.8762669             0.8960909  DIV5.DIV1
## 5            0.9380708             0.9418171             0.9712571  DIV5.DIV1
## 6            0.9506048             0.9690454             0.9695124  DIV5.DIV1
##   condition2
## 1 DIV14.DIV1
## 2 DIV14.DIV1
## 3 DIV14.DIV1
## 4 DIV14.DIV1
## 5 DIV14.DIV1
## 6 DIV14.DIV1

Clean NAs from gene Symbol

frese.m$SYMBOL <- gsub("NA;|NA|;NA","",frese.m$SYMBOL)

We finally create a dataframe, that we save as Frese_et_al_2017_processed.csv and proceed to save the workspace.

write.csv(frese.m,"../Data/Dataset/processed/Frese_et_al_2017_processed.csv",row.names = F)

And create a specific workspace for this markdown.

save.image("../Markdown/workspaces/03-MS-ZebrafishData_CompleteWorkspace.RData")