library(GDCRNATools)
library(sqldf)
library(gplots)

# Working directory for the downloaded data.
# Change this to a folder on your computer; every relative path below
# (download folders and output files) is resolved against it.
setwd("tcga-data/")

project <- "TCGA-BRCA"
# Per-data-type download folders, nested under the project id.
rnadir <- file.path(project, "RNAseq")
mirdir <- file.path(project, "miRNAs")

### Download RNAseq data
gdcRNADownload(
  project.id = project,
  data.type = "RNAseq",
  directory = rnadir,
  method = "gdc-client", ## use gdc-client tool to download data
  write.manifest = TRUE
)

### Download miRNAs data
gdcRNADownload(
  project.id = project,
  data.type = "miRNAs",
  directory = mirdir,
  method = "gdc-client", ## use gdc-client tool to download data
  write.manifest = TRUE
)

###########################################################

###########################################################
##                 2 Data organization                   ##


### RNAseq metadata
# Parse, then drop duplicated samples and every sample that is neither
# Primary Tumor nor Solid Tissue Normal.
metaMatrix.RNA <- gdcParseMetadata(
  data.type = "RNAseq",
  project.id = project,
  write.meta = FALSE
)
metaMatrix.RNA <- gdcFilterSampleType(gdcFilterDuplicate(metaMatrix.RNA))

### miRNAs metadata
# Same parsing and filtering steps as for RNAseq.
metaMatrix.MIR <- gdcParseMetadata(
  data.type = "miRNAs",
  project.id = project,
  write.meta = FALSE
)
metaMatrix.MIR <- gdcFilterSampleType(gdcFilterDuplicate(metaMatrix.MIR))

# Samples present in both the RNAseq and the miRNAs metadata
samples <- intersect(
  as.vector(metaMatrix.RNA$sample),
  as.vector(metaMatrix.MIR$sample)
)

# Keep RNAseq samples that (a) also have miRNA data, (b) have a known
# tumor stage and (c) are not annotated as "stagex".
# The is.na() term is FALSE for missing stages, so the NA produced by the
# `!=` comparison on those rows can never select a row.
metaMatrix.RNA <- metaMatrix.RNA[
  metaMatrix.RNA$sample %in% samples &
    !is.na(metaMatrix.RNA$tumor_stage) &
    metaMatrix.RNA$tumor_stage != "stagex",
]

# Restrict the miRNAs metadata to the samples that survived the RNAseq
# filters (all of which are common samples by construction).
metaMatrix.MIR <- metaMatrix.MIR[
  metaMatrix.MIR$sample %in% metaMatrix.RNA$sample,
]

### Merge raw counts data
# One count table per data type, built from the files listed in the
# filtered metadata.
# organized = FALSE: presumably the raw files are still in their individual
# download folders rather than gathered into one folder; confirm against
# the gdcRNAMerge documentation.
rnaCounts <- gdcRNAMerge(
  metadata = metaMatrix.RNA,
  data.type = "RNAseq",
  path = rnadir,
  organized = FALSE
)

mirCounts <- gdcRNAMerge(
  metadata = metaMatrix.MIR,
  data.type = "miRNAs",
  path = mirdir,
  organized = FALSE
)


### TMM normalization and voom transformation
# filter = FALSE is passed for both data types, so no gene/miRNA filtering
# is requested at this step.
rnaExpr <- gdcVoomNormalization(filter = FALSE, counts = rnaCounts)
mirExpr <- gdcVoomNormalization(filter = FALSE, counts = mirCounts)

######################################################################
# DE Gene Calls
# Run the same tumor-vs-normal comparison with two methods (limma and
# edgeR) so that only genes called by both are reported below.
DEGAll.limma <- gdcDEAnalysis(
  counts = rnaCounts,
  group = metaMatrix.RNA$sample_type,
  comparison = "PrimaryTumor-SolidTissueNormal",
  method = "limma",
  n.cores = 4
)

DEGAll.edgeR <- gdcDEAnalysis(
  counts = rnaCounts,
  group = metaMatrix.RNA$sample_type,
  comparison = "PrimaryTumor-SolidTissueNormal",
  method = "edgeR",
  n.cores = 4
)

### All DEGs
# Fold-change cutoff handed to gdcDEReport(); results are then split by the
# sign of logFC into up- and down-regulated sets.
fc <- 2
deALL.limma <- gdcDEReport(deg = DEGAll.limma, gene.type = "all", fc = fc)
deALL.limma$mrna <- rownames(deALL.limma)
deALL.limma.up <- deALL.limma[deALL.limma$logFC > 0, ]
deALL.limma.down <- deALL.limma[deALL.limma$logFC < 0, ]

deALL.edgeR <- gdcDEReport(deg = DEGAll.edgeR, gene.type = "all", fc = fc)
deALL.edgeR$mrna <- rownames(deALL.edgeR)
deALL.edgeR.up <- deALL.edgeR[deALL.edgeR$logFC > 0, ]
deALL.edgeR.down <- deALL.edgeR[deALL.edgeR$logFC < 0, ]


# Combine results of two approaches by averaging
# Columns are selected by name, not by position: the limma and edgeR tables
# have different layouts, and these are exactly the columns the SQL query
# below reads.
deg.cols <- c("symbol", "group", "logFC", "PValue", "FDR", "mrna")
L.up <- deALL.limma.up[, deg.cols]
R.up <- deALL.edgeR.up[, deg.cols]
L.down <- deALL.limma.down[, deg.cols]
R.down <- deALL.edgeR.down[, deg.cols]

# Genes called in the same direction by both methods. Rows are looked up by
# row name, which equals the `mrna` column set above.
common.up <- intersect(L.up$mrna, R.up$mrna)
common.down <- intersect(L.down$mrna, R.down$mrna)

# Stack the limma and edgeR rows of every shared gene (two rows per gene).
df.up <- rbind(L.up[common.up, ], R.up[common.up, ])
df.down <- rbind(L.down[common.down, ], R.down[common.down, ])

df <- rbind(df.up, df.down)

# logFC
# One row per gene: logFC, PValue and FDR are the mean of the limma and
# edgeR values.
combined.results <- sqldf('SELECT mrna, symbol, "group", AVG(logFC) AS logFC, AVG(PValue) AS PValue, AVG(FDR) AS FDR
                          FROM df GROUP BY mrna')

################################################################
# DEG visualization
gdcVolcanoPlot(combined.results)
gdcBarPlot(deg = combined.results, angle = 45, data.type = "RNAseq")

# Identifiers of the combined DE genes; used further down to subset the
# expression matrix.
degName <- combined.results$mrna

#' Draw a row-scaled expression heatmap for a set of genes
#'
#' Subsets `rna.expr` to the requested rows and plots them with
#' `gplots::heatmap.2()`, adding a column side bar that marks
#' "SolidTissueNormal" samples in blue and every other sample in red.
#'
#' @param deg.id Row identifiers of `rna.expr` to include in the heatmap.
#' @param metadata Sample table with a `sample_type` column. Its rows are
#'   assumed to be in the same order as the columns of `rna.expr`; this is
#'   not checked here.
#' @param rna.expr Expression matrix or data frame with genes in rows and
#'   samples in columns.
#' @return The value returned by `heatmap.2()`; called for its plot.
my.heatmap <- function(deg.id, metadata, rna.expr) {
  # drop = FALSE keeps genes in rows even when a single id is requested;
  # without it a matrix row collapses to a vector and as.matrix() would
  # turn it into a one-column matrix.
  degDa <- rna.expr[deg.id, , drop = FALSE]
  sampleCol <- ifelse(metadata$sample_type == "SolidTissueNormal",
    "blue", "red"
  )
  # Relative widths/heights of the layout cells. No `lmat` is supplied, so
  # heatmap.2() uses its own default layout.
  lwid <- c(2, 4)
  lhei <- c(1, 5)
  heatmap.2(as.matrix(degDa),
    col = bluered(75), trace = "none",
    cexCol = 0.32, cexRow = 0.1, dendrogram = "none", srtCol = 90,
    adjCol = c(0.8, 0.15), density.info = "none", labRow = NA,
    key.title = NA, na.color = NA, lwid = lwid, lhei = lhei,
    margins = c(3, 3), labCol = NA, key.xlab = "Normalized intensity",
    scale = "row", ColSideColors = sampleCol
  )
}
# Heatmap of the combined DE genes across all retained RNAseq samples
my.heatmap(deg.id = degName, metadata = metaMatrix.RNA, rna.expr = rnaExpr)

####################################################################################
#                                 Writing files

# write.csv() does not create missing directories, so make sure the output
# folder exists (a no-op when it is already there).
dir.create("breast_cancer", showWarnings = FALSE, recursive = TRUE)

# Expression matrices keep their row names (feature identifiers); the
# result and metadata tables are written without row names.
write.csv(x = rnaExpr, file = "breast_cancer/rna.csv")
write.csv(x = mirExpr, file = "breast_cancer/mirna.csv")
write.csv(x = combined.results, file = "breast_cancer/diff_genes.csv", row.names = FALSE)
write.csv(x = rnaExpr[degName, ], file = "breast_cancer/diff_genes_expr.csv")
write.csv(x = metaMatrix.RNA, file = "breast_cancer/metamatrix_rna.csv", row.names = FALSE)


######################################################################
# DE mirna Calls
# Same two-method comparison as for the genes, now on the miRNA counts.
# NOTE: this reuses (and overwrites) the DEGAll.*, deALL.*, df* and
# combined.results objects created for the mRNA analysis.
DEGAll.limma <- gdcDEAnalysis(
  counts = mirCounts,
  group = metaMatrix.MIR$sample_type,
  comparison = "PrimaryTumor-SolidTissueNormal",
  method = "limma",
  n.cores = 4
)

DEGAll.edgeR <- gdcDEAnalysis(
  counts = mirCounts,
  group = metaMatrix.MIR$sample_type,
  comparison = "PrimaryTumor-SolidTissueNormal",
  method = "edgeR",
  n.cores = 4
)

### All DE mirnas
fc <- 2
deALL.limma <- gdcDEReport(deg = DEGAll.limma, gene.type = "all", fc = fc)
deALL.limma$mirna <- rownames(deALL.limma)
deALL.limma.up <- deALL.limma[deALL.limma$logFC > 0, ]
deALL.limma.down <- deALL.limma[deALL.limma$logFC < 0, ]

deALL.edgeR <- gdcDEReport(deg = DEGAll.edgeR, gene.type = "all", fc = fc)
deALL.edgeR$mirna <- rownames(deALL.edgeR)
deALL.edgeR.up <- deALL.edgeR[deALL.edgeR$logFC > 0, ]
deALL.edgeR.down <- deALL.edgeR[deALL.edgeR$logFC < 0, ]


# Combine results of two approaches by averaging
# Columns are selected by name, not by position: the limma and edgeR tables
# have different layouts, and these are exactly the columns the SQL query
# below reads.
mir.cols <- c("logFC", "PValue", "FDR", "mirna")
L.up <- deALL.limma.up[, mir.cols]
R.up <- deALL.edgeR.up[, mir.cols]
L.down <- deALL.limma.down[, mir.cols]
R.down <- deALL.edgeR.down[, mir.cols]

# miRNAs called in the same direction by both methods. Rows are looked up
# by row name, which equals the `mirna` column set above.
common.mir.up <- intersect(L.up$mirna, R.up$mirna)
common.mir.down <- intersect(L.down$mirna, R.down$mirna)

# Stack the limma and edgeR rows of every shared miRNA (two rows each).
df.up <- rbind(L.up[common.mir.up, ], R.up[common.mir.up, ])
df.down <- rbind(L.down[common.mir.down, ], R.down[common.mir.down, ])

df <- rbind(df.up, df.down)

# logFC
# One row per miRNA: logFC, PValue and FDR are the mean of the limma and
# edgeR values.
combined.results <- sqldf("SELECT mirna, AVG(logFC) AS logFC, AVG(PValue) AS PValue, AVG(FDR) AS FDR
                          FROM df GROUP BY mirna")

# write.csv() does not create missing directories, so make sure the output
# folder exists (a no-op when it is already there).
dir.create("breast_cancer", showWarnings = FALSE, recursive = TRUE)
write.csv(x = combined.results, file = "breast_cancer/diff_mirnas.csv", row.names = FALSE)
