# Analyze TCGA data
# (run this after the data has been downloaded)
library(TCGAbiolinks)
library(dplyr)
library(SummarizedExperiment)
# Restore the object saved in the .rda file: `load()` returns the name of the
# restored object and `get()` resolves that name to the object itself.
rse.gbm <- get(
  load(file = "rnaseq.recurTum_NormTissue.GBM_20220603.rda")
)

# Preprocess the loaded object with a 0.6 correlation cutoff on the
# "unstranded" data type. `filename` is the PNG name handed to the function
# (presumably where the correlation plot is written -- confirm in the docs).
dataPrep.gbm <- TCGAanalyze_Preprocessing(
  object = rse.gbm,
  filename = "rnaseq.recurTum_NormTissue.GBM_20220607.correlation.png",
  datatype = "unstranded",
  cor.cut = 0.6
)

# Count how many samples there are of each sample type.
table(rse.gbm$sample_type)
# Normalize the preprocessed table with the "gcContent" method, using the
# gene annotation table shipped with TCGAbiolinks (`geneInfoHT`).
gbm.rnasdeq.dataNorm <- TCGAanalyze_Normalization(
  tabDF = dataPrep.gbm,
  method = "gcContent",
  geneInfo = TCGAbiolinks::geneInfoHT
)

# Apply the quantile filter with a 0.25 cutoff to the normalized table.
gbm.rnasdeq.dataFilt <- TCGAanalyze_Filtering(
  tabDF = gbm.rnasdeq.dataNorm,
  qnt.cut = 0.25,
  method = "quantile"
)
# Column (sample) names of `rse.gbm` for each of the two sample types compared.
normalTissue <- colnames(rse.gbm)[rse.gbm$sample_type == "Solid Tissue Normal"]
# rse.gbm[rse.gbm$sample_type == "Solid Tissue Normal",]
recurTumor <- colnames(rse.gbm)[rse.gbm$sample_type == "Recurrent Tumor"]

# Split the filtered table by column into a normal and a recurrent-tumor part.
# `drop = FALSE` keeps the two-dimensional shape even if one column matches.
gbm.rnaseqNormal.dataFilt <- gbm.rnasdeq.dataFilt[
  ,
  colnames(gbm.rnasdeq.dataFilt) %in% normalTissue,
  drop = FALSE
]
gbm.rnaseqTumor.dataFilt <- gbm.rnasdeq.dataFilt[
  ,
  colnames(gbm.rnasdeq.dataFilt) %in% recurTumor,
  drop = FALSE
]
# head(gbm.rnaseqNormal.dataFilt)
# head(gbm.rnaseqTumor.dataFilt)
# recurTumor

# Inspection output: sample names before and after filtering, and the object.
colnames(rse.gbm) # [rse.gbm$sample_type == "Solid Tissue Normal"]
colnames(gbm.rnasdeq.dataFilt)
rse.gbm

# Sanity checks: both parts must have the same row names in the same order,
# and the number of distinct row names is printed for comparison.
identical(
  rownames(gbm.rnaseqTumor.dataFilt),
  rownames(gbm.rnaseqNormal.dataFilt)
)
length(unique(rownames(gbm.rnaseqTumor.dataFilt)))
# rownames(gbm.rnaseqNormal.dataFilt) <- geneIDs1$SYMBOL
# Differential expression between the two groups: condition 1 is the normal
# table, condition 2 the recurrent-tumor table. Uses the "glmLRT" method with
# cutoffs of FDR 0.01 and logFC 1.
gbm.DEG <- TCGAanalyze_DEA(
  method = "glmLRT",
  mat1 = gbm.rnaseqNormal.dataFilt,
  Cond1type = "NORMAL",
  mat2 = gbm.rnaseqTumor.dataFilt,
  Cond2type = "RECURTUMOR",
  logFC.cut = 1,
  fdr.cut = 0.01
)

# Show the first rows of the result table.
head(gbm.DEG)
# Read the `g4count` table; the join below relies on its `test_id` and
# `combined_exon_introns_window200` columns.
g4count <- read.csv("../downloadedDataGEO_20220524/Genes_G4number.in2csv.csv")
head(g4count)

# Attach the `g4count` columns to the DEA table (`gene_name` matched against
# `test_id`), rename the count column to `g4Num`, drop every row that contains
# an NA (which removes genes with no match in `g4count`), and renumber rows.
# `seq_len(nrow(.))` is used instead of `1:nrow(.)`: for an empty result
# `1:0` is c(1, 0), which does not match zero rows and makes
# as.data.frame() stop with an invalid 'row.names' error.
modDEG <- gbm.DEG %>%
  left_join(y = g4count, by = c("gene_name" = "test_id")) %>%
  dplyr::rename(g4Num = combined_exon_introns_window200) %>%
  na.omit() %>%
  as.data.frame(row.names = seq_len(nrow(.)))
# dplyr::filter(g4Num > 3)
head(modDEG)

# Preview the rows with g4Num of at least 5 and |logFC| above 1.
modDEG[(modDEG$g4Num >= 5) & (abs(modDEG$logFC) > 1), ] %>% head()
library(ggplot2)
# geom_smooth(method = "glm", method.args = list(family = "binomial"))
# }
# Scatter plot of logCPM against g4Num, coloured by PValue, restricted to rows
# with g4Num >= 4, |logFC| > 1.5 and PValue <= 0.05, with a "glm" smoother
# (including its confidence band) drawn on top of the points.
ggplot(
  data = modDEG[
    with(modDEG, (g4Num >= 4) & (abs(logFC) > 1.5) & (PValue <= 0.05)),
  ],
  mapping = aes(x = g4Num, y = logCPM, color = PValue)
) +
  geom_point() +
  # geom_smooth(se = TRUE,method = "gam", formula = y ~ s(x))
  geom_smooth(method = "glm", se = TRUE)
# Enrichment analysis on the gene names from the DEA result table.
ansEA.gbm.normalVsTumor <- TCGAanalyze_EAcomplete(
  RegulonList = gbm.DEG$gene_name,
  TFname = "DEA genes NORMAL Vs RECUR_GBM"
)

# Bar plot built from the four result tables (ResBP, ResCC, ResMF, ResPat),
# limited to 20 bars; `filename = NULL` is passed as in the original call.
TCGAvisualize_EAbarplot(
  tf = rownames(ansEA.gbm.normalVsTumor$ResBP),
  nRGTab = gbm.DEG$gene_name,
  GOBPTab = ansEA.gbm.normalVsTumor$ResBP,
  GOCCTab = ansEA.gbm.normalVsTumor$ResCC,
  GOMFTab = ansEA.gbm.normalVsTumor$ResMF,
  PathTab = ansEA.gbm.normalVsTumor$ResPat,
  nBar = 20,
  filename = NULL
)
# ) +
# geom_point(aes(x=logFC,y=-log10(PValue), alpha=-log10(PValue), size=-log10(PValue)))
# Open the help page for GDCquery.
?GDCquery

# Print the GDC project ids that contain "TCGA".
# The project list is fetched once (the original called getGDCprojects()
# twice, repeating the request) and through the exported accessor `::`
# rather than `:::`.
print(grep("TCGA", TCGAbiolinks::getGDCprojects()$project_id, value = TRUE))
# Put the normal and tumor tables side by side (normal columns first).
combData <- cbind(
  gbm.rnaseqNormal.dataFilt,
  gbm.rnaseqTumor.dataFilt
)
dim(gbm.rnaseqNormal.dataFilt)

# Keep only the rows whose values sum to more than 50000 across all samples,
# then draw a heatmap of those unscaled values.
data_subset <- combData[rowSums(combData) > 50000, ]
pheatmap::pheatmap(mat = data_subset)
#' Standardize a numeric vector to z-scores.
#'
#' Subtracts the mean of `x` and divides by its standard deviation.
#'
#' @param x A numeric vector (e.g. one row of an expression matrix).
#' @return A numeric vector the same length as `x`, with names preserved.
#'   A vector with zero standard deviation yields the centred values (all
#'   zeros) instead of the NaN produced by 0 / 0, because NaN rows make a
#'   clustered heatmap fail. NA propagation is unchanged: if `sd(x)` is NA
#'   the result is NA.
cal_z_score <- function(x) {
  centered <- x - mean(x)
  spread <- sd(x)
  # Guard only the exact zero-variance case; NA spread falls through so the
  # result stays NA as before.
  if (!is.na(spread) && spread == 0) {
    return(centered)
  }
  centered / spread
}
# Z-score every row of `data_subset`; apply() returns one column per row, so
# the result is transposed back to rows x samples before plotting.
data_subset_norm <- t(apply(data_subset, 1, cal_z_score))
pheatmap::pheatmap(data_subset_norm)

# Print the dimensions of the two groups that make up `combData`.
dim(gbm.rnaseqNormal.dataFilt)
dim(gbm.rnaseqTumor.dataFilt)

# Column annotation for the heatmap: one "normal"/"tumor" label per sample.
# The group sizes are taken from the two tables that were cbind()-ed into
# `combData` (normal first, then tumor) instead of the hard-coded c(5, 13),
# which made the row.names assignment fail whenever the counts differed.
sampCol <- data.frame(
  sample = rep(
    c("normal", "tumor"),
    c(ncol(gbm.rnaseqNormal.dataFilt), ncol(gbm.rnaseqTumor.dataFilt))
  )
)
row.names(sampCol) <- colnames(combData)
head(sampCol)
head(data_subset_norm)

# Same z-scored heatmap with the sample annotation added. The z-scores are
# reused from above; the original recomputed the identical matrix here.
pheatmap::pheatmap(data_subset_norm, annotation_col = sampCol)