pre-selected gene vs TCGA expression 비교
library(TCGAbiolinks)
library(limma)
library(edgeR)
library(ggrepel)
#CRC data download
query <- GDCquery(project = "TCGA-COAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts")
GDCdownload(query)
colon_data <- GDCprepare(query)
#TCGA-CRC data에서 gene_name & FPKM 값 추출
gene_name <- rowData(colon_data)$gene_name
fpkm_values <- assay(colon_data, "fpkm_unstrand")
df <- data.frame(values = gene_name)
expression_data <- data.frame(gene_name = df, fpkm = fpkm_values)
#pre-selected gene list 생성
#기존의 output으로 만든 파일 활용
file_path <- "top_markers_통합.xlsx"
sheet1_data <- read_excel(file_path, sheet = 1)
sheet2_data <- read_excel(file_path, sheet = 2)
sheet3_data <- read_excel(file_path, sheet = 3)
sheet4_data <- read_excel(file_path, sheet = 4)
F1 <- sheet1_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
F2 <- sheet2_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
F3 <- sheet3_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
F4 <- sheet4_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
#pre-selection된 gene과 overlapping
intersection_df <- merge(df, F1, by.x = "values", by.y = "gene")
#overlap된 유전자만 발현값 filtering
F1_F <- sheet2[sheet1$gene %in% intersection_df$values, ]
CRC_1F <- expression_data[expression_data$values%in% intersection_df$values,]
#TCGA data (CRC_1F) normal vs cancer 분석
sample_ids <- colnames(CRC_1F)
normal_samples <- sample_ids[grepl("11A", sample_ids)] # 정상 샘플
cancer_samples <- sample_ids[grepl("01A", sample_ids)] # 암 샘플
CRC_1F_normal <- CRC_1F [, normal_samples]
CRC_1F_cancer <- CRC_1F [, cancer_samples]
print(dim(CRC_1F_normal))
print(dim(CRC_1F_cancer))
group <- factor(c(rep("Normal", length(normal_samples)), rep("Cancer", length(cancer_samples))))
CRC1F_combined <- cbind(CRC_1F_normal, CRC_1F_cancer)
design <- model.matrix(~ group)
#data가 ENSG 이기 때문에 symbol 이름 추가
libraray(ensembl)
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
results <- topTable(fit, coef = 2, n = Inf)
gene_ids_clean <- sub("\\.\\d+$", "", rownames(results))
gene_symbols <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = gene_ids_clean,
mart = ensembl)
# Gene Symbol을 results에 추가
results$Gene_Symbol <- gene_symbols$hgnc_symbol[match(rownames(results), gene_symbols$ensembl_gene_id)]
# 결과 확인
head(results)
# 열 순서 바꾸기results <- results[, c("Gene_Symbol", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val","B")]
common_genes <- intersect(F1$gene, results$Gene_Symbol)
merged_data <- data.frame(
gene = common_genes,
expression_A = F1_F$avg_log2FC[match(common_genes, F1_F$gene)],
expression_B = results$logFC[match(common_genes, results$Gene_Symbol)]
)
# Scatter plot with ggrepel for labels (expression_B >= 0.5)
ggplot(merged_data, aes(x = expression_A, y = expression_B)) +
geom_point(aes(color = gene), size = 2) + # 점 표시
geom_text_repel(aes(label = ifelse(abs(expression_B) >= 0.5, gene, "")), # expression_B >= 0.5일 때만 이름 표시
size = 3, color = "black", box.padding = 0.35, point.padding = 0.5) + # 이름 표시 설정
labs(title = "Gene Expression Comparison (A vs B)",
x = "Expression A", y = "Expression B") + # 제목과 축 레이블
theme_minimal() + # 깔끔한 테마
theme(legend.position = "none") # 범례 숨기기
#Survival 분석
clinical_data <- colData(colon_data)
survival_data <- clinical_data[, c("sample", "vital_status", "ajcc_pathologic_stage", "sample_type")]
library(survival)
library(ggplot2)
library(TCGAbiolinks)
댓글
댓글 쓰기