pre-selected gene vs TCGA expression 비교

- 11월 21, 2024

library(TCGAbiolinks)

library(limma)

library(edgeR)
library(ggrepel)

#CRC data download
query <- GDCquery(project = "TCGA-COAD", 
data.category = "Transcriptome Profiling", 
data.type = "Gene Expression Quantification", 
workflow.type = "STAR - Counts")

GDCdownload(query)

colon_data <- GDCprepare(query)


#TCGA-CRC data에서 gene_name & FPKM 값 추출
gene_name <- rowData(colon_data)$gene_name
fpkm_values <- assay(colon_data, "fpkm_unstrand")
df <- data.frame(values = gene_name)
expression_data <- data.frame(gene_name = df, fpkm = fpkm_values)

#pre-selected gene list 생성
#기존의 output으로 만든 파일 활용

file_path <- "top_markers_통합.xlsx"

sheet1_data <- read_excel(file_path, sheet = 1)
sheet2_data <- read_excel(file_path, sheet = 2)
sheet3_data <- read_excel(file_path, sheet = 3)
sheet4_data <- read_excel(file_path, sheet = 4)

F1 <- sheet1_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
F2 <- sheet2_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
F3 <- sheet3_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
F4 <- sheet4_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)

#pre-selection된 gene과 overlapping
intersection_df <- merge(df, F1, by.x = "values", by.y = "gene")

#overlap된 유전자만 발현값 filtering
F1_F <- sheet2[sheet1$gene %in% intersection_df$values, ]
CRC_1F <- expression_data[expression_data$values%in% intersection_df$values,]

#TCGA data (CRC_1F) normal vs cancer 분석

sample_ids <- colnames(CRC_1F)
normal_samples <- sample_ids[grepl("11A", sample_ids)]  # 정상 샘플
cancer_samples <- sample_ids[grepl("01A", sample_ids)]  # 암 샘플
CRC_1F_normal <- CRC_1F [, normal_samples]
CRC_1F_cancer <- CRC_1F [, cancer_samples]
print(dim(CRC_1F_normal))
print(dim(CRC_1F_cancer))

group <- factor(c(rep("Normal", length(normal_samples)), rep("Cancer", length(cancer_samples))))
CRC1F_combined <- cbind(CRC_1F_normal, CRC_1F_cancer)
design <- model.matrix(~ group)


#data가 ENSG 이기 때문에 symbol 이름 추가

libraray(ensembl)
ensembl <- useMart("ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")

results <- topTable(fit, coef = 2, n = Inf)
gene_ids_clean <- sub("\\.\\d+$", "", rownames(results))


gene_symbols <- getBM(attributes = c("ensembl_gene_id", "hgnc_symbol"),
                      filters = "ensembl_gene_id",
                      values = gene_ids_clean,
                      mart = ensembl)

# Gene Symbol을 results에 추가
results$Gene_Symbol <- gene_symbols$hgnc_symbol[match(rownames(results), gene_symbols$ensembl_gene_id)]

# 결과 확인
head(results)

# 열 순서 바꾸기
results <- results[, c("Gene_Symbol", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val","B")]

common_genes <- intersect(F1$gene, results$Gene_Symbol)
merged_data <- data.frame(
gene = common_genes,
expression_A = F1_F$avg_log2FC[match(common_genes, F1_F$gene)],
expression_B = results$logFC[match(common_genes, results$Gene_Symbol)]
)

# Scatter plot with ggrepel for labels (expression_B >= 0.5)
ggplot(merged_data, aes(x = expression_A, y = expression_B)) +
geom_point(aes(color = gene), size = 2) +  # 점 표시
geom_text_repel(aes(label = ifelse(abs(expression_B) >= 0.5, gene, "")),  # expression_B >= 0.5일 때만 이름 표시
size = 3, color = "black", box.padding = 0.35, point.padding = 0.5) +  # 이름 표시 설정
labs(title = "Gene Expression Comparison (A vs B)",
x = "Expression A", y = "Expression B") +  # 제목과 축 레이블
theme_minimal() +  # 깔끔한 테마
theme(legend.position = "none")  # 범례 숨기기


#Survival 분석
clinical_data <- colData(colon_data)
survival_data <- clinical_data[, c("sample", "vital_status", "ajcc_pathologic_stage", "sample_type")]
library(survival)
library(ggplot2)
library(TCGAbiolinks)

이 블로그 검색

SJ's blog

pre-selected gene vs TCGA expression 비교

댓글

댓글 쓰기

이 블로그의 인기 게시물

#single cell sequencing 기초 분석 - #1 R 설치 및 package 설치

리눅스 기초 #10 GATK calling을 사용하기 위하여, reference file indexing하는 방법

Single cell 분석을 위한 package 소개