*excel sheet 불러오기
file_path <- "top_markers_통합.xlsx"
sheet1_data <- read_excel(file_path, sheet = 1)
sheet2_data <- read_excel(file_path, sheet = 2)
sheet3_data <- read_excel(file_path, sheet = 3)
sheet4_data <- read_excel(file_path, sheet = 4)
top_markers의 구조
Gene/p_val/avg_log2FC/pct.1/pct.2/p_val_adj
*Gene filtering
filtered_genes <- sheet1_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(Gene)
filtered_genes2 <- sheet2_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
filtered_genes3 <- sheet3_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
filtered_genes4 <- sheet4_data %>%filter(p_val < 0.05 & abs(avg_log2FC) > 0.5) %>% select(gene)
*TCGA data filtering
gene_name 추출
gene_name <- rowData(colon_data)$gene_name
fpkm_values <- assay(colon_data, "fpkm_unstrand")
df <- data.frame(values = gene_name)
intersection_df <- merge(df, filtered_genes2, by.x = "values", by.y = "gene")
expression_data <- data.frame(gene_name = gene_names, fpkm = fpkm_values)
filtered_sheet2_data <- sheet2_data[sheet2_data$gene %in% intersection_df$values, ]
tumor_samples <- grep("01A", colnames(expression_data), value = TRUE)
normal_samples <- grep("11A", colnames(expression_data), value = TRUE)
tumor_data <- expression_data[, tumor_samples]
normal_data <- expression_data[, normal_samples]
group <- factor(c(rep("Normal", length(normal_data)), rep("Cancer", length(tumor_data))))
expr_data_combined <- cbind(normal_data, tumor_data)
design <- model.matrix(~ group)
fit <- lmFit(expr_data_combined, design)
fit <- eBayes(fit)
tumor_medians <- apply(expr_data_combined[, tumor_samples], 1, median)
normal_medians <- apply(expr_data_combined[, normal_samples], 1, median)
results <- topTable(fit, coef = 2, n = Inf)
tumor_medians <- tumor_medians[match(rownames(results), names(tumor_medians))]
normal_medians <- normal_medians[match(rownames(results), names(normal_medians))]
final_results <- data.frame(
Gene = rownames(results), # Gene ID가 일관되게 나타나도록
logFC = results$logFC,
P.Value = results$P.Value,
Tumor_Median = tumor_medians,
Normal_Median = normal_medians
)
gene_names <- expression_data$gene_name
names(gene_names) <- rownames(expression_data)
final_results$Gene_Symbol <- gene_names[match(final_results$Gene, names(gene_names))]
final_results <- final_results[, c("Gene_Symbol", "Gene", "logFC", "P.Value", "Tumor_Median", "Normal_Median")]
ggplot(filtered_sheet2_data, aes(x = avg_log2FC, y = logFC)) +
geom_point(color = "blue", size = 2) + # 점 (logFC 값 표시)
geom_bar(stat = "identity", fill = "skyblue", alpha = 0.5) + # 바 그래프
theme(axis.text.x = element_text(angle = 90, hjust = 1)) + # x축 레이블 회전
labs(x = "HT04", y = "TCGA", title = "Gene Expression Change (logFC) in Tumor vs Normal") +
geom_text_repel(aes(label = ifelse(abs(logFC) >= 0.5, as.character(gene_name), "")), # logFC가 0.5 이상인 경우에만 레이블 표시
box.padding = 0.3, point.padding = 0.5, size = 3)
댓글
댓글 쓰기