【R语言数据分析开发指南】

发布于:2025-08-13 ⋅ 阅读:(15) ⋅ 点赞:(0)

R语言数据分析开发指南

目录

  1. 环境配置与安装
  2. 数据读取与预处理
  3. 可视化:热图(Heatmap)
  4. 可视化:韦恩图(Venn Diagram)
  5. 机器学习:LASSO回归
  6. 机器学习:支持向量机(SVM)
  7. 结果保存与可视化
  8. 项目结构建议
  9. 常见问题解决

1. 环境配置与安装

1.1 安装R与RStudio

  • R语言下载:https://cran.r-project.org
  • RStudio IDE:https://posit.co/download/rstudio-desktop/

1.2 设置CRAN镜像(提高下载速度)

# 设置清华大学镜像
options(repos = c(CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))

# 或者设置中科大镜像
options(repos = c(CRAN = "https://mirrors.ustc.edu.cn/CRAN/"))

1.3 批量安装必需包

# 定义需要安装的包
required_packages <- c(
  # 数据处理
  "tidyverse", "data.table", "readr", "readxl", "janitor",
  
  # 可视化基础
  "ggplot2", "ggpubr", "patchwork", "RColorBrewer", "viridis",
  
  # 热图
  "pheatmap", "ComplexHeatmap", "circlize",
  
  # 韦恩图
  "VennDiagram", "ggVennDiagram", "eulerr",
  
  # 机器学习
  "glmnet", "caret", "e1071", "randomForest",
  
  # 模型评估
  "ROCR", "pROC", "MLmetrics",
  
  # 其他实用包
  "corrplot", "reshape2", "gridExtra"
)

# 检查并安装缺失的包
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) {
  install.packages(new_packages, dependencies = TRUE)
}

# 验证安装
cat("成功安装", length(required_packages) - length(new_packages), "个包\n")
if(length(new_packages) > 0) {
  cat("新安装的包:", paste(new_packages, collapse = ", "), "\n")
}

1.4 环境管理(推荐)

# 使用renv管理项目依赖
install.packages("renv")
renv::init()      # 初始化项目环境
renv::snapshot()  # 保存当前环境快照
renv::restore()   # 恢复环境

2. 数据读取与预处理

2.1 读取不同格式数据

library(tidyverse)
library(readxl)

# CSV文件
data_csv <- read_csv("data/expression_data.csv")

# Excel文件
data_excel <- read_excel("data/sample_info.xlsx", sheet = 1)

# 制表符分隔文件
data_tsv <- read_tsv("data/gene_annotation.tsv")

# RDS文件(R专用格式)
data_rds <- readRDS("data/processed_data.rds")

2.2 数据清洗与整理

library(janitor)

# 清理列名
data_clean <- data_csv %>% 
  clean_names() %>%                    # 标准化列名
  remove_empty(c("rows", "cols"))      # 移除空行空列

# 处理基因表达矩阵
prepare_expression_matrix <- function(expr_data) {
  # 假设第一列是基因名
  gene_names <- expr_data[[1]]
  expr_matrix <- as.matrix(expr_data[, -1])
  rownames(expr_matrix) <- gene_names
  
  # 过滤低表达基因
  keep_genes <- rowSums(expr_matrix > 0) >= ncol(expr_matrix) * 0.1
  expr_matrix <- expr_matrix[keep_genes, ]
  
  # 对数转换(如果需要)
  expr_matrix <- log2(expr_matrix + 1)
  
  return(expr_matrix)
}

# 标准化处理
standardize_data <- function(matrix_data, method = "zscore") {
  if (method == "zscore") {
    # Z-score标准化(按行)
    return(t(scale(t(matrix_data))))
  } else if (method == "minmax") {
    # Min-Max标准化
    return((matrix_data - min(matrix_data)) / (max(matrix_data) - min(matrix_data)))
  }
}

# 示例使用
expr_matrix <- prepare_expression_matrix(data_csv)
expr_scaled <- standardize_data(expr_matrix, method = "zscore")

2.3 数据拆分

# 设置随机种子确保可重现
set.seed(123)

# 按比例拆分训练集和测试集
split_data <- function(data_matrix, train_ratio = 0.7) {
  n_samples <- ncol(data_matrix)
  train_indices <- sample(1:n_samples, size = floor(n_samples * train_ratio))
  
  train_data <- data_matrix[, train_indices]
  test_data <- data_matrix[, -train_indices]
  
  return(list(
    train = train_data,
    test = test_data,
    train_indices = train_indices
  ))
}

# 使用示例
data_split <- split_data(expr_scaled, train_ratio = 0.7)
train_matrix <- data_split$train
test_matrix <- data_split$test

3. 可视化:热图(Heatmap)

3.1 基础热图(pheatmap)

library(pheatmap)
library(RColorBrewer)

# 选择高变异基因
select_top_variable_genes <- function(expr_matrix, n_genes = 100) {
  gene_vars <- apply(expr_matrix, 1, var, na.rm = TRUE)
  top_genes <- names(sort(gene_vars, decreasing = TRUE)[1:n_genes])
  return(expr_matrix[top_genes, ])
}

# 创建样本注释
create_sample_annotation <- function(sample_names, groups) {
  annotation_df <- data.frame(
    Group = groups,
    row.names = sample_names
  )
  return(annotation_df)
}

# 绘制热图
plot_basic_heatmap <- function(expr_data, annotation = NULL, title = "Gene Expression Heatmap") {
  # 选择颜色
  colors <- colorRampPalette(rev(brewer.pal(11, "RdBu")))(100)
  
  pheatmap(
    expr_data,
    color = colors,
    annotation_col = annotation,
    scale = "none",
    clustering_distance_rows = "euclidean",
    clustering_distance_cols = "euclidean",
    clustering_method = "complete",
    show_rownames = FALSE,
    show_colnames = TRUE,
    main = title,
    fontsize = 10,
    fontsize_row = 8,
    fontsize_col = 8
  )
}

# 示例使用
top_var_genes <- select_top_variable_genes(expr_scaled, n_genes = 50)

# 创建示例分组信息
sample_groups <- factor(rep(c("Control", "Treatment"), each = ncol(top_var_genes)/2))
sample_annotation <- create_sample_annotation(colnames(top_var_genes), sample_groups)

# 绘制热图
plot_basic_heatmap(top_var_genes, annotation = sample_annotation)

3.2 高级热图(ComplexHeatmap)

library(ComplexHeatmap)
library(circlize)

# 创建复杂热图
plot_complex_heatmap <- function(expr_data, annotation_data, title = "Complex Heatmap") {
  # 定义颜色映射
  col_fun <- colorRamp2(c(-2, 0, 2), c("#2166AC", "white", "#B2182B"))
  
  # 创建顶部注释
  top_annotation <- HeatmapAnnotation(
    Group = annotation_data$Group,
    col = list(Group = c("Control" = "#1B9E77", "Treatment" = "#D95F02")),
    annotation_name_side = "left"
  )
  
  # 创建热图
  ht <- Heatmap(
    expr_data,
    name = "Z-score",
    col = col_fun,
    top_annotation = top_annotation,
    show_row_names = FALSE,
    show_column_names = TRUE,
    column_title = title,
    row_title = "Genes",
    clustering_distance_rows = "euclidean",
    clustering_distance_columns = "euclidean",
    row_dend_reorder = TRUE,
    column_dend_reorder = TRUE
  )
  
  return(ht)
}

# 绘制复杂热图
complex_ht <- plot_complex_heatmap(top_var_genes, sample_annotation)
draw(complex_ht)

3.3 相关性热图

# 计算相关性矩阵
correlation_heatmap <- function(expr_data, method = "pearson") {
  cor_matrix <- cor(t(expr_data), method = method, use = "complete.obs")
  
  pheatmap(
    cor_matrix,
    color = colorRampPalette(c("#053061", "#2166AC", "#4393C3", "#92C5DE", 
                               "#D1E5F0", "#F7F7F7", "#FDBF6F", "#FF7F00", 
                               "#E31A1C", "#B10026"))(100),
    main = paste("Gene Correlation Heatmap (", method, ")", sep = ""),
    show_rownames = FALSE,
    show_colnames = FALSE
  )
}

# 绘制相关性热图
correlation_heatmap(top_var_genes, method = "pearson")

4. 可视化:韦恩图(Venn Diagram)

4.1 使用VennDiagram包

library(VennDiagram)

# 创建示例数据集
create_gene_sets <- function() {
  all_genes <- paste0("Gene_", 1:1000)
  
  set1 <- sample(all_genes, 200)  # 差异表达基因集1
  set2 <- sample(all_genes, 250)  # 差异表达基因集2
  set3 <- sample(all_genes, 180)  # 差异表达基因集3
  
  return(list(
    "Condition_A" = set1,
    "Condition_B" = set2,
    "Condition_C" = set3
  ))
}

# 绘制韦恩图
plot_venn_diagram <- function(gene_sets, output_file = NULL) {
  venn_plot <- venn.diagram(
    x = gene_sets,
    filename = output_file,
    output = TRUE,
  
    # 颜色设置
    fill = c("#66C2A5", "#FC8D62", "#8DA0CB"),
    alpha = 0.6,
  
    # 文字设置
    cex = 1.2,
    fontfamily = "serif",
    fontface = "bold",
    cat.cex = 1.1,
    cat.fontface = "bold",
    cat.default.pos = "outer",
    cat.pos = c(-27, 27, 135),
    cat.dist = c(0.055, 0.055, 0.085),
  
    # 边框设置
    lwd = 2,
    lty = 'solid',
    col = c("#66C2A5", "#FC8D62", "#8DA0CB"),
  
    # 标题
    main = "Gene Set Overlap",
    main.pos = c(0.5, 1.05),
    main.fontface = "bold",
    main.fontsize = 14
  )
  
  if (is.null(output_file)) {
    grid::grid.newpage()
    grid::grid.draw(venn_plot)
  }
  
  return(venn_plot)
}

# 使用示例
gene_sets <- create_gene_sets()
plot_venn_diagram(gene_sets)

4.2 使用ggVennDiagram包(推荐)

library(ggVennDiagram)

# 现代化韦恩图
plot_modern_venn <- function(gene_sets, colors = NULL) {
  if (is.null(colors)) {
    colors <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3")
  }
  
  p <- ggVennDiagram(gene_sets, 
                     label_alpha = 0.8,
                     category.names = names(gene_sets)) +
    scale_fill_gradient(low = "white", high = colors[1]) +
    theme_void() +
    labs(title = "Gene Set Overlap Analysis",
         subtitle = paste("Total unique genes:", length(unique(unlist(gene_sets)))))
  
  return(p)
}

# 四个集合的韦恩图
create_four_sets <- function() {
  all_genes <- paste0("Gene_", 1:1500)
  
  list(
    "Set_A" = sample(all_genes, 300),
    "Set_B" = sample(all_genes, 280),
    "Set_C" = sample(all_genes, 250),
    "Set_D" = sample(all_genes, 320)
  )
}

# 绘制四集合韦恩图
four_sets <- create_four_sets()
modern_venn <- plot_modern_venn(four_sets)
print(modern_venn)

4.3 交集分析

# 分析集合交集
analyze_intersections <- function(gene_sets) {
  library(dplyr)
  
  # 计算两两交集
  set_names <- names(gene_sets)
  n_sets <- length(gene_sets)
  
  intersection_results <- data.frame()
  
  for (i in 1:(n_sets-1)) {
    for (j in (i+1):n_sets) {
      intersection <- intersect(gene_sets[[i]], gene_sets[[j]])
    
      result <- data.frame(
        Set1 = set_names[i],
        Set2 = set_names[j],
        Intersection_Size = length(intersection),
        Jaccard_Index = length(intersection) / length(union(gene_sets[[i]], gene_sets[[j]])),
        stringsAsFactors = FALSE
      )
    
      intersection_results <- rbind(intersection_results, result)
    }
  }
  
  return(intersection_results)
}

# 使用示例
intersection_analysis <- analyze_intersections(gene_sets)
print(intersection_analysis)

5. 机器学习:LASSO回归

5.1 LASSO回归用于特征选择

library(glmnet)

# 准备LASSO回归数据
prepare_lasso_data <- function(expr_matrix, response_var) {
  # 转置矩阵:样本为行,基因为列
  x <- t(expr_matrix)
  y <- response_var
  
  return(list(x = x, y = y))
}

# 执行LASSO回归
perform_lasso <- function(x, y, alpha = 1, nfolds = 10) {
  set.seed(123)
  
  # 交叉验证选择最优lambda
  cv_fit <- cv.glmnet(x, y, alpha = alpha, nfolds = nfolds, 
                      type.measure = "mse", family = "gaussian")
  
  # 提取系数
  coefficients <- coef(cv_fit, s = "lambda.min")
  selected_features <- rownames(coefficients)[which(coefficients != 0)]
  selected_features <- selected_features[selected_features != "(Intercept)"]
  
  return(list(
    model = cv_fit,
    selected_features = selected_features,
    lambda_min = cv_fit$lambda.min,
    lambda_1se = cv_fit$lambda.1se
  ))
}

# 可视化LASSO路径
plot_lasso_path <- function(lasso_result) {
  par(mfrow = c(1, 2))
  
  # 系数路径图
  plot(lasso_result$model$glmnet.fit, xvar = "lambda", label = TRUE)
  title("LASSO Coefficient Path")
  
  # 交叉验证误差图
  plot(lasso_result$model)
  title("Cross-Validation Error")
  
  par(mfrow = c(1, 1))
}

# 示例:创建连续型响应变量
set.seed(123)
continuous_response <- rnorm(ncol(train_matrix))

# 准备数据
lasso_data <- prepare_lasso_data(train_matrix, continuous_response)

# 执行LASSO回归
lasso_result <- perform_lasso(lasso_data$x, lasso_data$y)

# 显示结果
cat("Selected features:", length(lasso_result$selected_features), "\n")
cat("Feature names:\n")
print(head(lasso_result$selected_features, 10))

# 绘制结果
plot_lasso_path(lasso_result)

5.2 LASSO用于分类(Logistic LASSO)

# 分类LASSO
perform_lasso_classification <- function(x, y, alpha = 1, nfolds = 10) {
  set.seed(123)
  
  # 确保y是因子
  y <- as.factor(y)
  
  # 交叉验证
  cv_fit <- cv.glmnet(x, y, alpha = alpha, nfolds = nfolds,
                      type.measure = "class", family = "binomial")
  
  # 提取系数
  coefficients <- coef(cv_fit, s = "lambda.min")
  selected_features <- rownames(coefficients)[which(coefficients != 0)]
  selected_features <- selected_features[selected_features != "(Intercept)"]
  
  return(list(
    model = cv_fit,
    selected_features = selected_features,
    lambda_min = cv_fit$lambda.min
  ))
}

# 示例:创建二分类响应变量
binary_response <- factor(sample(c("Control", "Treatment"), 
                                ncol(train_matrix), replace = TRUE))

# 执行分类LASSO
lasso_class_data <- prepare_lasso_data(train_matrix, binary_response)
lasso_class_result <- perform_lasso_classification(lasso_class_data$x, lasso_class_data$y)

cat("Classification LASSO selected features:", length(lasso_class_result$selected_features), "\n")

5.3 模型预测与评估

# 预测函数
predict_lasso <- function(lasso_model, new_x, type = "response") {
  predictions <- predict(lasso_model$model, newx = new_x, 
                        s = "lambda.min", type = type)
  return(predictions)
}

# 回归评估
evaluate_regression <- function(actual, predicted) {
  mse <- mean((actual - predicted)^2)
  rmse <- sqrt(mse)
  mae <- mean(abs(actual - predicted))
  r_squared <- cor(actual, predicted)^2
  
  return(list(
    MSE = mse,
    RMSE = rmse,
    MAE = mae,
    R_squared = r_squared
  ))
}

# 示例预测
test_data <- prepare_lasso_data(test_matrix, rnorm(ncol(test_matrix)))
predictions <- predict_lasso(lasso_result, test_data$x)
evaluation <- evaluate_regression(test_data$y, predictions[,1])
print(evaluation)

6. 机器学习:支持向量机(SVM)

6.1 使用caret包的SVM

library(caret)
library(e1071)

# 准备SVM数据
prepare_svm_data <- function(expr_matrix, labels, feature_selection = TRUE, top_n = 100) {
  # 转置矩阵
  data_df <- as.data.frame(t(expr_matrix))
  
  # 特征选择(可选)
  if (feature_selection && ncol(data_df) > top_n) {
    # 基于方差的特征选择
    feature_vars <- apply(expr_matrix, 1, var)
    top_features <- names(sort(feature_vars, decreasing = TRUE)[1:top_n])
    data_df <- data_df[, top_features]
  }
  
  # 添加标签
  data_df$Class <- as.factor(labels)
  
  return(data_df)
}

# 训练SVM模型
train_svm_model <- function(train_data, method = "svmRadial") {
  # 设置训练控制参数
  ctrl <- trainControl(
    method = "repeatedcv",
    number = 5,              # 5折交叉验证
    repeats = 3,             # 重复3次
    classProbs = TRUE,       # 计算类别概率
    summaryFunction = twoClassSummary,
    savePredictions = "final"
  )
  
  # 训练模型
  set.seed(123)
  svm_model <- train(
    Class ~ .,
    data = train_data,
    method = method,
    metric = "ROC",          # 使用ROC作为评估指标
    trControl = ctrl,
    preProcess = c("center", "scale"),  # 标准化
    tuneLength = 5          # 参数调优
  )
  
  return(svm_model)
}

# 创建示例数据
train_labels <- factor(rep(c("Control", "Treatment"), 
                          length.out = ncol(train_matrix)))
test_labels <- factor(rep(c("Control", "Treatment"), 
                         length.out = ncol(test_matrix)))

# 准备训练和测试数据
train_svm_data <- prepare_svm_data(train_matrix, train_labels, top_n = 50)
test_svm_data <- prepare_svm_data(test_matrix, test_labels, top_n = 50)

# 训练模型
svm_model <- train_svm_model(train_svm_data)

# 查看模型结果
print(svm_model)
plot(svm_model)

6.2 模型预测与评估

library(pROC)

# SVM预测函数
predict_svm <- function(model, test_data) {
  # 预测类别
  predictions <- predict(model, newdata = test_data)
  
  # 预测概率
  probabilities <- predict(model, newdata = test_data, type = "prob")
  
  return(list(
    classes = predictions,
    probabilities = probabilities
  ))
}

# 评估分类性能
evaluate_classification <- function(actual, predicted, probabilities = NULL) {
  # 混淆矩阵
  conf_matrix <- confusionMatrix(predicted, actual)
  
  results <- list(
    confusion_matrix = conf_matrix,
    accuracy = conf_matrix$overall['Accuracy'],
    sensitivity = conf_matrix$byClass['Sensitivity'],
    specificity = conf_matrix$byClass['Specificity']
  )
  
  # 如果有概率预测,计算ROC
  if (!is.null(probabilities)) {
    roc_obj <- roc(response = actual, 
                   predictor = probabilities[, 1],
                   levels = rev(levels(actual)))
    results$auc <- auc(roc_obj)
    results$roc <- roc_obj
  }
  
  return(results)
}

# 执行预测
svm_predictions <- predict_svm(svm_model, test_svm_data)

# 评估模型
svm_evaluation <- evaluate_classification(
  actual = test_svm_data$Class,
  predicted = svm_predictions$classes,
  probabilities = svm_predictions$probabilities
)

# 打印结果
print(svm_evaluation$confusion_matrix)
cat("AUC:", round(svm_evaluation$auc, 3), "\n")

# 绘制ROC曲线
plot(svm_evaluation$roc, col = "#D95F02", lwd = 2, 
     main = "SVM ROC Curve")
legend("bottomright", legend = paste("AUC =", round(svm_evaluation$auc, 3)),
       col = "#D95F02", lwd = 2)

6.3 多类分类SVM

# 多类分类示例
create_multiclass_data <- function(expr_matrix, n_classes = 3) {
  n_samples <- ncol(expr_matrix)
  class_labels <- factor(sample(paste0("Class_", 1:n_classes), 
                               n_samples, replace = TRUE))
  
  data_df <- as.data.frame(t(expr_matrix))
  
  # 特征选择
  feature_vars <- apply(expr_matrix, 1, var)
  top_features <- names(sort(feature_vars, decreasing = TRUE)[1:100])
  data_df <- data_df[, top_features]
  
  data_df$Class <- class_labels
  return(data_df)
}

# 多类分类评估
evaluate_multiclass <- function(actual, predicted) {
  conf_matrix <- confusionMatrix(predicted, actual)
  
  # 计算每个类别的F1分数
  precision <- conf_matrix$byClass[, "Precision"]
  recall <- conf_matrix$byClass[, "Recall"]
  f1_scores <- 2 * (precision * recall) / (precision + recall)
  
  return(list(
    confusion_matrix = conf_matrix,
    accuracy = conf_matrix$overall['Accuracy'],
    f1_scores = f1_scores,
    macro_f1 = mean(f1_scores, na.rm = TRUE)
  ))
}

# 创建多类数据
multiclass_data <- create_multiclass_data(train_matrix, n_classes = 3)

# 训练多类SVM
multiclass_svm <- train_svm_model(multiclass_data)
print(multiclass_svm)

7. 结果保存与可视化

7.1 保存模型和结果

# 创建结果目录
dir.create("results", showWarnings = FALSE)
dir.create("results/models", showWarnings = FALSE)
dir.create("results/plots", showWarnings = FALSE)
dir.create("results/data", showWarnings = FALSE)

# 保存模型
save_models <- function() {
  saveRDS(lasso_result, "results/models/lasso_model.rds")
  saveRDS(svm_model, "results/models/svm_model.rds")
  
  # 保存特征选择结果
  writeLines(lasso_result$selected_features, 
             "results/data/lasso_selected_genes.txt")
  
  # 保存预测结果
  write.csv(svm_evaluation$confusion_matrix$table, 
            "results/data/svm_confusion_matrix.csv")
}

save_models()

7.2 生成综合报告

# 生成模型性能报告
generate_performance_report <- function() {
  report <- paste(
    "=== 模型性能报告 ===",
    paste("生成时间:", Sys.time()),
    "",
    "1. LASSO回归结果:",
    paste("  - 选择特征数量:", length(lasso_result$selected_features)),
    paste("  - 最优lambda:", round(lasso_result$lambda_min, 6)),
    "",
    "2. SVM分类结果:",
    paste("  - 准确率:", round(svm_evaluation$accuracy, 3)),
    paste("  - AUC:", round(svm_evaluation$auc, 3)),
    paste("  - 敏感性:", round(svm_evaluation$sensitivity, 3)),
    paste("  - 特异性:", round(svm_evaluation$specificity, 3)),
    "",
    "3. 数据信息:",
    paste("  - 训练样本数:", ncol(train_matrix)),
    paste("  - 测试样本数:", ncol(test_matrix)),
    paste("  - 总基因数:", nrow(expr_scaled)),
    "",
    sep = "\n"
  )
  
  writeLines(report, "results/performance_report.txt")
  cat(report)
}

generate_performance_report()

7.3 保存图形

# 批量保存图形
save_plots <- function() {
  # 保存热图
  png("results/plots/heatmap.png", width = 1200, height = 1000, res = 150)
  plot_basic_heatmap(top_var_genes, sample_annotation)
  dev.off()
  
  # 保存韦恩图
  png("results/plots/venn_diagram.png", width = 800, height = 600, res = 150)
  plot_venn_diagram(gene_sets, output_file = NULL)
  dev.off()
  
  # 保存LASSO路径图
  png("results/plots/lasso_path.png", width = 1000, height = 500, res = 150)
  plot_lasso_path(lasso_result)
  dev.off()
  
  # 保存ROC曲线
  png("results/plots/roc_curve.png", width = 600, height = 600, res = 150)
  plot(svm_evaluation$roc, col = "#D95F02", lwd = 2, main = "SVM ROC Curve")
  legend("bottomright", legend = paste("AUC =", round(svm_evaluation$auc, 3)),
         col = "#D95F02", lwd = 2)
  dev.off()
  
  cat("所有图形已保存到 results/plots/ 目录\n")
}

save_plots()

8. 项目结构建议

project_name/
├── data/                          # 原始数据
│   ├── expression_data.csv
│   ├── sample_info.xlsx
│   └── gene_annotation.tsv
├── scripts/                       # R脚本
│   ├── 01_data_preprocessing.R
│   ├── 02_exploratory_analysis.R
│   ├── 03_visualization.R
│   ├── 04_machine_learning.R
│   └── 05_generate_report.R
├── results/                       # 结果输出
│   ├── models/                    # 保存的模型
│   ├── plots/                     # 图形文件
│   └── data/                      # 处理后的数据
├── functions/                     # 自定义函数
│   ├── data_utils.R
│   ├── plot_utils.R
│   └── ml_utils.R
├── renv/                          # 环境管理(可选)
├── README.md                      # 项目说明
├── main.R                         # 主执行脚本
└── .Rprofile                      # R配置文件

主执行脚本示例(main.R)

# 主执行脚本
cat("开始数据分析流程...\n")

# 加载必要的包和函数
source("functions/data_utils.R")
source("functions/plot_utils.R")
source("functions/ml_utils.R")

# 执行分析步骤
source("scripts/01_data_preprocessing.R")
source("scripts/02_exploratory_analysis.R")
source("scripts/03_visualization.R")
source("scripts/04_machine_learning.R")
source("scripts/05_generate_report.R")

cat("分析流程完成!\n")
cat("结果已保存到 results/ 目录\n")

9. 常见问题解决

9.1 安装问题

# 问题1: 包安装失败
# 解决方案: 更换镜像或使用二进制包
install.packages("package_name", type = "binary")

# 问题2: 依赖包冲突
# 解决方案: 使用renv管理环境
renv::restore()

# 问题3: 内存不足
# 解决方案: 增加内存限制
memory.limit(size = 8000)  # Windows系统

9.2 数据处理问题

# 问题1: 缺失值处理
handle_missing_values <- function(data_matrix, method = "remove") {
  if (method == "remove") {
    # 移除含有缺失值的行
    return(data_matrix[complete.cases(data_matrix), ])
  } else if (method == "impute") {
    # 用均值填充缺失值
    for (i in 1:ncol(data_matrix)) {
      data_matrix[is.na(data_matrix[, i]), i] <- mean(data_matrix[, i], na.rm = TRUE)
    }
    return(data_matrix)
  }
}

# 问题2: 数据类型转换
ensure_numeric_matrix <- function(data_df) {
  # 确保所有列都是数值型
  numeric_cols <- sapply(data_df, is.numeric)
  if (!all(numeric_cols)) {
    warning("发现非数值列,将尝试转换")
    data_df[, !numeric_cols] <- lapply(data_df[, !numeric_cols], as.numeric)
  }
  return(as.matrix(data_df))
}

9.3 性能优化

# 问题1: 大数据集处理慢
# 解决方案: 使用data.table或并行计算
library(data.table)
library(parallel)

# 并行计算示例
parallel_correlation <- function(expr_matrix, n_cores = 4) {
  cl <- makeCluster(n_cores)
  clusterEvalQ(cl, library(stats))
  
  result <- parApply(cl, expr_matrix, 1, function(x) {
    cor(x, method = "pearson")
  })
  
  stopCluster(cl)
  return(result)
}

# 问题2: 内存使用优化
# 解决方案: 分批处理大数据
process_in_batches <- function(data_matrix, batch_size = 1000, fun) {
  n_rows <- nrow(data_matrix)
  n_batches <- ceiling(n_rows / batch_size)
  
  results <- list()
  
  for (i in 1:n_batches) {
    start_idx <- (i - 1) * batch_size + 1
    end_idx <- min(i * batch_size, n_rows)
  
    batch_data <- data_matrix[start_idx:end_idx, ]
    results[[i]] <- fun(batch_data)
  
    # 垃圾回收
    gc()
  }
  
  return(do.call(rbind, results))
}

9.4 调试技巧

# 调试函数
debug_info <- function(obj, name = "object") {
  cat("=== Debug Info for", name, "===\n")
  cat("Class:", class(obj), "\n")
  cat("Dimensions:", dim(obj), "\n")
  cat("Length:", length(obj), "\n")
  if (is.numeric(obj)) {
    cat("Range:", range(obj, na.rm = TRUE), "\n")
    cat("NA count:", sum(is.na(obj)), "\n")
  }
  cat("First few elements:\n")
  print(head(obj))
  cat("========================\n")
}

# 使用示例
debug_info(expr_matrix, "expression_matrix")

总结

本指南涵盖了R语言在生物信息学和数据科学中的核心应用:

  1. 环境配置:从包安装到项目管理的完整设置
  2. 数据处理:标准化的数据读取、清洗和预处理流程
  3. 可视化:热图和韦恩图的多种实现方法
  4. 机器学习:LASSO回归和SVM的实用实现
  5. 结果管理:模型保存、图形输出和报告生成
  6. 项目结构:可复现的项目组织方式
  7. 问题解决:常见问题的诊断和解决方案