Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save skchronicles/133bb7fe795652e41efe8ef4dfc410b2 to your computer and use it in GitHub Desktop.
Save skchronicles/133bb7fe795652e41efe8ef4dfc410b2 to your computer and use it in GitHub Desktop.

Revisions

  1. skchronicles created this gist Jul 25, 2025.
    68 changes: 68 additions & 0 deletions rna-seq_group-based_cpm_filtering.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,68 @@
    library("edgeR")

    # Reading in raw counts matrix
    raw <- read.table(
    file = params$raw,
    sep = "\t",
    header = TRUE,
    row.names = 1,
    quote = ""
    )

    # Reading in sample sheet,
    # contains this TSV file
    # contains two columns:
    # Sample and Group
    groupinfo <- read.table(
    file = params$group,
    sep = "\t",
    header = TRUE,
    stringsAsFactors = TRUE
    )
    rownames(groupinfo) = make.names(groupinfo$Sample)

    # Reorder the columns of the counts
    # matrix to match the order of the
    # Sample column of groupinfo TSV,
    # Samples with names that do not
    # match are dropped
    grp_idx <- match(make.names(colnames(raw)), row.names(groupinfo), nomatch = 0)
    groupinfo <- groupinfo[grp_idx,]

    # Create DGEList object with edgeR
    group <- as.factor(groupinfo$Group)
    deg <- edgeR::DGEList(counts = raw, group = group)

    # Calculate CPM to apply group-based
    # CPM filtering. This filter is used
    # to remove lowly expressed genes where
    # a gene must have a CPM of 0.5 in at
    # least 50% of the samples in at least
    # half of the groups.
    cpm_threshold <- 0.5
    n_genes_before_filtering <- nrow(raw)
    cpm_matrix <- edgeR::cpm(raw)
    group_levels <- levels(group)
    n_groups <- length(group_levels)
    samples_per_group <- table(group)
    half_samples_per_group <- ceiling(samples_per_group / 2)
    half_groups <- ceiling(n_groups / 2)
    # Gene must have CPM>=0.5 in 50%
    # of the samples per a given group
    expr_by_group <- sapply(group_levels, function(g) {
    samples_in_group <- which(group == g)
    rowSums(cpm_matrix[, samples_in_group] >= cpm_threshold) >= half_samples_per_group[g]
    })
    # Gene must have a CPM>=0.5 in 50%
    # of the groups
    keep_genes <- rowSums(expr_by_group) >= half_groups
    # Filter lowly expressed genes
    # Gene must have CPM>=0.5 in 50%
    # samples-per-group in 50% of the
    # Groups. The actual filtering
    # uses CPM values rather than counts
    # in order to avoid biases to
    # samples with large library sizes.
    keep_genes <- rowSums(expr_by_group) >= half_groups
    # Recaluate new lib.sizes after filtering
    deg <- deg[keep_genes, , keep.lib.sizes = FALSE]