skchronicles · July 25, 2025 15:46 · Jul 25, 2025
diff --git a/rna-seq_group-based_cpm_filtering.R b/rna-seq_group-based_cpm_filtering.R
@@ -0,0 +1,68 @@
+library("edgeR")
+
+# Reading in raw counts matrix
+raw <- read.table(
+    file = params$raw,
+    sep = "\t",
+    header = TRUE,
+    row.names = 1,
+    quote = ""
+)
+
+# Reading in sample sheet,
+# contains this TSV file
+# contains two columns:
+# Sample and Group
+groupinfo <- read.table(
+    file = params$group,
+    sep = "\t",
+    header = TRUE,
+    stringsAsFactors = TRUE
+)
+rownames(groupinfo) = make.names(groupinfo$Sample)
+
+# Reorder the columns of the counts
+# matrix to match the order of the
+# Sample column of groupinfo TSV,
+# Samples with names that do not
+# match are dropped
+grp_idx <- match(make.names(colnames(raw)), row.names(groupinfo), nomatch = 0)
+groupinfo <- groupinfo[grp_idx,]
+
+# Create DGEList object with edgeR
+group <- as.factor(groupinfo$Group)
+deg <- edgeR::DGEList(counts = raw, group = group)
+
+# Calculate CPM to apply group-based
+# CPM filtering. This filter is used
+# to remove lowly expressed genes where
+# a gene must have a CPM of 0.5 in at
+# least 50% of the samples in at least
+# half of the groups.
+cpm_threshold <- 0.5
+n_genes_before_filtering <- nrow(raw)
+cpm_matrix <- edgeR::cpm(raw)
+group_levels <- levels(group)
+n_groups <- length(group_levels)
+samples_per_group <- table(group)
+half_samples_per_group <- ceiling(samples_per_group / 2)
+half_groups <- ceiling(n_groups / 2)
+# Gene must have CPM>=0.5 in 50%
+# of the samples per a given group
+expr_by_group <- sapply(group_levels, function(g) {
+  samples_in_group <- which(group == g)
+  rowSums(cpm_matrix[, samples_in_group] >= cpm_threshold) >= half_samples_per_group[g]
+})
+# Gene must have a CPM>=0.5 in 50%
+# of the groups
+keep_genes <- rowSums(expr_by_group) >= half_groups
+# Filter lowly expressed genes
+# Gene must have CPM>=0.5 in 50% 
+# samples-per-group in 50% of the
+# Groups. The actual filtering
+# uses CPM values rather than counts
+# in order to avoid biases to
+# samples with large library sizes.
+keep_genes <- rowSums(expr_by_group) >= half_groups
+# Recaluate new lib.sizes after filtering
+deg <- deg[keep_genes, , keep.lib.sizes = FALSE]