Created
July 25, 2025 15:46
-
-
Save skchronicles/133bb7fe795652e41efe8ef4dfc410b2 to your computer and use it in GitHub Desktop.
Revisions
-
skchronicles created this gist
Jul 25, 2025 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,68 @@ library("edgeR") # Reading in raw counts matrix raw <- read.table( file = params$raw, sep = "\t", header = TRUE, row.names = 1, quote = "" ) # Reading in sample sheet, # contains this TSV file # contains two columns: # Sample and Group groupinfo <- read.table( file = params$group, sep = "\t", header = TRUE, stringsAsFactors = TRUE ) rownames(groupinfo) = make.names(groupinfo$Sample) # Reorder the columns of the counts # matrix to match the order of the # Sample column of groupinfo TSV, # Samples with names that do not # match are dropped grp_idx <- match(make.names(colnames(raw)), row.names(groupinfo), nomatch = 0) groupinfo <- groupinfo[grp_idx,] # Create DGEList object with edgeR group <- as.factor(groupinfo$Group) deg <- edgeR::DGEList(counts = raw, group = group) # Calculate CPM to apply group-based # CPM filtering. This filter is used # to remove lowly expressed genes where # a gene must have a CPM of 0.5 in at # least 50% of the samples in at least # half of the groups. cpm_threshold <- 0.5 n_genes_before_filtering <- nrow(raw) cpm_matrix <- edgeR::cpm(raw) group_levels <- levels(group) n_groups <- length(group_levels) samples_per_group <- table(group) half_samples_per_group <- ceiling(samples_per_group / 2) half_groups <- ceiling(n_groups / 2) # Gene must have CPM>=0.5 in 50% # of the samples per a given group expr_by_group <- sapply(group_levels, function(g) { samples_in_group <- which(group == g) rowSums(cpm_matrix[, samples_in_group] >= cpm_threshold) >= half_samples_per_group[g] }) # Gene must have a CPM>=0.5 in 50% # of the groups keep_genes <- rowSums(expr_by_group) >= half_groups # Filter lowly expressed genes # Gene must have CPM>=0.5 in 50% # samples-per-group in 50% of the # Groups. The actual filtering # uses CPM values rather than counts # in order to avoid biases to # samples with large library sizes. keep_genes <- rowSums(expr_by_group) >= half_groups # Recaluate new lib.sizes after filtering deg <- deg[keep_genes, , keep.lib.sizes = FALSE]