grabear · September 18, 2020 13:01 · Apr 27, 2018 · Apr 27, 2018 · Apr 27, 2018 · Apr 27, 2018
diff --git a/silva_walkthrough.R → SILVA_walkthrough.R b/silva_walkthrough.R → SILVA_walkthrough.R
diff --git a/silva_walkthrough.R b/silva_walkthrough.R
@@ -0,0 +1,14 @@
+# This functionality has been tested and a PR has been pulled with phyloseq here:
+# https://github.com/joey711/phyloseq/pull/854
+#
+# While the function has been vetted, the maintainers are very busy and the PR has not
+# yet been added to the main package.  Below i've added some detail to explain how to parse
+# your silva data.  It's quite easy....
+
+
+source("parse_silva_taxonomy_128")
+
+# import biom data
+silva_biom <- system.file("extdata", "SILVA_OTU_table.biom", package="phyloseq")
+# Create phyloseq object using silva parseing function
+silva_phyloseq <- import_biom(BIOMfilename = silva_biom, parseFunction = parse_taxonomy_silva_128)
diff --git a/parse_silva_taxonomy_128.R b/parse_silva_taxonomy_128.R
@@ -1,11 +1,6 @@
 #' @examples
 #'  taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
 #'  parse_taxonomy_default(taxvec1)
-#'  parse_taxonomy_greengenes(taxvec1)
-#'  taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae")
-#'  parse_taxonomy_qiime(taxvec2)
-#' taxvec3 = c("D_0__Bacteria", "D_1__Firmicutes", "D_2__Bacilli", "D_3__Staphylococcaceae")
-#' parse_taxonomy_silva(taxvec3)
 parse_taxonomy_default = function(char.vec){
 	# Remove any leading empty space
 	char.vec = gsub("^[[:space:]]{1,}", "", char.vec)
@@ -20,8 +15,9 @@ parse_taxonomy_default = function(char.vec){
 	return(char.vec)
 }
 
-
-
+#' @examples
+#' taxvec3 = c("D_0__Bacteria", "D_1__Firmicutes", "D_2__Bacilli", "D_3__Staphylococcaceae")
+#' parse_taxonomy_silva(taxvec3)
 parse_taxonomy_silva_128 <- function(char.vec){
   # Use default to assign names to elements in case problem with greengenes prefix
   char.vec = parse_taxonomy_default(char.vec)

diff --git a/parse_silva_taxonomy_128.R b/parse_silva_taxonomy_128.R
@@ -0,0 +1,66 @@
+#' @examples
+#'  taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae")
+#'  parse_taxonomy_default(taxvec1)
+#'  parse_taxonomy_greengenes(taxvec1)
+#'  taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae")
+#'  parse_taxonomy_qiime(taxvec2)
+#' taxvec3 = c("D_0__Bacteria", "D_1__Firmicutes", "D_2__Bacilli", "D_3__Staphylococcaceae")
+#' parse_taxonomy_silva(taxvec3)
+parse_taxonomy_default = function(char.vec){
+	# Remove any leading empty space
+	char.vec = gsub("^[[:space:]]{1,}", "", char.vec)
+	# Remove any trailing space
+	char.vec = gsub("[[:space:]]{1,}$", "", char.vec)
+	if( length(char.vec) > 0 ){
+		# Add dummy element (rank) name
+		names(char.vec) = paste("Rank", 1:length(char.vec), sep="")
+	} else {
+		warning("Empty taxonomy vector encountered.")
+	}
+	return(char.vec)
+}
+
+
+
+parse_taxonomy_silva_128 <- function(char.vec){
+  # Use default to assign names to elements in case problem with greengenes prefix
+  char.vec = parse_taxonomy_default(char.vec)
+  # Check for unassigned taxa
+  if (char.vec["Rank1"] == "Unassigned") {
+    char.vec <- c(Rank1="D_0__Unassigned", Rank2="D_1__Unassigned", Rank3="D_2__Unassigned", Rank4="D_3__Unassigned",
+                  Rank5="D_4__Unassigned", Rank6="D_5__Unassigned", Rank7="D_6__Unassigned")
+  }
+  # Define the meaning of each prefix according to SILVA taxonomy
+  Tranks = c(D_0="Kingdom", D_1="Phylum", D_2="Class", D_3="Order", D_4="Family", D_5="Genus", D_6="Species")
+  # Check for prefix using regexp, warn if there were none. trim indices, ti
+  ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec)
+  if( length(ti) == 0L ){
+    warning(
+      "No silva prefixes were found. \n",
+      "Consider using parse_taxonomy_delfault() instead if true for all OTUs. \n",
+      "Dummy ranks may be included among taxonomic ranks now."
+    )
+    # Will want to return without further modifying char.vec
+    taxvec = char.vec
+    # Replace names of taxvec according to prefix, if any present...
+  } else {
+    # Format character vectors for Ambiguous taxa
+    if( length(ti) < 7 ){
+        for (key in names(char.vec)) {
+          if ( char.vec[key] == "Ambiguous_taxa" ) {
+            tax_no <- (as.numeric(substr(key, 5, 5)) - 1)
+            char.vec[key] = sprintf("D_%s__Ambiguous_taxa", tax_no)
+          }
+        }
+      # Reset the trimmed indicies if Ambiguous taxa
+      ti = grep("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", char.vec)
+    }
+    # Remove prefix using sub-"" regexp, call result taxvec
+    taxvec = gsub("[[:alpha:]]\\_[[:digit:]]{1}\\_\\_", "", char.vec)
+    # Define the ranks that will be replaced
+    repranks = Tranks[substr(char.vec[ti], 1, 3)]
+    # Replace, being sure to avoid prefixes notK present in Tranks
+    names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)]
+  }
+  return(taxvec)
+}