walterst · April 17, 2018 12:57 · Apr 17, 2018 · Apr 17, 2018
diff --git a/find_fastq_errors.py b/find_fastq_errors.py
@@ -2,21 +2,24 @@
 
 # Used to find fastq seqs in gzipped files, write first error, if any, to a log file
 # Usage:  python find_fastq_errors.py fastq_folder log_file
-# where fastq_folder has all of the fastq files in it (doesn't search subdirectories)
+# where fastq_folder has all of the fastq files in it (subdirectories are searched too)
 
 from sys import argv
 from glob import glob
 
 import gzip
+import os
 
-header_index = 0
-sequence_index = 1
-quality_index = 2
 
+output_log = open(argv[2], "w")
+fastq_files = []
 
-fastq_files = glob(argv[1] + "/*.gz") + glob(argv[1] + "/*.fastq")       
+fastq_files.extend(glob(argv[1] + "/*.fastq.gz") + glob(argv[1] + "/*.fastq"))
+
+for root,dirs,files in os.walk(argv[1]):
+    for curr_dir in dirs:
+        fastq_files.extend(glob(curr_dir + "/*.fastq.gz") + glob(curr_dir + "/*.fastq"))
 
-output_log = open(argv[2], "w")
 
 
 
@@ -50,6 +53,6 @@
             break
 
     if error_data:
-        output_log.write("%s\t%s\n" % (curr_file, error_data))
+        output_log.write("%s\n%s\n" % (curr_file, error_data))
 
     query_reads.close()
diff --git a/find_fastq_errors.py b/find_fastq_errors.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+# Used to find fastq seqs in gzipped files, write first error, if any, to a log file
+# Usage:  python find_fastq_errors.py fastq_folder log_file
+# where fastq_folder has all of the fastq files in it (doesn't search subdirectories)
+
+from sys import argv
+from glob import glob
+
+import gzip
+
+header_index = 0
+sequence_index = 1
+quality_index = 2
+
+
+fastq_files = glob(argv[1] + "/*.gz") + glob(argv[1] + "/*.fastq")       
+
+output_log = open(argv[2], "w")
+
+
+
+# Expects exactly 4 lines per sequence and no empty lines; this is a limited parser
+for curr_file in fastq_files:  # check each file record by record; log the first malformed record, if any
+
+    if curr_file.endswith('.gz'):  # pick the opener from the file extension
+        query_reads = gzip.open(curr_file, "rb")  # NOTE(review): binary mode; on Python 3 this yields bytes, unlike the str checks below - confirm interpreter
+    else:
+        query_reads = open(curr_file, "U")
+
+    keep_reading = True
+    error_data = ""  # stays empty unless one of the checks below fails for this file
+    while keep_reading:
+        label = query_reads.readline().strip()
+        if(label==""):   # empty after strip: end of file (a whitespace-only line also lands here)
+            keep_reading = False
+            break
+        if not label.startswith("@"):
+            error_data += "Found read label without @: %s\n" % label
+        seq = query_reads.readline().strip()
+        opt_label = query_reads.readline().strip()
+        if not opt_label.startswith("+"):
+            error_data += "Found optional read label without +: %s\n" % opt_label
+        qual = query_reads.readline().strip()
+
+        if len(seq) != len(qual):
+            error_data += "Found seq and qual of unequal lengths: \n%s\n%s" % (seq, qual)
+
+        if error_data:  # stop at the first bad record; all messages gathered for that record are kept
+            break
+
+    if error_data:  # files with no detected problem produce no log entry
+        output_log.write("%s\t%s\n" % (curr_file, error_data))
+
+    query_reads.close()
No results found