#!/usr/bin/env python
"""Find malformed records in FASTQ files and write them to a log file.

Used to find fastq seqs in plain or gzipped files and write the first error,
if any, of each file to a log file.

Usage: python find_fastq_errors.py fastq_folder log_file

where fastq_folder has all of the fastq files in it; subdirectories are
searched as well.
"""

from sys import argv
from glob import glob
import gzip
import os
import zlib

# Filename patterns recognised as FASTQ input, in the order they are searched.
FASTQ_PATTERNS = ("*.fastq.gz", "*.fastq")


def find_fastq_files(fastq_folder):
    """Return the FASTQ files found in *fastq_folder* and all subdirectories.

    Directories are visited top-down in sorted order and matches are sorted
    within each directory, so the result (and the log) is deterministic.
    """
    fastq_files = []
    for root, dirs, _files in os.walk(fastq_folder):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for pattern in FASTQ_PATTERNS:
            # Join with root: os.walk yields bare directory names, which are
            # only meaningful relative to the directory being walked.
            fastq_files.extend(sorted(glob(os.path.join(root, pattern))))
    return fastq_files


def open_fastq(curr_file):
    """Open a plain or gzipped FASTQ file for reading as text.

    Both kinds are opened in text mode so that lines are ``str`` and all
    newline conventions are translated to ``\\n``. Bytes outside ASCII are
    replaced rather than raising, so a damaged file is still examined.
    """
    if curr_file.endswith(".gz"):
        return gzip.open(curr_file, "rt", encoding="ascii", errors="replace")
    return open(curr_file, "r", encoding="ascii", errors="replace")


def find_first_error(query_reads):
    """Return a description of the first malformed record, or "" if none.

    Expected to be 4 lines per sequence with no empty lines; this is a
    limited parser. A blank label line is treated as the end of the file,
    so anything after it is not examined.

    query_reads: an open text-mode file object positioned at a record start.
    """
    while True:
        label = query_reads.readline().strip()
        if not label:
            # Hit end of file (or a blank line, which this parser treats alike).
            return ""

        problems = []
        if not label.startswith("@"):
            problems.append("Found read label without @: %s\n" % label)

        seq = query_reads.readline().strip()
        opt_label = query_reads.readline().strip()
        if not opt_label.startswith("+"):
            problems.append(
                "Found optional read label without +: %s\n" % opt_label)

        qual = query_reads.readline().strip()
        if len(seq) != len(qual):
            problems.append(
                "Found seq and qual of unequal lengths: \n%s\n%s" % (seq, qual))

        if problems:
            # Only the first bad record of a file is reported.
            return "".join(problems)


def scan_folder(fastq_folder, log_path):
    """Check every FASTQ file under *fastq_folder*, logging errors to *log_path*.

    The log receives one entry per problematic file: the file path followed
    by the error description. Files that cannot be read at all (for example
    a corrupt gzip stream) are logged and skipped instead of aborting the
    whole scan.
    """
    # UTF-8 so that replacement characters from damaged input can be written.
    with open(log_path, "w", encoding="utf-8") as output_log:
        for curr_file in find_fastq_files(fastq_folder):
            try:
                with open_fastq(curr_file) as query_reads:
                    error_data = find_first_error(query_reads)
            except (OSError, EOFError, zlib.error) as err:
                error_data = "Could not read file: %s\n" % err
            if error_data:
                output_log.write("%s\n%s\n" % (curr_file, error_data))


def main():
    """Run the scan using the command-line arguments."""
    if len(argv) != 3:
        raise SystemExit(
            "Usage: python find_fastq_errors.py fastq_folder log_file")
    scan_folder(argv[1], argv[2])


if __name__ == "__main__":
    main()