Last active
April 17, 2018 12:57
-
-
Save walterst/96de484cdcfb13c9bed3fa27d8576dce to your computer and use it in GitHub Desktop.
Revisions
-
walterst revised this gist
Apr 17, 2018 . 1 changed file with 10 additions and 7 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -2,21 +2,24 @@ # Used to find fastq seqs in gzipped files, write first error, if any, to a log file # Usage: python find_fastq_errors.py fastq_folder log_file # where fastq_folder has all of the fastq files in it-will search subdirectories from sys import argv from glob import glob import gzip import os output_log = open(argv[2], "w") fastq_files = [] fastq_files.extend(glob(argv[1] + "/*.fastq.gz") + glob(argv[1] + "/*.fastq")) for root,dirs,files in os.walk(argv[1]): for curr_dir in dirs: fastq_files.extend(glob(curr_dir + "/*.fastq.gz") + glob(curr_dir + "/*.fastq")) @@ -50,6 +53,6 @@ break if error_data: output_log.write("%s\n%s\n" % (curr_file, error_data)) query_reads.close() -
walterst created this gist
Apr 17, 2018 . There are no files selected for viewing
#!/usr/bin/env python
"""Find the first formatting error, if any, in each FASTQ file of a folder.

Usage: python find_fastq_errors.py fastq_folder log_file

where fastq_folder has all of the fastq files in it (doesn't search
subdirectories).  Every ``*.gz`` and ``*.fastq`` file directly inside
fastq_folder is checked; for each file with a problem, one entry
"<file path><TAB><error text>" is written to log_file.

This is a limited parser: it expects exactly 4 lines per sequence and no
empty lines.  An empty (or whitespace-only) line where a read label is
expected is treated as the end of the file.

Written for Python 3 (gzipped input is opened in text mode).
"""
from sys import argv
from glob import glob
import gzip
import os
import zlib

# Positions of the fields within a parsed read; not referenced by this script
# itself, kept so the module still exposes the same names.
header_index = 0
sequence_index = 1
quality_index = 2


def open_fastq(path):
    """Open the FASTQ file at *path* for reading as text.

    Files whose name ends in ".gz" are opened through gzip, everything else
    as a plain text file.  The caller is responsible for closing the handle.
    """
    if path.endswith(".gz"):
        # "rt" rather than "rb": binary mode yields bytes on Python 3, and
        # comparing/matching those against str ("@", "+", "") fails.
        return gzip.open(path, "rt")
    # Plain "r" already translates all newline styles on Python 3; the old
    # "U" mode is rejected by Python 3.11+.
    return open(path, "r")


def first_fastq_error(query_reads):
    """Return the error text for the first malformed record, or "" if none.

    *query_reads* is any text-mode file-like object with ``readline()``.
    Records are read four lines at a time (label, sequence, optional label,
    quality).  Reading stops at the first record that has a problem; all
    problems found in that one record are concatenated in the result.
    """
    while True:
        label = query_reads.readline().strip()
        if label == "":
            # Hit end of file (or an empty line, which this limited parser
            # does not distinguish from it).
            return ""

        error_data = ""
        if not label.startswith("@"):
            error_data += "Found read label without @: %s\n" % label

        seq = query_reads.readline().strip()
        opt_label = query_reads.readline().strip()
        if not opt_label.startswith("+"):
            error_data += ("Found optional read label without +: %s\n"
                           % opt_label)

        qual = query_reads.readline().strip()
        if len(seq) != len(qual):
            error_data += ("Found seq and qual of unequal lengths: \n%s\n%s"
                           % (seq, qual))

        if error_data:
            return error_data


def scan_fastq_file(path):
    """Return the first error found in the file at *path*, or "" if none.

    A file that cannot be opened, decompressed or decoded is reported as an
    error for that file instead of aborting the scan of the remaining files.
    """
    try:
        with open_fastq(path) as query_reads:
            return first_fastq_error(query_reads)
    except (OSError, EOFError, zlib.error, UnicodeDecodeError) as err:
        return "Could not read file: %s\n" % err


def main(args):
    """Check every FASTQ file in args[1] and write the errors to args[2].

    *args* has the same layout as ``sys.argv``: program name, FASTQ folder,
    log file path.  Exits with a usage message when arguments are missing.
    """
    if len(args) < 3:
        raise SystemExit(
            "Usage: python find_fastq_errors.py fastq_folder log_file")

    fastq_folder = args[1]
    fastq_files = (glob(os.path.join(fastq_folder, "*.gz")) +
                   glob(os.path.join(fastq_folder, "*.fastq")))

    # The with-block guarantees the log is flushed and closed even if a
    # file raises an unexpected error part-way through the scan.
    with open(args[2], "w") as output_log:
        for curr_file in fastq_files:
            error_data = scan_fastq_file(curr_file)
            if error_data:
                output_log.write("%s\t%s\n" % (curr_file, error_data))


if __name__ == "__main__":
    main(argv)