Skip to content

Instantly share code, notes, and snippets.

@walterst
Last active April 17, 2018 12:57
Show Gist options
  • Select an option

  • Save walterst/96de484cdcfb13c9bed3fa27d8576dce to your computer and use it in GitHub Desktop.

Select an option

Save walterst/96de484cdcfb13c9bed3fa27d8576dce to your computer and use it in GitHub Desktop.

Revisions

  1. walterst revised this gist Apr 17, 2018. 1 changed file with 10 additions and 7 deletions.
    17 changes: 10 additions & 7 deletions find_fastq_errors.py
    Original file line number Diff line number Diff line change
    @@ -2,21 +2,24 @@

    # Used to find fastq seqs in gzipped files, write first error, if any, to a log file
    # Usage: python find_fastq_errors.py fastq_folder log_file
    # where fastq_folder has all of the fastq files in it (doesn't search subdirectories)
    # where fastq_folder has all of the fastq files in it-will search subdirectories

    from sys import argv
    from glob import glob

    import gzip
    import os

    header_index = 0
    sequence_index = 1
    quality_index = 2

    output_log = open(argv[2], "w")
    fastq_files = []

    fastq_files = glob(argv[1] + "/*.gz") + glob(argv[1] + "/*.fastq")
    fastq_files.extend(glob(argv[1] + "/*.fastq.gz") + glob(argv[1] + "/*.fastq"))

    for root,dirs,files in os.walk(argv[1]):
    for curr_dir in dirs:
    fastq_files.extend(glob(curr_dir + "/*.fastq.gz") + glob(curr_dir + "/*.fastq"))

    output_log = open(argv[2], "w")



    @@ -50,6 +53,6 @@
    break

    if error_data:
    output_log.write("%s\t%s\n" % (curr_file, error_data))
    output_log.write("%s\n%s\n" % (curr_file, error_data))

    query_reads.close()
  2. walterst created this gist Apr 17, 2018.
    55 changes: 55 additions & 0 deletions find_fastq_errors.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,55 @@
    #!/usr/bin/env python

    # Used to find fastq seqs in gzipped files, write first error, if any, to a log file
    # Usage: python find_fastq_errors.py fastq_folder log_file
    # where fastq_folder has all of the fastq files in it (doesn't search subdirectories)

    from sys import argv
    from glob import glob

    import gzip

    header_index = 0
    sequence_index = 1
    quality_index = 2


    fastq_files = glob(argv[1] + "/*.gz") + glob(argv[1] + "/*.fastq")

    output_log = open(argv[2], "w")



    # Expected to be 4 line per sequence, no empty lines, this is limited parser
    for curr_file in fastq_files:

    if curr_file.endswith('.gz'):
    query_reads = gzip.open(curr_file, "rb")
    else:
    query_reads = open(curr_file, "U")

    keep_reading = True
    error_data = ""
    while keep_reading:
    label = query_reads.readline().strip()
    if(label==""): # hit end of file
    keep_reading = False
    break
    if not label.startswith("@"):
    error_data += "Found read label without @: %s\n" % label
    seq = query_reads.readline().strip()
    opt_label = query_reads.readline().strip()
    if not opt_label.startswith("+"):
    error_data += "Found optional read label without +: %s\n" % opt_label
    qual = query_reads.readline().strip()

    if len(seq) != len(qual):
    error_data += "Found seq and qual of unequal lengths: \n%s\n%s" % (seq, qual)

    if error_data:
    break

    if error_data:
    output_log.write("%s\t%s\n" % (curr_file, error_data))

    query_reads.close()