#!/usr/bin/env python

# Used to find fastq seqs in gzipped files, write first error, if any, to a log file
# Usage:  python find_fastq_errors.py fastq_folder log_file
# where fastq_folder has all of the fastq files in it-will search subdirectories

from sys import argv
from glob import glob

import gzip
import os


output_log = open(argv[2], "w")
fastq_files = []

fastq_files.extend(glob(argv[1] + "/*.fastq.gz") + glob(argv[1] + "/*.fastq"))

for root,dirs,files in os.walk(argv[1]):
    for curr_dir in dirs:
        fastq_files.extend(glob(curr_dir + "/*.fastq.gz") + glob(curr_dir + "/*.fastq"))


# Expected to be 4 line per sequence, no empty lines, this is limited parser
for curr_file in fastq_files:

    if curr_file.endswith('.gz'):
        query_reads = gzip.open(curr_file, "rb")
    else:
        query_reads = open(curr_file, "U")

    keep_reading = True
    error_data = ""
    while keep_reading:
        label = query_reads.readline().strip()
        if(label==""):   # hit end of file
            keep_reading = False
            break
        if not label.startswith("@"):
            error_data += "Found read label without @: %s\n" % label
        seq = query_reads.readline().strip()
        opt_label = query_reads.readline().strip()
        if not opt_label.startswith("+"):
            error_data += "Found optional read label without +: %s\n" % opt_label
        qual = query_reads.readline().strip()
        
        if len(seq) != len(qual):
            error_data += "Found seq and qual of unequal lengths: \n%s\n%s" % (seq, qual)
            
        if error_data:
            break
            
    if error_data:
        output_log.write("%s\n%s\n" % (curr_file, error_data))
        
    query_reads.close()