Last active
June 19, 2022 12:20
-
-
Save avrilcoghlan/4080983 to your computer and use it in GitHub Desktop.
Revisions
-
avrilcoghlan revised this gist
Feb 28, 2013 . 1 changed file with 386 additions and 185 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -45,7 +45,8 @@ =head1 CONTACT # Perl script run_genewisedb_afterblast.pl # Written by Avril Coghlan ([email protected]) # 8-Nov-12. # Last edited 27-Nov-2012. # SCRIPT SYNOPSIS: run_genewisedb_afterblast.pl: run GeneWise using TreeFam HMMs, in scaffold regions that have blast matches to the proteins used to build the HMMs. # #------------------------------------------------------------------# @@ -133,6 +134,8 @@ =head1 CONTACT # TEST SUBROUTINES: my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. &test_update_feature_types_in_gff($outputdir); &test_update_genewise_output($outputdir); &test_make_seqfile($outputdir); &test_check_if_have_hmm_for_family($outputdir); &test_print_error; @@ -142,8 +145,8 @@ =head1 CONTACT &test_run_genewise_on_scaffolds($outputdir,$blast_path); &test_find_pos_of_hmms($outputdir); &test_check_formatdb_files($outputdir); &test_find_pos_of_scaffolds($outputdir); print STDERR "Tests done, running main code...\n"; #------------------------------------------------------------------# @@ -176,30 +179,185 @@ sub run_main_program my $TAKE_FAMILY; # HASH TABLE THAT SAYS WHETHER TO TAKE A PARTICULAR FAMILY # RECORD THE POSITIONS OF THE HMMs IN $input_hmms: print STDERR "Reading positions of the HMMs in $input_hmms...\n"; ($hmm_pos_file,$TAKE_FAMILY,$errorcode,$errormsg) = &find_pos_of_hmms($input_hmms,$outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # RECORD THE POSITION OF THE SCAFFOLDS IN $input_fasta: print STDERR "Reading the positions of the scaffolds in $input_fasta...\n"; ($scaffold_pos_file,$errorcode,$errormsg) = &find_pos_of_scaffolds($input_fasta,$outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # RUN GENEWISE WITH ONE SCAFFOLD/CONTIG AT A TIME, ON ONE HMM AT A TIME: print STDERR "Running genewise on scaffolds to make output file $outputdir/$output...\n"; ($errorcode,$errormsg) = &run_genewise_on_scaffolds($outputdir,$input_fasta,$input_hmms,$output,$spliceflat,$parameterfile,$treefam_seqs, $evalue_cutoff,$flank_length,0,$hmm_pos_file,$blast_path,$scaffold_pos_file,$TAKE_FAMILY); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } print STDERR "Post-processing genewise output file $outputdir/$output...\n"; # REPLACE 'match' IN THE FEATURE COLUMN WITH 'gene', AND 'cds' WITH 'CDS': ($errorcode,$errormsg) = &update_feature_types_in_gff($output,$outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # DELETE TEMPORARY FILES: system "rm -f $hmm_pos_file"; } #------------------------------------------------------------------# # TEST &update_feature_types_in_gff sub test_update_feature_types_in_gff { my $outputdir = $_[0]; # DIRECTORY FOR PUTTING OUTPUT FILES INTO my $output; # OUTPUT GFF FILE my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR my $expected_output; # FILE WITH THE EXPECTED CONTENTS OF $output my $differences; # DIFFERENCES BETWEEN $output AND $expected_output my $length_differences; # LENGTH OF $differences my $line; # ($output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(OUTPUT,">$output") || die "ERROR: test_update_feature_types_in_gff: cannot open $output\n"; print OUTPUT "TF101001\n"; print OUTPUT "V:15395346..15397430 GeneWise match 2 1681 514.02 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 2 11 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 12 342 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 343 396 0.00 + 2 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 397 448 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 449 543 0.00 + 2 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 544 594 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 595 1017 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 1018 1066 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 1067 1560 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 1561 1605 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 1606 1681 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print OUTPUT "V:15395346..15397430 GeneWise match 37437 63766 -315.16 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 37437 62083 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 62084 62744 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 62745 63102 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 63103 63151 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 63152 63645 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise intron 63646 63690 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 63691 63766 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print OUTPUT "V:15395346..15397430 GeneWise match 64165 64167 0.12 - . ID=V:15395346..15397430-TF101001-genewise-prediction-3\n"; print OUTPUT "V:15395346..15397430 GeneWise cds 64165 64167 0.00 - 0 ID=V:15395346..15397430-TF101001-genewise-prediction-3\n"; print OUTPUT "//\n"; close(OUTPUT); ($errorcode,$errormsg) = &update_feature_types_in_gff($output,$outputdir); if ($errorcode != 0) { print STDERR "ERROR: test_update_feature_types_in_gff: failed test1\n"; exit;} ($expected_output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_output") || die "ERROR: test_update_feature_types_in_gff: cannot open $expected_output\n"; print EXPECTED "TF101001\n"; print EXPECTED "V:15395346..15397430 GeneWise gene 2 1681 514.02 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 2 11 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 12 342 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 343 396 0.00 + 2 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 397 448 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 449 543 0.00 + 2 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 544 594 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 595 1017 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 1018 1066 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 1067 1560 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 1561 1605 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 1606 1681 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise gene 37437 63766 -315.16 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 37437 62083 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 62084 62744 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 62745 63102 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 63103 63151 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 63152 63645 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 63646 63690 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 63691 63766 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise gene 64165 64167 0.12 - . ID=V:15395346..15397430-TF101001-genewise-prediction-3\n"; print EXPECTED "V:15395346..15397430 GeneWise CDS 64165 64167 0.00 - 0 ID=V:15395346..15397430-TF101001-genewise-prediction-3\n"; print EXPECTED "//\n"; close(EXPECTED); $differences = ""; open(TEMP,"diff $expected_output $output |"); while(<TEMP>) { $line = $_; $differences = $differences.$line; } close(TEMP); $length_differences = length($differences); if ($length_differences != 0) { print STDERR "ERROR: test_update_feature_types_in_gff: failed test1 (output $output expected_output $expected_output)\n"; exit;} system "rm -f $expected_output"; system "rm -f $output"; } #------------------------------------------------------------------# # REPLACE 'match' IN THE FEATURE COLUMN WITH 'gene', AND 'cds' WITH 'CDS': # THIS IS REQUIRED TO USE THE GENEWISE OUTPUT AS INPUT TO MAKER (WHICH REQUIRES gene/CDS FEATURE TYPES): sub update_feature_types_in_gff { my $output = $_[0]; # NAME OF THE OUTPUT GFF FILE my $outputdir = $_[1]; # DIRECTORY TO PUT OUTPUT FILES INTO my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR my $cmd; # COMMAND TO RUN # REPLACE 'cds' WITH 'CDS' IN THE GFF FILE: $cmd = "sed -i \'s/cds/CDS/g\' $output"; system "$cmd"; sleep(60); # FORCE PERL TO WAIT UNTIL THE 'sed' JOB IS COMPLETE (THIS IS NECESSARY) # REPLACE 'match' WITH 'gene' IN THE GFF FILE: $cmd = "sed -i \'s/match/gene/g\' $output"; system "$cmd"; sleep(60); # FORCE PERL TO WAIT UNTIL THE 'sed' JOB IS COMPLETE (THIS IS NECESSARY) return($errorcode,$errormsg); } #------------------------------------------------------------------# # SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE: sub make_filename { my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET my $filename = "none";# NEW FILE NAME TO USE my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR my $poss_filename; # POSSIBLE FILE NAME TO USE my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME while($found_name == 0) { $random_number = rand(); $poss_filename = $outputdir."/tmp".$random_number; if (!(-e $poss_filename)) { $filename = $poss_filename; $found_name = 1; } } if ($found_name == 0 || $filename eq 'none') { $errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n"; $errorcode = 6; # ERRORCODE=6 return($filename,$errorcode,$errormsg); } return($filename,$errorcode,$errormsg); } #------------------------------------------------------------------# # TEST &find_pos_of_hmms sub test_find_pos_of_hmms { my $outputdir = $_[0]; # DIRECTORY FOR WRITING OUTPUT FILES IN my $input_hmmfile; # INPUT FILE OF HMMs my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR @@ -210,8 +368,8 @@ sub test_find_pos_of_hmms my $line; # my $TAKE_FAMILY; # HASH TABLE THAT SAYS WHETHER TO TAKE A PARTICULAR FAMILY ($input_hmmfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(HMMFILE,">$input_hmmfile") || die "ERROR: test_find_pos_of_hmms: cannot open $input_hmmfile\n"; print HMMFILE "HMMER2.0 [2.3.2]\n"; print HMMFILE "NAME TF101093.0\n"; @@ -240,8 +398,8 @@ sub test_find_pos_of_hmms !(defined($TAKE_FAMILY->{"TF101095"})) || !(defined($TAKE_FAMILY->{"TF101096"})) || !(defined($TAKE_FAMILY->{"TF101097"})) || !(defined($TAKE_FAMILY->{"TF101098"})) || !(defined($TAKE_FAMILY->{"TF101099"})) ) { print STDERR "ERROR: test_find_pos_of_hmms: failed test1\n"; exit;} ($expected_hmm_pos_file,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_hmm_pos_file") || die "ERROR: test_find_pos_of_hmms: cannot open $expected_hmm_pos_file\n"; print EXPECTED "TF101093 3\n"; print EXPECTED "TF101094 6\n"; @@ -275,7 +433,6 @@ sub test_find_pos_of_hmms sub test_find_pos_of_scaffolds { my $outputdir = $_[0]; ## DIRECTORY FOR WRITING OUTPUT FILES IN my $input_fasta; ## INPUT FILE OF SCAFFOLDS my $errorcode; ## RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR my $errormsg; ## RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR @@ -285,8 +442,8 @@ sub test_find_pos_of_scaffolds my $length_differences; ## LENGTH OF $differences my $line; ## ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(FASTA,">$input_fasta") || die "ERROR: test_find_pos_of_scaffolds: cannot open $input_fasta\n"; print FASTA ">scaffold2\n"; print FASTA "ABCDEFG\n"; @@ -301,8 +458,8 @@ sub test_find_pos_of_scaffolds close(FASTA); ($scaffold_pos_file,$errorcode,$errormsg) = &find_pos_of_scaffolds($input_fasta,$outputdir); if ($errorcode != 0) { print STDERR "ERROR: test_find_pos_of_scaffolds: failed test1\n"; exit;} ($expected_scaffold_pos_file,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_scaffold_pos_file") || die "ERROR: test_find_pos_of_scaffolds: cannot open $expected_scaffold_pos_file\n"; print EXPECTED "scaffold2 2\n"; print EXPECTED "scaffold3 5\n"; @@ -341,12 +498,11 @@ sub find_pos_of_scaffolds my $line_no; ## LINE NUMBER THAT THE NAME OF A SCAFFOLD APPEARS ON my $scaffold; ## SCAFFOLD IN $input_fasta my $prev_scaffold = "none";## PREVIOUS SCAFFOLD IN $input_fasta my $i; ## # OPEN A FILE TO PUT THE POSITIONS OF SCAFFOLDS IN $input_fasta: ($scaffold_pos_file,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(SCAFFOLD_POS,">$scaffold_pos_file") || die "ERROR: find_pos_of_scaffolds: cannot open $scaffold_pos_file\n"; # FIND THE LINES THAT SEQUENCES START ON: @@ -401,12 +557,11 @@ sub find_pos_of_hmms my $line_no; # LINE NUMBER THAT THE NAME OF A HMM APPEARS ON my $family; # TREEFAM FAMILY FOR A HMM IN $input_hmms my $prev_family = "none";# TREEFAM FAMILY FOR PREVIOUS HMM IN $input_hmms my %TAKE_FAMILY = (); # HASH TABLE THAT SAYS WHETHER TO TAKE A PARTICULAR FAMILY # OPEN A FILE TO PUT THE POSITIONS OF HMMs IN $input_hmms: ($hmm_pos_file,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(HMM_POS,">$hmm_pos_file") || die "ERROR: find_pos_of_hmms: cannot open $hmm_pos_file\n"; # FIND THE LINES THAT HMMs END ON: @@ -462,7 +617,6 @@ sub test_run_genewise_on_scaffolds my $input_hmms; # INPUT FILE OF HMMs my $fam_seqs; # INPUT FILE WITH SEQUENCES IN FAMILIES. my $output; # OUTPUT FILE FOR GENEWISE OUTPUT my $expected_output; # FILE CONTAINING THE EXPECTED CONTENT OF $output my $differences; # DIFFERENCES BETWEEN $output AND $expected_output my $length_differences; # LENGTH OF $differences @@ -473,13 +627,16 @@ sub test_run_genewise_on_scaffolds my $i; # my $scaffold_pos_file; # FILE WITH POSITIONS OF SCAFFOLDS IN $input_fasta my $TAKE_FAMILY; # HASH TABLE THAT SAYS WHETHER TO TAKE A FAMILY my @temp; # # DEFINE THE OUTPUT FILE NAME FOR GENEWISE OUTPUT: ($output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } @temp = split(/\//,$output); $output = $temp[$#temp]; # WRITE A FILE WITH THE INPUT SEQUENCES IN THE FAMILY: ($fam_seqs,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(FAM_SEQS,">$fam_seqs") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $fam_seqs\n"; print FAM_SEQS "TF101001\n"; print FAM_SEQS ">At1g16330.1\n"; @@ -537,8 +694,8 @@ sub test_run_genewise_on_scaffolds print FAM_SEQS "#END\n"; close(FAM_SEQS); # WRITE AN INPUT FASTA FILE: ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_FASTA,">$input_fasta") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $input_fasta\n"; print INPUT_FASTA ">V:15395346..15397430\n"; print INPUT_FASTA "gaatgttattttcttatcaataagcacattttctatcaaacccagttcactttcaaattt\n"; @@ -617,8 +774,8 @@ sub test_run_genewise_on_scaffolds print INPUT_FASTA "cttgatcggcgcaactttacaagcggtttttgtccttacagtgct\n"; close(INPUT_FASTA); # WRITE AN INPUT FILE OF HMMs: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF101001\n"; @@ -1978,32 +2135,32 @@ sub test_run_genewise_on_scaffolds $blast_path,$scaffold_pos_file,$TAKE_FAMILY); if ($errorcode != 0) { print STDERR "ERROR: test_run_genewise_on_scaffolds: failed test1\n"; exit;} # WRITE A FILE WITH THE EXPECTED OUTPUT: ($expected_output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_output") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $expected_output\n"; print EXPECTED "TF101001\n"; print EXPECTED "V:15395346..15397430 GeneWise match 2 1681 514.02 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 2 11 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 12 342 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 343 396 0.00 + 2 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 397 448 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 449 543 0.00 + 2 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 544 594 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 595 1017 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 1018 1066 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 1067 1560 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 1561 1605 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 1606 1681 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-1\n"; print EXPECTED "V:15395346..15397430 GeneWise match 37437 63766 -315.16 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 37437 62083 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 62084 62744 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 62745 63102 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 63103 63151 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 63152 63645 0.00 + 0 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise intron 63646 63690 0.00 + . ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 63691 63766 0.00 + 1 ID=V:15395346..15397430-TF101001-genewise-prediction-2\n"; print EXPECTED "V:15395346..15397430 GeneWise match 64165 64167 0.12 - . ID=V:15395346..15397430-TF101001-genewise-prediction-3\n"; print EXPECTED "V:15395346..15397430 GeneWise cds 64165 64167 0.00 - 0 ID=V:15395346..15397430-TF101001-genewise-prediction-3\n"; print EXPECTED "//\n"; close(EXPECTED); $differences = ""; @@ -2036,11 +2193,13 @@ sub test_run_genewise_on_scaffolds system "rm -f $formatdbfile"; # DEFINE THE OUTPUT FILE NAME FOR GENEWISE OUTPUT: ($output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } @temp = split(/\//,$output); $output = $temp[$#temp]; # WRITE A FILE WITH THE INPUT SEQUENCES IN THE FAMILY: ($fam_seqs,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(FAM_SEQS,">$fam_seqs") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $fam_seqs\n"; print FAM_SEQS "TF101526\n"; print FAM_SEQS ">At1g29550.1\n"; @@ -2335,8 +2494,8 @@ sub test_run_genewise_on_scaffolds close(FAM_SEQS); # WRITE AN INPUT FASTA FILE: ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_FASTA,">$input_fasta") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $input_fasta\n"; print INPUT_FASTA ">scaffold2.size1663701.3.1253264-1653727\n"; print INPUT_FASTA "TTTCCCTTCTCTATGAAAGCTTTGATATTGAGCTTTTGTCAATGATGAATGATCCCTTAA\n"; @@ -3927,8 +4086,8 @@ sub test_run_genewise_on_scaffolds print INPUT_FASTA "T\n"; close(INPUT_FASTA); # WRITE AN INPUT FILE OF HMMs: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF105015\n"; @@ -7182,46 +7341,46 @@ sub test_run_genewise_on_scaffolds $blast_path,$scaffold_pos_file,$TAKE_FAMILY); if ($errorcode != 0) { print STDERR "ERROR: test_run_genewise_on_scaffolds: failed test2\n"; exit;} # WRITE A FILE WITH THE EXPECTED OUTPUT: ($expected_output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_output") || die "ERROR: test_run_genewise_on_scaffolds: cannot open $expected_output\n"; print EXPECTED "TF101526\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise match 2 4 -17.91 + . ID=scaffold2.size1663701.3.1253264-1653727-TF101526-genewise-prediction-1\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise cds 2 4 0.00 + 0 ID=scaffold2.size1663701.3.1253264-1653727-TF101526-genewise-prediction-1\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise match 7 15 2.43 + . ID=scaffold2.size1663701.3.1253264-1653727-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise cds 7 15 0.00 + 0 ID=scaffold2.size1663701.3.1253264-1653727-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold5.size1039674 GeneWise match 2 7 -16.98 + . ID=scaffold5.size1039674-TF101526-genewise-prediction-1\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 2 7 0.00 + 0 ID=scaffold5.size1039674-TF101526-genewise-prediction-1\n"; print EXPECTED "scaffold5.size1039674 GeneWise match 9 231 -14.55 + . ID=scaffold5.size1039674-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 9 43 0.00 + 0 ID=scaffold5.size1039674-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 44 58 0.00 + . ID=scaffold5.size1039674-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 59 122 0.00 + 1 ID=scaffold5.size1039674-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 123 189 0.00 + . ID=scaffold5.size1039674-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 190 231 0.00 + 0 ID=scaffold5.size1039674-TF101526-genewise-prediction-2\n"; print EXPECTED "scaffold5.size1039674 GeneWise match 44996 44998 2.11 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-3\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 44996 44998 0.00 - 0 ID=scaffold5.size1039674-TF101526-genewise-prediction-3\n"; print EXPECTED "scaffold5.size1039674 GeneWise match 9587 11764 94.51 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 11737 11764 0.00 - 0 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 11099 11736 0.00 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 10994 11098 0.00 - 2 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 10932 10993 0.00 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 10857 10931 0.00 - 2 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 10620 10856 0.00 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 10543 10619 0.00 - 2 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 10419 10542 0.00 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 10284 10418 0.00 - 0 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 9915 10283 0.00 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 9778 9914 0.00 - 0 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 9761 9777 0.00 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 9734 9760 0.00 - 1 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise intron 9597 9733 0.00 - . ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "scaffold5.size1039674 GeneWise cds 9587 9596 0.00 - 1 ID=scaffold5.size1039674-TF101526-genewise-prediction-4\n"; print EXPECTED "//\n"; print EXPECTED "TF105015\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise match 49989 50000 -11.34 - . ID=scaffold2.size1663701.3.1253264-1653727-TF105015-genewise-prediction-1\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise cds 49989 50000 0.00 - 0 ID=scaffold2.size1663701.3.1253264-1653727-TF105015-genewise-prediction-1\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise match 49945 49986 6.12 - . ID=scaffold2.size1663701.3.1253264-1653727-TF105015-genewise-prediction-2\n"; print EXPECTED "scaffold2.size1663701.3.1253264-1653727 GeneWise cds 49945 49986 0.00 - 0 ID=scaffold2.size1663701.3.1253264-1653727-TF105015-genewise-prediction-2\n"; print EXPECTED "//\n"; close(EXPECTED); $differences = ""; @@ -7265,15 +7424,14 @@ sub test_make_hmmfile my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR my $hmmfile; # FILE CONTAINING A HMM my $input_hmms; # INPUT FILE OF HMMS my $expected_hmmfile; # FILE CONTAINING EXPECTED CONTENT OF $hmmfile my $differences; # DIFFERENCES BETWEEN $hmmfile AND $expected_hmmfile my $length_differences; # LENGTH OF $differences my $line; # # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_make_hmmfile: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF350597.0\n"; @@ -7294,8 +7452,8 @@ sub test_make_hmmfile ($hmmfile,$errorcode,$errormsg) = &make_hmmfile($outputdir,$input_hmms,6,10); if ($errorcode != 0) { print STDERR "ERROR: test_make_hmmfile: failed test1\n"; exit;} # OPEN A FILE CONTAINING THE EXPECTED CONTENTS OF $hmmfile: ($expected_hmmfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_hmmfile") || die "ERROR: test_make_hmmfile: cannot open $expected_hmmfile\n"; print EXPECTED "HMMER2.0 [2.3.2]\n"; print EXPECTED "NAME TF101000.0\n"; @@ -7318,8 +7476,8 @@ sub test_make_hmmfile system "rm -f $expected_hmmfile"; # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_make_hmmfile: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF350597.0\n"; @@ -7340,8 +7498,8 @@ sub test_make_hmmfile ($hmmfile,$errorcode,$errormsg) = &make_hmmfile($outputdir,$input_hmms,11,15); if ($errorcode != 0) { print STDERR "ERROR: test_make_hmmfile: failed test2\n"; exit;} # OPEN A FILE CONTAINING THE EXPECTED CONTENTS OF $hmmfile: ($expected_hmmfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_hmmfile") || die "ERROR: test_make_hmmfile: cannot open $expected_hmmfile\n"; print EXPECTED "HMMER2.0 [2.3.2]\n"; print EXPECTED "NAME TF101001.0\n"; @@ -7364,8 +7522,8 @@ sub test_make_hmmfile system "rm -f $expected_hmmfile"; # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_make_hmmfile: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; # LINE 1 print INPUT_HMMS "NAME TF350597.0\n"; @@ -7386,8 +7544,8 @@ sub test_make_hmmfile ($hmmfile,$errorcode,$errormsg) = &make_hmmfile($outputdir,$input_hmms,1,5); if ($errorcode != 0) { print STDERR "ERROR: test_make_hmmfile: failed test3\n"; exit;} # OPEN A FILE CONTAINING THE EXPECTED CONTENTS OF $hmmfile: ($expected_hmmfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_hmmfile") || die "ERROR: test_make_hmmfile: cannot open $expected_hmmfile\n"; print EXPECTED "HMMER2.0 [2.3.2]\n"; print EXPECTED "NAME TF350597.0\n"; @@ -7410,8 +7568,8 @@ sub test_make_hmmfile system "rm -f $expected_hmmfile"; # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_make_hmmfile: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF350597.0\n"; @@ -7449,12 +7607,11 @@ sub make_hmmfile my $hmmfile; ## FILE WITH HMM FOR THE FAMILY my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR my $hmm_length; ## LENGTH OF THE HMM FOR FAMILY IN $input_hmms # OPEN AN OUTPUT FILE FOR THE HMM FROM FAMILY $family: ($hmmfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # CHECK $hmm_start_pos AND $hmm_end_pos ARE NUMBERS: if ($hmm_start_pos eq 'none' || $hmm_end_pos eq 'none') @@ -7488,11 +7645,10 @@ sub test_update_genewise_output my $differences; ## DIFFERENCES BETWEEN $genewise_output2 AND $expected_genewise_output2 my $length_differences; ## LENGTH OF $differences my $line; ## # OPEN A FILE WITH GENEWISE OUTPUT: ($genewise_output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(GENEWISE_OUTPUT,">$genewise_output") || die "ERROR: test_update_genewise_output: cannot open $genewise_output\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise match 2 4 -19.50 + . scaffold1-genewise-prediction-1\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise cds 2 4 0.00 + 0 scaffold1-genewise-prediction-1\n"; @@ -7507,17 +7663,17 @@ sub test_update_genewise_output ($genewise_output2,$errorcode,$errormsg) = &update_genewise_output($genewise_output,$outputdir,"scaffold1",'TF101000'); if ($errorcode != 0) { print STDERR "ERROR: test_update_genewise_output: failed test1\n"; exit;} # OPEN A FILE WITH THE EXPECTED OUTPUT: ($expected_genewise_output2,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_genewise_output2") || die "ERROR: test_update_genewise_output: cannot open $expected_genewise_output2"; print EXPECTED "scaffold1 GeneWise match 2 4 -19.50 + . ID=scaffold1-TF101000-genewise-prediction-1\n"; print EXPECTED "scaffold1 GeneWise cds 2 4 0.00 + 0 ID=scaffold1-TF101000-genewise-prediction-1\n"; print EXPECTED "scaffold1 GeneWise match 7 24 0.91 + . ID=scaffold1-TF101000-genewise-prediction-2\n"; print EXPECTED "scaffold1 GeneWise cds 7 24 0.00 + 0 ID=scaffold1-TF101000-genewise-prediction-2\n"; print EXPECTED "scaffold1 GeneWise match 2 4 -19.50 + . ID=scaffold1-TF101000-genewise-prediction-3\n"; print EXPECTED "scaffold1 GeneWise cds 2 4 0.00 + 0 ID=scaffold1-TF101000-genewise-prediction-3\n"; print EXPECTED "scaffold1 GeneWise match 7 24 0.91 + . ID=scaffold1-TF101000-genewise-prediction-4\n"; print EXPECTED "scaffold1 GeneWise cds 7 24 0.00 + 0 ID=scaffold1-TF101000-genewise-prediction-4\n"; close(EXPECTED); $differences = ""; open(TEMP,"diff $genewise_output2 $expected_genewise_output2 |"); @@ -7534,8 +7690,8 @@ sub test_update_genewise_output system "rm -f $expected_genewise_output2"; # OPEN A FILE WITH GENEWISE OUTPUT: ($genewise_output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(GENEWISE_OUTPUT,">$genewise_output") || die "ERROR: test_update_genewise_output: cannot open $genewise_output\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise match 2 4 -19.50 + . scaffold1-genewise-prediction-1\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise cds 2 4 0.00 + 0 scaffold1-genewise-prediction-1\n"; @@ -7553,8 +7709,8 @@ sub test_update_genewise_output system "rm -f $genewise_output2"; # OPEN A FILE WITH GENEWISE OUTPUT: ($genewise_output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(GENEWISE_OUTPUT,">$genewise_output") || die "ERROR: test_update_genewise_output: cannot open $genewise_output\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise match 2 4 -19.50 + . scaffold1-genewise-prediction-1\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise cds 2 4 0.00 + 0 scaffold1-genewise-prediction-1\n"; @@ -7571,7 +7727,50 @@ sub test_update_genewise_output if ($errorcode != 30) { print STDERR "ERROR: test_update_genewise_output: failed test3\n"; exit;} system "rm -f $genewise_output"; system "rm -f $genewise_output2"; # OPEN A FILE WITH GENEWISE OUTPUT: ($genewise_output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(GENEWISE_OUTPUT,">$genewise_output") || die "ERROR: test_update_genewise_output: cannot open $genewise_output\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise match 4 2 -19.50 - . scaffold1-genewise-prediction-1\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise cds 4 2 0.00 - 0 scaffold1-genewise-prediction-1\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise match 24 7 0.91 - . scaffold1-genewise-prediction-2\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise cds 24 7 0.00 - 0 scaffold1-genewise-prediction-2\n"; print GENEWISE_OUTPUT "//\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise match 2 4 -19.50 + . scaffold1-genewise-prediction-1\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise cds 2 4 0.00 + 0 scaffold1-genewise-prediction-1\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise match 7 24 0.91 + . scaffold1-genewise-prediction-2\n"; print GENEWISE_OUTPUT "scaffold1 GeneWise cds 7 24 0.00 + 0 scaffold1-genewise-prediction-2\n"; close(GENEWISE_OUTPUT); ($genewise_output2,$errorcode,$errormsg) = &update_genewise_output($genewise_output,$outputdir,"scaffold1",'TF101000'); if ($errorcode != 0) { print STDERR "ERROR: test_update_genewise_output: failed test4\n"; exit;} # OPEN A FILE WITH THE EXPECTED OUTPUT: ($expected_genewise_output2,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_genewise_output2") || die "ERROR: test_update_genewise_output: cannot open $expected_genewise_output2"; print EXPECTED "scaffold1 GeneWise match 2 4 -19.50 - . ID=scaffold1-TF101000-genewise-prediction-1\n"; print EXPECTED "scaffold1 GeneWise cds 2 4 0.00 - 0 ID=scaffold1-TF101000-genewise-prediction-1\n"; print EXPECTED "scaffold1 GeneWise match 7 24 0.91 - . ID=scaffold1-TF101000-genewise-prediction-2\n"; print EXPECTED "scaffold1 GeneWise cds 7 24 0.00 - 0 ID=scaffold1-TF101000-genewise-prediction-2\n"; print EXPECTED "scaffold1 GeneWise match 2 4 -19.50 + . ID=scaffold1-TF101000-genewise-prediction-3\n"; print EXPECTED "scaffold1 GeneWise cds 2 4 0.00 + 0 ID=scaffold1-TF101000-genewise-prediction-3\n"; print EXPECTED "scaffold1 GeneWise match 7 24 0.91 + . ID=scaffold1-TF101000-genewise-prediction-4\n"; print EXPECTED "scaffold1 GeneWise cds 7 24 0.00 + 0 ID=scaffold1-TF101000-genewise-prediction-4\n"; close(EXPECTED); $differences = ""; open(TEMP,"diff $genewise_output2 $expected_genewise_output2 |"); while(<TEMP>) { $line = $_; $differences = $differences.$line; } close(TEMP); $length_differences = length($differences); if ($length_differences != 0) { print STDERR "ERROR: test_update_genewise_output: failed test4 (files $genewise_output2 $expected_genewise_output2)\n"; exit;} system "rm -f $genewise_output"; system "rm -f $genewise_output2"; system "rm -f $expected_genewise_output2"; } #------------------------------------------------------------------# @@ -7587,21 +7786,22 @@ sub update_genewise_output my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR my $genewise_output2 = "none";## FILE WITH GENEWISE OUTPUT IN SCAFFOLD COORDAINTES my $line; ## my @temp; ## my $start; ## START OF FEATURE IN GENEWISE OUTPUT FILE my $end; ## END OF FEATURE IN GENEWISE OUTPUT FILE my $strand; ## STRAND OF FEATURE IN GENEWISE OUTPUT FILE my $name; ## NAME OF FEATURE IN GENEWISE OUTPUT FILE my $seqname; ## NAME OF SEQUENCE USED FOR RUNNING GENEWISE my $new_name; ## NEW NAME OF THE FEATURE, USING THE SCAFFOLD NAME my $num_predictions = 0; ## NUMBER OF GENE PREDICTIONS SEEN IN A SCAFFOLD my %SEEN = (); ## HASH TABLE TO RECORD WHETHER WE HAVE SEEN A PARTICULAR GENE PREDICTION BEFORE my $prev_line; ## PREVIOUS LINE OF $genewise_output my $tmp; ## # MAKE A TEMPORARY FILE TO PUT THE GENEWISE OUTPUT WITH SCAFFOLD COORDINATES INTO: ($genewise_output2,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(GENEWISE_OUTPUT2,">$genewise_output2") || die "ERROR: update_genewise_output: cannot open $genewise_output2\n"; # READ IN THE INPUT FILE OF GENEWISE OUTPUT: @@ -7617,6 +7817,7 @@ sub update_genewise_output $seqname = $temp[0]; $start = $temp[3]; $end = $temp[4]; $strand = $temp[6]; $name = $temp[8]; if ($seqname ne $scaffold) { @@ -7630,9 +7831,17 @@ sub update_genewise_output $SEEN{$name} = 1; $num_predictions++; } # FOR MAKER, YOU NEED START < END, EVEN IF IT IS A NEGATIVE STRAND GENE: if ($end < $start) { $tmp = $start; $start = $end; $end = $tmp; } # FOR MAKER, YOU NEED TO START THE FINAL COLUMN WITH ID=scaffold... $new_name = "ID=".$scaffold."-".$family."-genewise-prediction-".$num_predictions; # PRINT OUT TO $genewise_output2: print GENEWISE_OUTPUT2 "$scaffold\t$temp[1]\t$temp[2]\t$start\t$end\t$temp[5]\t$temp[6]\t$temp[7]\t$new_name\n"; } elsif ($#temp > 0 && $line ne '//') { @@ -7660,15 +7869,14 @@ sub test_check_if_have_hmm_for_family my $take; # SAYS WHETHER TO TAKE A PARTICULAR FAMILY my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR my $hmm_start_pos; # START POSITION OF A HMM IN $input_hmms my $hmm_end_pos; # END POSITION OF A HMM IN $input_hmms my $hmm_pos_file; # FILE WITH POSITIONS OF HMMs IN $input_hmms my $TAKE_FAMILY; # HASH TABLE THAT SAYS WHETHER TO TAKE A FAMILY # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_check_if_have_hmm_for_family: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF350597.0\n"; @@ -7696,8 +7904,8 @@ sub test_check_if_have_hmm_for_family system "rm -f $hmm_pos_file"; # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_check_if_have_hmm_for_family: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF350597.0\n"; @@ -7725,8 +7933,8 @@ sub test_check_if_have_hmm_for_family system "rm -f $hmm_pos_file"; # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_check_if_have_hmm_for_family: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF350597.0\n"; @@ -7754,8 +7962,8 @@ sub test_check_if_have_hmm_for_family system "rm -f $hmm_pos_file"; # WRITE THE INPUT FILE OF HMMS: ($input_hmms,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_HMMS,">$input_hmms") || die "ERROR: test_check_if_have_hmm_for_family: cannot open $input_hmms\n"; print INPUT_HMMS "HMMER2.0 [2.3.2]\n"; print INPUT_HMMS "NAME TF350597.0\n"; @@ -7791,7 +7999,6 @@ sub test_check_if_have_hmm_for_family sub test_make_seqfile { my $outputdir = $_[0]; # DIRECTORY WITH OUTPUT FILES my $input_fasta; # INPUT FASTA FILE OF SCAFFOLD SEQUENCES my $seqfile; # FILE WITH SEQUENE OF A PARTICULAR SCAFFOLD my $scaffold_length; # LENGTH OF A PARTICULAR SCAFFOLD @@ -7804,8 +8011,8 @@ sub test_make_seqfile my $line; # # OPEN AN INPUT FASTA FILE OF SCAFFOLD SEQUENCES: ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_FASTA,">$input_fasta") || die "ERROR: test_make_seqfile: cannot open $input_fasta\n"; print INPUT_FASTA ">scaffold1\n"; print INPUT_FASTA "ABCDEFGHIJ\n"; @@ -7820,8 +8027,8 @@ sub test_make_seqfile ($seqfile,$scaffold_length,$errorcode,$errormsg) = &make_seqfile($input_fasta,$outputdir,"scaffold1",$scaffold_pos_file); if ($errorcode != 0 || $scaffold_length != 20) { print STDERR "ERROR: test_make_seqfile: failed test1 (errorcode $errorcode scaffold_length $scaffold_length\n"; exit;} # OPEN A FILE CONTAINING THE EXPECTED CONTENTS OF $seqfile: ($expected_seqfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_seqfile") || die "ERROR: test_make_seqfile: cannot open $expected_seqfile\n"; print EXPECTED ">scaffold1\n"; print EXPECTED "ABCDEFGHIJ\n"; @@ -7843,8 +8050,8 @@ sub test_make_seqfile system "rm -f $scaffold_pos_file"; # OPEN AN INPUT FASTA FILE OF SCAFFOLD SEQUENCES: ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_FASTA,">$input_fasta") || die "ERROR: test_make_seqfile: cannot open $input_fasta\n"; print INPUT_FASTA ">scaffold1\n"; print INPUT_FASTA "ABCDEFGHIJ\n"; @@ -7859,8 +8066,8 @@ sub test_make_seqfile ($seqfile,$scaffold_length,$errorcode,$errormsg) = &make_seqfile($input_fasta,$outputdir,"scaffold2",$scaffold_pos_file); if ($errorcode != 0 || $scaffold_length != 5) { print STDERR "ERROR: test_make_seqfile: failed test2\n"; exit;} # OPEN A FILE CONTAINING THE EXPECTED CONTENTS OF $seqfile: ($expected_seqfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(EXPECTED,">$expected_seqfile") || die "ERROR: test_make_seqfile: cannot open $expected_seqfile\n"; print EXPECTED ">scaffold2\n"; print EXPECTED "UVXYZ\n"; @@ -7881,8 +8088,8 @@ sub test_make_seqfile system "rm -f $scaffold_pos_file"; # OPEN AN INPUT FASTA FILE OF SCAFFOLD SEQUENCES: ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_FASTA,">$input_fasta") || die "ERROR: test_make_seqfile: cannot open $input_fasta\n"; print INPUT_FASTA ">scaffold1\n"; print INPUT_FASTA "ABCDEFGHIJ\n"; @@ -7891,8 +8098,8 @@ sub test_make_seqfile print INPUT_FASTA "UVXYZ\n"; close(INPUT_FASTA); # RECORD THE POSITION OF THE SCAFFOLDS IN $input_fasta: ($scaffold_pos_file,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(SCAFFOLD_POS,">$scaffold_pos_file") || die "ERROR: test_make_seqfile: cannot open $scaffold_pos_file"; print SCAFFOLD_POS "scaffold1 3\n"; print SCAFFOLD_POS "scaffold2 5\n"; @@ -7906,8 +8113,8 @@ sub test_make_seqfile system "rm -f $scaffold_pos_file"; # OPEN AN INPUT FASTA FILE OF SCAFFOLD SEQUENCES: ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_FASTA,">$input_fasta") || die "ERROR: test_make_seqfile: cannot open $input_fasta\n"; print INPUT_FASTA ">scaffold1\n"; print INPUT_FASTA "ABCDEFGHIJ\n"; @@ -7916,8 +8123,8 @@ sub test_make_seqfile print INPUT_FASTA "UVXYZ\n"; close(INPUT_FASTA); # RECORD THE POSITION OF THE SCAFFOLDS IN $input_fasta: ($scaffold_pos_file,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(SCAFFOLD_POS,">$scaffold_pos_file") || die "ERROR: test_make_seqfile: cannot open $scaffold_pos_file"; print SCAFFOLD_POS "scaffold2 5\n"; print SCAFFOLD_POS "scaffold3 3\n"; @@ -7954,7 +8161,6 @@ sub make_seqfile my $scaffold_end_pos = "none";# END POSITION OF $scaffold IN $input_fasta my $line; # my $line_num = "none";# LINE NUMBER THAT $scaffold APPEARS ON IN $scaffold_pos_file my $i; # my $scaffold_length_lines; # NUMBER OF LINES TAKEN UP BY $scaffold IN $input_fasta @@ -8040,8 +8246,8 @@ sub make_seqfile } # OPEN AN OUTPUT FILE FOR THE SEQUENCE OF SCAFFOLD $scaffold: ($seqfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # CHECK $scaffold_start_pos AND $scaffold_end_pos ARE NUMBERS: if ($scaffold_start_pos eq 'none' || $scaffold_end_pos eq 'none') @@ -8189,14 +8395,13 @@ sub check_if_have_hmm_for_family sub test_check_formatdb_files { my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN my $input_fasta; # INPUT FASTA FILE my $cmd; # COMMAND TO RUN my $formatdbfile; # OUTPUT FILE MADE BY FORMATDB # WRITE AN INPUT FASTA FILE: ($input_fasta,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(INPUT_FASTA,">$input_fasta") || die "ERROR: test_check_formatdb_file: cannot open $input_fasta\n"; print INPUT_FASTA ">V:15395346..15397430\n"; print INPUT_FASTA "gaatgttattttcttatcaataagcacattttctatcaaacccagttcactttcaaattt\n"; @@ -8354,7 +8559,6 @@ sub run_genewise_on_scaffolds my @temp; # my $family = "none";# TREEFAM FAMILY NAME my $seqfile; # FILE CONTAINING THE SEQUENCES FOR A FAMILY my $BLAST; # HASH TABLE WITH POSITIONS OF BLAST MATCHES IN SCAFFOLDS my $hit; # REGION OF A BLAST HIT BETWEEN THE FAMILY AND A SCAFFOLD my $scaffold; # A SCAFFOLD SEQUENCE @@ -8379,15 +8583,15 @@ sub run_genewise_on_scaffolds # OPEN THE OUTPUT FILE FOR THE GENEWISE OUTPUT: $output = $outputdir."/".$output; open(OUTPUT,">$output") || die "ERROR: run_genewise_on_scaffolds: cannot open output $output\n"; close(OUTPUT); # CHECK IF THE FORMATDB FILES ARE COMPLETE: ($errorcode,$errormsg) = &check_formatdb_files($input_fasta); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } # READ IN THE SEQUENCES IN EACH FAMILY, FAMILY BY FAMILY: open(TREEFAM_SEQS,"$treefam_seqs") || die "ERROR: run_genewise_on_scaffolds: cannot open treefam_seqs $treefam_seqs\n"; while(<TREEFAM_SEQS>) { $line = $_; @@ -8403,9 +8607,9 @@ sub run_genewise_on_scaffolds if ($take == 1) { # OPEN A TEMPORARY FILE FOR THE SEQUENCES FOR THIS FAMILY: ($seqfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(SEQFILE,">$seqfile") || die "ERROR: run_genewise_on_scaffolds: cannot open seqfile $seqfile\n"; # WRITE THE FAMILY NAME IN THE OUTPUT FILE: system "echo $family >> $output"; } @@ -8521,12 +8725,11 @@ sub test_check_for_blast_match my $seqfile; # FILE WITH THE PROTEIN SEQUENCES my $scaffolds; # FILE WITH THE DNA SEQUENCES OF THE SCAFFOLDS my $BLAST; # HASH TABLE WITH POSITIONS OF HITS IN SCAFFOLDS my $formatdbfile; # FORMAT FILE PRODUCED BY formatdb # MAKE A FILE WITH THE PROTEIN SEQUENCES: ($seqfile,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(SEQFILE,">$seqfile") || die "ERROR: test_check_for_blast_match: cannot open $seqfile\n"; print SEQFILE ">FOX-1\n"; print SEQFILE "MQALYQLSATGAQQQNQQIPIGLSNSLLYQQLAAHQQIAAQQHQQQLAVSAAHQTQNNIMLATSAPSLINHMENSTDGKV\n"; @@ -8537,8 +8740,8 @@ sub test_check_for_blast_match print SEQFILE "QMNPALRTLNRFTPY\n"; close(SEQFILE); # MAKE A FILE WITH THE DNA SEQUENCES: ($scaffolds,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(SCAFFOLDS,">$scaffolds") || die "ERROR: test_check_for_blast_match: cannot open $scaffolds\n"; print SCAFFOLDS ">X:2445317..2450316\n"; print SCAFFOLDS "ttttcctgtaaagcaaagcattcttgctcattcaccacaatgccacattattgtgcttct\n"; @@ -8682,7 +8885,6 @@ sub check_for_blast_match my $flank_length = $_[4]; ## LENGTH OF SEQUENCE TO TAKE ON EITHER SIDE OF A BLAST MATCH my $blast_path = $_[5]; ## PATH TO THE FORMATDB AND BLASTALL EXECUTABLES my $testing = $_[6]; ## SAYS WHETHER THIS IS BEING CALLED FROM A TESTING SUBROUTINE my $output; ## THE BLAST OUTPUT FILE my @temp; ## my $line; ## @@ -8703,8 +8905,8 @@ sub check_for_blast_match # RUN BLAST BETWEEN THE PROTEIN SEQUENCES AND THE SCAFFOLDS: # -m 8 GIVES TABULAR BLAST FORMAT ($output,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } $cmd = "$blast_path/blastall -p tblastn -i $seqfile -d $scaffolds -o $output -m 8 -e $evalue_cutoff\n"; ## system "$cmd"; @@ -8916,7 +9118,6 @@ sub run_genewise_on_hmm my $scaffold_length = $_[9]; ## LENGTH OF THE SCAFFOLD my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR. my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR. my $genewise = "none";## TEMPORARY FILE FOR THE GENEWISE OUTPUT FOR THIS HMM my $cmd; ## THE GENEWISE COMMAND TO RUN my $genewise_exe = "/nfs/users/nfs_a/alc/Documents/GeneWise/wise2.4.1/src/bin/genewise"; # wise 2.4.1, can use -genestats @@ -8938,8 +9139,8 @@ sub run_genewise_on_hmm } # MAKE A GENEWISE OUTPUT FILE: ($genewise,$errorcode,$errormsg) = &make_filename($outputdir); if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } open(GENEWISE,">$genewise") || die "ERROR: run_genewise_on_hmm: cannot open $genewise\n"; close(GENEWISE); if ($testing == 0) { print STDERR "___ scaffold $scaffoldname: running genewise for HMM $hmmname ...\n";} -
avrilcoghlan revised this gist
Nov 20, 2012 . No changes.There are no files selected for viewing
-
avrilcoghlan revised this gist
Nov 20, 2012 . 1 changed file with 6696 additions and 803 deletions.There are no files selected for viewing
-
avrilcoghlan created this gist
Nov 15, 2012 .There are no files selected for viewing