#!/usr/bin/env perl =head1 NAME bam2circos.pl - Convert BAM file to Circos plot =head1 SYNOPSIS bam2circos.pl [optional] -b FILE Required: -b|bam BAM file Optional: -k|karyotype Karyotype file -d|depth FILE Depth file -s|slidingpid FILE Sliding window percent identity file -w|window INT Window size (default: 50) for sliding window pid calculation -o|overlap INT Overlap size (default: 10) for sliding window pid calculation -a|annotations FILE Annotation file --keep Keep the generated circos configuration and text files -circoslog FILE Log circos outputs to FILE rather than std outputs -t|test Generate circos configuration files but don't execute circos -h|help Full documentation =head1 OPTIONS =over 4 =item B<-b|bam> BAM file containing aligments to one or more reference sequences. Reference IDs in RNAME column must match those found in the karyotype, annotation, depth and slidingpid files. =item B<-k|karyotype> The karyotype file needs to conform to the Circos karyotype file format: http://circos.ca/tutorials/lessons/ideograms/karyotypes/. For most viruses, the file will contain one row to define the full length of the viral genome. You can add rows to define segmented viruses. The 3rd column (ID) must match the RNAME column (reference column) of the BAM file Example: Bunyaviridae have 3 segments chr - NC_014397 L 0 6404 blue chr - NC_014396 M 0 3885 chr - NC_014395 S 0 1690 =item B<-a|annot> The annotation file needs to conform to the Circos text track file format: http://circos.ca/documentation/tutorials/2d_tracks/text_1/ Example (two genes named FOO and BAR) NC_001669 450 650 FOO NC_001669 600 1200 BAR =item B<-d|depth> The file needs to conform to the Circos data file format: http://circos.ca/documentation/tutorials/2d_tracks/histograms/ Example (base 1 has depth of 5 and base 2 depth of 10) NC_001669 0 1 5 NC_001669 1 2 10 =item B<-s|slidingpid> The file needs to conform to the Circos data file format above except that value of the fourth column is the sliding window percent identity Example (50bp sliding window where 1st 50bp is 88.5%, 2nd 50bp is 93.4%) NC_001669 0 50 88.5 NC_001669 51 100 93.4 =item B<-help> Print full documentation and exits. =back =head1 DESCRIPTION B will do blah blah blah. =head1 REQUIREMENTS =over 4 =item circos =item samtools =item awk =back =cut use strict; use warnings; use Getopt::Long; use Pod::Usage; use List::Util qw/sum max/; use POSIX; my %conf = (circos => "circos.conf", ); my ($bam, $depth, $pid, $annot, $karyo); my ($window, $overlap) = (50, 10); my ($help, $man, $keep, $test) = (0, 0, 0, 0); my ($circoslog) = "-"; my ($baselen) = 10000; # 10KB GetOptions ('b|bam=s' => \$bam, 'd|depth=s' => \$depth, 's|slidingpid=s' => \$pid, 'w|window=i' => \$window, 'o|overlap=i' => \$overlap, 'a|annotations=s'=> \$annot, 'k|karyotype=s' => \$karyo, 'h|help' => \$help, 't|test' => \$test, 'l|circoslog=s' => \$circoslog, 'n|baselen=i' => \$baselen, 'keep' => \$keep, ) || pod2usage (-verbose => 0); pod2usage(-verbose => 2) if ($help); pod2usage(-verbose => 1) if (!$bam); initialize (\%conf, $karyo, $annot, $pid, $depth, $bam, $window, $overlap, $baselen); run(\%conf) if ($test == 0); cleanup (\%conf, $keep); exit 0; ################################################################################ sub initialize { my ($conf, $karyotype, $annot, $pid, $depth, $bam, $window, $overlap, $baselen) = @_; my $karyotemp = "temp.karyo.txt"; $karyotype //= $karyotemp; if ($karyotype eq $karyotemp) { create_karyo_file ($karyotemp, $bam); } my $annottemp = "temp.annot.txt"; $annot //= $annottemp; if ($annot eq $annottemp) { create_annot_file ($annottemp, $karyotype); } my $depthtemp = "temp.depth.txt"; $depth //= $depthtemp; if ($depth eq $depthtemp) { create_depth_file ($depthtemp, $bam, $baselen, $karyotype); } my $slidingpidtemp = "temp.slidingpid.txt"; $pid //= $slidingpidtemp; if ($pid eq $slidingpidtemp) { create_slidingpid_file($slidingpidtemp, $window, $overlap, $bam, $karyotype, $baselen); } my $text = circos_conf($karyotype, $annot, $depth, $pid, $baselen); writefile($text,$conf->{circos}); } sub run { my ($conf) = @_; my $circosconf = $conf->{circos}; my $command = "circos -conf $circosconf"; #log to the file if not a console $command = "$command > $circoslog 2>&1" if ($circoslog ne "-"); print "Running: $command\n"; system("$command")==0 or die "Call to circos failed"; } sub cleanup { my ($conf, $keep) = @_; if ($keep) { print STDERR "Not deleting generated circos conf files\n"; } else { foreach my $conffile (%$conf) { unlink ($conffile); } unlink ("temp.karyo.txt", "temp.annot.txt", "temp.depth.txt", "temp.slidingpid.txt"); } } sub create_karyo_file { my ($karyofile, $bam) = @_; open (my $bam_in, "samtools view -H $bam | ") or die ("Can't open pipe from samtools view of a bam file: $!\n"); open (my $kout, ">", $karyofile) or die ("Can't open $karyofile for writing: $!\n"); my $segnum = 0; while (<$bam_in>) { if ( /^\@SQ\s+SN:([^\s]+)\s+LN:([0-9]+)/ ) { my $name = $1; my $len = $2; my $color_num = 1 + ($segnum++ % 23); # coloring used by circos. They map human chromosome numbers to colors. print $kout "chr - $name $name 0 $len chr$color_num\n"; } } close $kout; close $bam_in; } sub create_slidingpid_file { my ($slidingpidfile, $window, $overlap, $bam, $karyotype, $baselen, $random) = @_; my %vlen; my %scale; open (my $kin, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); while (my $line = <$kin>) { my (undef, undef, $id, $name, $start, $end) = split (/ /, $line); $vlen{$id} = $end; $scale{$id} = floor($vlen{$id}/$baselen); $scale{$id} = 1 if ($scale{$id} == 0); } close $kin; my $text = ""; if ($random) { # calculate a random window percent identity # my $w = 50; # my $iter = $length / $w; # my $s = 0; # my $e = $w - 1; # for (my $i = 0; $i < $iter; $i++) { # my $line_s = $s + $w * $i; # my $line_e = $e + $w * $i; # my $x = 75 + int(rand(100 - 75)); # $text .= "$id $line_s $line_e $x\n"; # } } else { # calculate sliding window percent identity print "Running: samtools view $bam | \n"; open (my $bam_in, "samtools view $bam | ") or die ("Can't open pipe from samtools view of a bam file: $!\n"); my $bases; # 0th index in array will not be used. while (<$bam_in>) { my ($q, $f, $r, $p, $m, $c, undef, undef, undef, $seq, undef, @rest) = split(/\t/, $_); my ($nm) = $_ =~ /NM:i:(\d+)/; my $rlen = length($seq); my $avg_nm_read = ($rlen - $nm) / $rlen; my $end = $rlen + $p - 1; for (; $p <= $end; $p++) { push (@{$bases->{$r}->[$p]}, $avg_nm_read); } } # calculate the mean of the average NM / base values for each base foreach my $id (keys %$bases) { for (my $i = 1; $i <= $#{$bases->{$id}}; $i++) { my $mean=0; if (defined $bases->{$id}->[$i]){ $mean=mean(@{ $bases->{$id}->[$i] }); #print "$i $mean",$/; } $bases->{$id}->[$i] = $mean; } } foreach my $id (keys %$bases) { next if( !exists $vlen{$id} ); my $contig_length = $vlen{$id}; my $align_len_max = $#{$bases->{$id}}; my $w = $window * $scale{$id}; my $o = $overlap * $scale{$id}; # The last aligned read could before the genome ends # Fill bases with 0 beyon last aligned antry so window does not hit any undefined values . for(my $i=$align_len_max+1; $i<=$align_len_max + ($w - 1); $i++){ $bases->{$id}->[$i] = 0; } for (my $i = 1; $i <= $align_len_max - ($w - 1) && $i <= $contig_length - ($w - 1); $i += ($w - $o) ) { my $end = $i + ($w - 1); my $m=mean(@{ $bases->{$id} }[$i .. $end]); next if ($m==0.0); $text .= "$id $i $end " . 100 * $m . "\n"; } } } open (my $sout, ">", $slidingpidfile) or die ("Can't open $slidingpidfile for writing: $!\n"); print $sout $text; close $sout; } sub create_annot_file { my ($annotfile, $karyotype) = @_; # get ID, start and end for each chromosome and create an annotation for each chromosome open (my $kin, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); open (my $aout, ">", $annotfile) or die ("Can't open $annotfile for writing: $!\n"); while (my $line = <$kin>) { my (undef, undef, $id, $name, $start, $end) = split (/ /, $line); print $aout "$id $start $end $id\n"; } close $kin; close $aout; } sub create_depth_file { my ($depthfile, $bam, $baselen, $karyotype) = @_; my $awk = q{awk -F "\t" '{print $1,$2-1,$2,$3}'}; my $command = join("", "samtools depth $bam | ", $awk, " > $depthfile"); print "Running: $command\n"; system($command)==0 or die "Command failed: $command"; # get lengths of chromosomes and calculate scale for each open (my $KARYO, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); my %scale; while (<$KARYO>) { my (undef, undef, $id, undef, undef, $length) = split(/ /, $_); $scale{$id} = floor($length/$baselen); $scale{$id} = 1 if ($scale{$id} == 0); } close ($KARYO); # read depth file and bin based on ID open (my $DF, "<", $depthfile) or die ("Can't open $depthfile for reading: $!\n"); my %depth; while (<$DF>) { chomp; foreach my $id (keys %scale) { if (/^$id/) { push ( @{ $depth{$id} }, $_ ); } } } close ($DF); open (my $OUT, ">", $depthfile) or die ("Can't open $depthfile for writing: $!\n"); foreach my $id (keys %depth) { my @depth = (); my ($s, $e); my $lineno = 0; foreach ( @{ $depth{$id} } ) { my $d; ($id, $s, $e, $d) = split(/ /, $_); push (@depth, $d); if (++$lineno % $scale{$id} == 0) { my $avgd = mean(@depth); @depth = (); print $OUT "$id $s $e $avgd\n"; } } # there will be remaining depth values to be averaged if number of depth lines for ID is not a factor of SCALE if (@depth) { my $avgd = mean(@depth); @depth = (); print $OUT "$id $s $e $avgd\n"; } } close ($OUT); } sub circos_conf { my ($karyotype, $annot, $depth, $pid, $baselen) = @_; my $ticks_text = ticks_conf($karyotype, $baselen); my $ideogram_text= ideogram_conf(); my $depth_text= depth_conf($depth); my $slidingpid_text = ""; $slidingpid_text = slidingpid_conf($pid) if ($pid); my $text = <> $ticks_text $ideogram_text karyotype = ./$karyotype <> <> #chromosomes_units = 1000 #chromosomes_display_default = yes ############ ### ORFS ### type = tile file = $annot r1 = 1r r0 = 0.85r layers = 3 margin = 0.01u thickness = 50 padding = 4 layers_overflow = hide orientation = out stroke_thickness = 1 stroke_color = grey color = vlorange # # #condition = var(size) < 1kb #color = lgrey # # type = text file = $annot r1 = 1r r0 = 0.85r show_links = yes link_dims = 2p,2p,2p,2p,2p link_thickness = 4p link_color = red #label_rotate = no label_parallel = yes label_size = 30p label_font = bold ################ ### DEPTH ### $depth_text ################### ### SLIDING PID ### $slidingpid_text HEREDOC return $text; } sub ideogram_conf { my $text = < default = 0.01r break = 0.5r # ideogram position, fill and outline radius = 0.9r thickness = 50p # default = 20p fill = yes stroke_color = dgrey stroke_thickness = 2p # minimum definition for ideogram labels show_label = yes label_font = default label_radius = 1r + 75p label_size = 50 label_parallel = yes HEREDOC return $text; } sub ticks_conf { my ($karyotype, $baselen) = @_; open (my $KARYO, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); my @lengths = (); while (<$KARYO>) { my (undef, undef, $id, undef, undef, $length) = split(/ /, $_); push (@lengths, $length); } close ($KARYO); my $maxl = max (@lengths); my $scale = 1; if ($maxl < $baselen) { $scale = 1; } elsif ($maxl < $baselen * 10) { $scale = 10; } elsif ($maxl < $baselen * 100) { $scale = 100; } else { $scale = 1000;} my $small_spacing = 100 * $scale . "u"; my $large_spacing = 1000 * $scale . "u"; my $text = < radius = 1r color = black thickness = 2p # the tick label is derived by multiplying the tick position # by 'multiplier' and casting it in 'format': # # sprintf(format,position*multiplier) multiplier = 1 # %d - integer # %f - float # %.1f - float with one decimal # %.2f - float with two decimals # # for other formats, see http://perldoc.perl.org/functions/sprintf.html format = %d spacing = $small_spacing size = 10p spacing = $large_spacing size = 25p show_label = yes label_size = 24p label_offset = 8p format = %d HEREDOC return $text; } sub depth_conf { my ($depthfile) = @_; my $text = < color = lgrey thickness = 1 spacing = 0.1r HEREDOC return $text; } sub writefile { my ($text, $outfile) = @_; open (my $out, ">", $outfile) or die ("Can't open $outfile for writing: $!\n"); print $out $text; print STDERR "Wrote $outfile file\n"; close ($out); } sub mean { if (@_) { return sum(@_)/@_; } else { return 0; } }