-
-
Save AswinSSoman/454fd7ef0cdb6ad9db52e4045a39b06e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| =head1 NAME | |
| bam2circos.pl - Convert BAM file to Circos plot | |
| =head1 SYNOPSIS | |
| bam2circos.pl [optional] -b FILE | |
| Required: | |
| -b|bam BAM file | |
| Optional: | |
| -k|karyotype Karyotype file | |
| -d|depth FILE Depth file | |
| -s|slidingpid FILE Sliding window percent identity file | |
| -w|window INT Window size (default: 50) for sliding window pid calculation | |
| -o|overlap INT Overlap size (default: 10) for sliding window pid calculation | |
| -a|annotations FILE Annotation file | |
| --keep Keep the generated circos configuration and text files | |
| -circoslog FILE Log circos outputs to FILE rather than std outputs | |
| -t|test Generate circos configuration files but don't execute circos | |
| -h|help Full documentation | |
| =head1 OPTIONS | |
| =over 4 | |
| =item B<-b|bam> | |
| BAM file containing aligments to one or more reference sequences. | |
| Reference IDs in RNAME column must match those found in the karyotype, | |
| annotation, depth and slidingpid files. | |
| =item B<-k|karyotype> | |
| The karyotype file needs to conform to the Circos karyotype file format: | |
| http://circos.ca/tutorials/lessons/ideograms/karyotypes/. For most | |
| viruses, the file will contain one row to define the full length of the | |
| viral genome. You can add rows to define segmented viruses. The 3rd | |
| column (ID) must match the RNAME column (reference column) of the BAM | |
| file | |
| Example: Bunyaviridae have 3 segments | |
| chr - NC_014397 L 0 6404 blue | |
| chr - NC_014396 M 0 3885 | |
| chr - NC_014395 S 0 1690 | |
| =item B<-a|annot> | |
| The annotation file needs to conform to the Circos text track file | |
| format: http://circos.ca/documentation/tutorials/2d_tracks/text_1/ | |
| Example (two genes named FOO and BAR) | |
| NC_001669 450 650 FOO | |
| NC_001669 600 1200 BAR | |
| =item B<-d|depth> | |
| The file needs to conform to the Circos data file format: | |
| http://circos.ca/documentation/tutorials/2d_tracks/histograms/ | |
| Example (base 1 has depth of 5 and base 2 depth of 10) | |
| NC_001669 0 1 5 | |
| NC_001669 1 2 10 | |
| =item B<-s|slidingpid> | |
| The file needs to conform to the Circos data file format above except | |
| that value of the fourth column is the sliding window percent identity | |
| Example (50bp sliding window where 1st 50bp is 88.5%, 2nd 50bp is 93.4%) | |
| NC_001669 0 50 88.5 | |
| NC_001669 51 100 93.4 | |
| =item B<-help> | |
| Print full documentation and exits. | |
| =back | |
| =head1 DESCRIPTION | |
| B<This program> will do blah blah blah. | |
| =head1 REQUIREMENTS | |
| =over 4 | |
| =item circos | |
| =item samtools | |
| =item awk | |
| =back | |
| =cut | |
| use strict; | |
| use warnings; | |
| use Getopt::Long; | |
| use Pod::Usage; | |
| use List::Util qw/sum max/; | |
| use POSIX; | |
| my %conf = (circos => "circos.conf", | |
| ); | |
| my ($bam, $depth, $pid, $annot, $karyo); | |
| my ($window, $overlap) = (50, 10); | |
| my ($help, $man, $keep, $test) = (0, 0, 0, 0); | |
| my ($circoslog) = "-"; | |
| my ($baselen) = 10000; # 10KB | |
| GetOptions ('b|bam=s' => \$bam, | |
| 'd|depth=s' => \$depth, | |
| 's|slidingpid=s' => \$pid, | |
| 'w|window=i' => \$window, | |
| 'o|overlap=i' => \$overlap, | |
| 'a|annotations=s'=> \$annot, | |
| 'k|karyotype=s' => \$karyo, | |
| 'h|help' => \$help, | |
| 't|test' => \$test, | |
| 'l|circoslog=s' => \$circoslog, | |
| 'n|baselen=i' => \$baselen, | |
| 'keep' => \$keep, | |
| ) || pod2usage (-verbose => 0); | |
| pod2usage(-verbose => 2) if ($help); | |
| pod2usage(-verbose => 1) if (!$bam); | |
| initialize (\%conf, $karyo, $annot, $pid, $depth, $bam, $window, $overlap, $baselen); | |
| run(\%conf) if ($test == 0); | |
| cleanup (\%conf, $keep); | |
| exit 0; | |
| ################################################################################ | |
| sub initialize { | |
| my ($conf, $karyotype, $annot, $pid, $depth, $bam, $window, $overlap, $baselen) = @_; | |
| my $karyotemp = "temp.karyo.txt"; | |
| $karyotype //= $karyotemp; | |
| if ($karyotype eq $karyotemp) { | |
| create_karyo_file ($karyotemp, $bam); | |
| } | |
| my $annottemp = "temp.annot.txt"; | |
| $annot //= $annottemp; | |
| if ($annot eq $annottemp) { | |
| create_annot_file ($annottemp, $karyotype); | |
| } | |
| my $depthtemp = "temp.depth.txt"; | |
| $depth //= $depthtemp; | |
| if ($depth eq $depthtemp) { | |
| create_depth_file ($depthtemp, $bam, $baselen, $karyotype); | |
| } | |
| my $slidingpidtemp = "temp.slidingpid.txt"; | |
| $pid //= $slidingpidtemp; | |
| if ($pid eq $slidingpidtemp) { | |
| create_slidingpid_file($slidingpidtemp, $window, $overlap, $bam, $karyotype, $baselen); | |
| } | |
| my $text = circos_conf($karyotype, $annot, $depth, $pid, $baselen); | |
| writefile($text,$conf->{circos}); | |
| } | |
| sub run { | |
| my ($conf) = @_; | |
| my $circosconf = $conf->{circos}; | |
| my $command = "circos -conf $circosconf"; | |
| #log to the file if not a console | |
| $command = "$command > $circoslog 2>&1" if ($circoslog ne "-"); | |
| print "Running: $command\n"; | |
| system("$command")==0 or die "Call to circos failed"; | |
| } | |
| sub cleanup { | |
| my ($conf, $keep) = @_; | |
| if ($keep) { | |
| print STDERR "Not deleting generated circos conf files\n"; | |
| } | |
| else { | |
| foreach my $conffile (%$conf) { | |
| unlink ($conffile); | |
| } | |
| unlink ("temp.karyo.txt", "temp.annot.txt", "temp.depth.txt", "temp.slidingpid.txt"); | |
| } | |
| } | |
| sub create_karyo_file { | |
| my ($karyofile, $bam) = @_; | |
| open (my $bam_in, "samtools view -H $bam | ") or die ("Can't open pipe from samtools view of a bam file: $!\n"); | |
| open (my $kout, ">", $karyofile) or die ("Can't open $karyofile for writing: $!\n"); | |
| my $segnum = 0; | |
| while (<$bam_in>) { | |
| if ( /^\@SQ\s+SN:([^\s]+)\s+LN:([0-9]+)/ ) { | |
| my $name = $1; | |
| my $len = $2; | |
| my $color_num = 1 + ($segnum++ % 23); # coloring used by circos. They map human chromosome numbers to colors. | |
| print $kout "chr - $name $name 0 $len chr$color_num\n"; | |
| } | |
| } | |
| close $kout; | |
| close $bam_in; | |
| } | |
| sub create_slidingpid_file { | |
| my ($slidingpidfile, $window, $overlap, $bam, $karyotype, $baselen, $random) = @_; | |
| my %vlen; | |
| my %scale; | |
| open (my $kin, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); | |
| while (my $line = <$kin>) { | |
| my (undef, undef, $id, $name, $start, $end) = split (/ /, $line); | |
| $vlen{$id} = $end; | |
| $scale{$id} = floor($vlen{$id}/$baselen); | |
| $scale{$id} = 1 if ($scale{$id} == 0); | |
| } | |
| close $kin; | |
| my $text = ""; | |
| if ($random) { | |
| # calculate a random window percent identity | |
| # my $w = 50; | |
| # my $iter = $length / $w; | |
| # my $s = 0; | |
| # my $e = $w - 1; | |
| # for (my $i = 0; $i < $iter; $i++) { | |
| # my $line_s = $s + $w * $i; | |
| # my $line_e = $e + $w * $i; | |
| # my $x = 75 + int(rand(100 - 75)); | |
| # $text .= "$id $line_s $line_e $x\n"; | |
| # } | |
| } | |
| else { | |
| # calculate sliding window percent identity | |
| print "Running: samtools view $bam | \n"; | |
| open (my $bam_in, "samtools view $bam | ") or die ("Can't open pipe from samtools view of a bam file: $!\n"); | |
| my $bases; # 0th index in array will not be used. | |
| while (<$bam_in>) { | |
| my ($q, $f, $r, $p, $m, $c, undef, undef, undef, $seq, undef, @rest) = split(/\t/, $_); | |
| my ($nm) = $_ =~ /NM:i:(\d+)/; | |
| my $rlen = length($seq); | |
| my $avg_nm_read = ($rlen - $nm) / $rlen; | |
| my $end = $rlen + $p - 1; | |
| for (; $p <= $end; $p++) { | |
| push (@{$bases->{$r}->[$p]}, $avg_nm_read); | |
| } | |
| } | |
| # calculate the mean of the average NM / base values for each base | |
| foreach my $id (keys %$bases) { | |
| for (my $i = 1; $i <= $#{$bases->{$id}}; $i++) { | |
| my $mean=0; | |
| if (defined $bases->{$id}->[$i]){ | |
| $mean=mean(@{ $bases->{$id}->[$i] }); | |
| #print "$i $mean",$/; | |
| } | |
| $bases->{$id}->[$i] = $mean; | |
| } | |
| } | |
| foreach my $id (keys %$bases) { | |
| next if( !exists $vlen{$id} ); | |
| my $contig_length = $vlen{$id}; | |
| my $align_len_max = $#{$bases->{$id}}; | |
| my $w = $window * $scale{$id}; | |
| my $o = $overlap * $scale{$id}; | |
| # The last aligned read could before the genome ends | |
| # Fill bases with 0 beyon last aligned antry so window does not hit any undefined values . | |
| for(my $i=$align_len_max+1; $i<=$align_len_max + ($w - 1); $i++){ | |
| $bases->{$id}->[$i] = 0; | |
| } | |
| for (my $i = 1; $i <= $align_len_max - ($w - 1) && $i <= $contig_length - ($w - 1); $i += ($w - $o) ) { | |
| my $end = $i + ($w - 1); | |
| my $m=mean(@{ $bases->{$id} }[$i .. $end]); | |
| next if ($m==0.0); | |
| $text .= "$id $i $end " . 100 * $m . "\n"; | |
| } | |
| } | |
| } | |
| open (my $sout, ">", $slidingpidfile) or die ("Can't open $slidingpidfile for writing: $!\n"); | |
| print $sout $text; | |
| close $sout; | |
| } | |
| sub create_annot_file { | |
| my ($annotfile, $karyotype) = @_; | |
| # get ID, start and end for each chromosome and create an annotation for each chromosome | |
| open (my $kin, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); | |
| open (my $aout, ">", $annotfile) or die ("Can't open $annotfile for writing: $!\n"); | |
| while (my $line = <$kin>) { | |
| my (undef, undef, $id, $name, $start, $end) = split (/ /, $line); | |
| print $aout "$id $start $end $id\n"; | |
| } | |
| close $kin; | |
| close $aout; | |
| } | |
| sub create_depth_file { | |
| my ($depthfile, $bam, $baselen, $karyotype) = @_; | |
| my $awk = q{awk -F "\t" '{print $1,$2-1,$2,$3}'}; | |
| my $command = join("", "samtools depth $bam | ", $awk, " > $depthfile"); | |
| print "Running: $command\n"; | |
| system($command)==0 or die "Command failed: $command"; | |
| # get lengths of chromosomes and calculate scale for each | |
| open (my $KARYO, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); | |
| my %scale; | |
| while (<$KARYO>) { | |
| my (undef, undef, $id, undef, undef, $length) = split(/ /, $_); | |
| $scale{$id} = floor($length/$baselen); | |
| $scale{$id} = 1 if ($scale{$id} == 0); | |
| } | |
| close ($KARYO); | |
| # read depth file and bin based on ID | |
| open (my $DF, "<", $depthfile) or die ("Can't open $depthfile for reading: $!\n"); | |
| my %depth; | |
| while (<$DF>) { | |
| chomp; | |
| foreach my $id (keys %scale) { | |
| if (/^$id/) { | |
| push ( @{ $depth{$id} }, $_ ); | |
| } | |
| } | |
| } | |
| close ($DF); | |
| open (my $OUT, ">", $depthfile) or die ("Can't open $depthfile for writing: $!\n"); | |
| foreach my $id (keys %depth) { | |
| my @depth = (); | |
| my ($s, $e); | |
| my $lineno = 0; | |
| foreach ( @{ $depth{$id} } ) { | |
| my $d; | |
| ($id, $s, $e, $d) = split(/ /, $_); | |
| push (@depth, $d); | |
| if (++$lineno % $scale{$id} == 0) { | |
| my $avgd = mean(@depth); | |
| @depth = (); | |
| print $OUT "$id $s $e $avgd\n"; | |
| } | |
| } | |
| # there will be remaining depth values to be averaged if number of depth lines for ID is not a factor of SCALE | |
| if (@depth) { | |
| my $avgd = mean(@depth); | |
| @depth = (); | |
| print $OUT "$id $s $e $avgd\n"; | |
| } | |
| } | |
| close ($OUT); | |
| } | |
| sub circos_conf { | |
| my ($karyotype, $annot, $depth, $pid, $baselen) = @_; | |
| my $ticks_text = ticks_conf($karyotype, $baselen); | |
| my $ideogram_text= ideogram_conf(); | |
| my $depth_text= depth_conf($depth); | |
| my $slidingpid_text = ""; | |
| $slidingpid_text = slidingpid_conf($pid) if ($pid); | |
| my $text = <<HEREDOC; | |
| <<include etc/colors_fonts_patterns.conf>> | |
| $ticks_text | |
| $ideogram_text | |
| karyotype = ./$karyotype | |
| <image> | |
| <<include etc/image.conf>> | |
| </image> | |
| <<include etc/housekeeping.conf>> | |
| #chromosomes_units = 1000 | |
| #chromosomes_display_default = yes | |
| <plots> | |
| ############ | |
| ### ORFS ### | |
| <plot> | |
| type = tile | |
| file = $annot | |
| r1 = 1r | |
| r0 = 0.85r | |
| layers = 3 | |
| margin = 0.01u | |
| thickness = 50 | |
| padding = 4 | |
| layers_overflow = hide | |
| orientation = out | |
| stroke_thickness = 1 | |
| stroke_color = grey | |
| color = vlorange | |
| #<rules> | |
| #<rule> | |
| #condition = var(size) < 1kb | |
| #color = lgrey | |
| #</rule> | |
| #</rules> | |
| </plot> | |
| <plot> | |
| type = text | |
| file = $annot | |
| r1 = 1r | |
| r0 = 0.85r | |
| show_links = yes | |
| link_dims = 2p,2p,2p,2p,2p | |
| link_thickness = 4p | |
| link_color = red | |
| #label_rotate = no | |
| label_parallel = yes | |
| label_size = 30p | |
| label_font = bold | |
| </plot> | |
| ################ | |
| ### DEPTH ### | |
| <plot> | |
| $depth_text | |
| </plot> | |
| ################### | |
| ### SLIDING PID ### | |
| <plot> | |
| $slidingpid_text | |
| </plot> | |
| </plots> | |
| HEREDOC | |
| return $text; | |
| } | |
| sub ideogram_conf { | |
| my $text = <<HEREDOC; | |
| ### IDEOGRAM ### | |
| <ideogram> | |
| <spacing> | |
| default = 0.01r | |
| break = 0.5r | |
| </spacing> | |
| # ideogram position, fill and outline | |
| radius = 0.9r | |
| thickness = 50p # default = 20p | |
| fill = yes | |
| stroke_color = dgrey | |
| stroke_thickness = 2p | |
| # minimum definition for ideogram labels | |
| show_label = yes | |
| label_font = default | |
| label_radius = 1r + 75p | |
| label_size = 50 | |
| label_parallel = yes | |
| </ideogram> | |
| HEREDOC | |
| return $text; | |
| } | |
| sub ticks_conf { | |
| my ($karyotype, $baselen) = @_; | |
| open (my $KARYO, "<", $karyotype) or die ("Can't open $karyotype: $!\n"); | |
| my @lengths = (); | |
| while (<$KARYO>) { | |
| my (undef, undef, $id, undef, undef, $length) = split(/ /, $_); | |
| push (@lengths, $length); | |
| } | |
| close ($KARYO); | |
| my $maxl = max (@lengths); | |
| my $scale = 1; | |
| if ($maxl < $baselen) { $scale = 1; } | |
| elsif ($maxl < $baselen * 10) { $scale = 10; } | |
| elsif ($maxl < $baselen * 100) { $scale = 100; } | |
| else { $scale = 1000;} | |
| my $small_spacing = 100 * $scale . "u"; | |
| my $large_spacing = 1000 * $scale . "u"; | |
| my $text = <<HEREDOC; | |
| ### TICKS ### | |
| show_ticks = yes | |
| show_tick_labels = yes | |
| <ticks> | |
| radius = 1r | |
| color = black | |
| thickness = 2p | |
| # the tick label is derived by multiplying the tick position | |
| # by 'multiplier' and casting it in 'format': | |
| # | |
| # sprintf(format,position*multiplier) | |
| multiplier = 1 | |
| # %d - integer | |
| # %f - float | |
| # %.1f - float with one decimal | |
| # %.2f - float with two decimals | |
| # | |
| # for other formats, see http://perldoc.perl.org/functions/sprintf.html | |
| format = %d | |
| <tick> | |
| spacing = $small_spacing | |
| size = 10p | |
| </tick> | |
| <tick> | |
| spacing = $large_spacing | |
| size = 25p | |
| show_label = yes | |
| label_size = 24p | |
| label_offset = 8p | |
| format = %d | |
| </tick> | |
| </ticks> | |
| HEREDOC | |
| return $text; | |
| } | |
| sub depth_conf { | |
| my ($depthfile) = @_; | |
| my $text = <<HEREDOC; | |
| ### COVERAGE ### | |
| type = histogram | |
| file = $depthfile | |
| r1 = 0.80r | |
| r0 = 0.70r | |
| thickness = 0p | |
| #AP: the line below erros out with circos-0.69-3 (looks like it expects just a single value) | |
| #fill_color=red,green,blue,yellow # vdgrey | |
| fill_color=red | |
| HEREDOC | |
| return $text; | |
| } | |
| sub slidingpid_conf { | |
| my ($slidingpidfile) = @_; | |
| my $text = <<HEREDOC; | |
| ### SLIDING PID ### | |
| type = line | |
| thickness = 10 # 2 | |
| max_gap = 1u | |
| file = $slidingpidfile | |
| color = chr15 | |
| #fill_color = chr17 | |
| min = 0 | |
| max = 100 | |
| r1 = 0.65r | |
| r0 = 0.55r | |
| <axes> | |
| <axis> | |
| color = lgrey | |
| thickness = 1 | |
| spacing = 0.1r | |
| </axis> | |
| </axes> | |
| HEREDOC | |
| return $text; | |
| } | |
| sub writefile { | |
| my ($text, $outfile) = @_; | |
| open (my $out, ">", $outfile) or die ("Can't open $outfile for writing: $!\n"); | |
| print $out $text; | |
| print STDERR "Wrote $outfile file\n"; | |
| close ($out); | |
| } | |
| sub mean { | |
| if (@_) { | |
| return sum(@_)/@_; | |
| } | |
| else { | |
| return 0; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment