Skip to content

Instantly share code, notes, and snippets.

@AswinSSoman
Forked from pcantalupo/bam2circos.pl
Created November 10, 2022 06:53
Show Gist options
  • Save AswinSSoman/454fd7ef0cdb6ad9db52e4045a39b06e to your computer and use it in GitHub Desktop.
Save AswinSSoman/454fd7ef0cdb6ad9db52e4045a39b06e to your computer and use it in GitHub Desktop.
#!/usr/bin/env perl
=head1 NAME
bam2circos.pl - Convert BAM file to Circos plot
=head1 SYNOPSIS
bam2circos.pl [optional] -b FILE
Required:
-b|bam BAM file
Optional:
-k|karyotype Karyotype file
-d|depth FILE Depth file
-s|slidingpid FILE Sliding window percent identity file
-w|window INT Window size (default: 50) for sliding window pid calculation
-o|overlap INT Overlap size (default: 10) for sliding window pid calculation
-a|annotations FILE Annotation file
--keep Keep the generated circos configuration and text files
-circoslog FILE Log circos outputs to FILE rather than std outputs
-t|test Generate circos configuration files but don't execute circos
-h|help Full documentation
=head1 OPTIONS
=over 4
=item B<-b|bam>
BAM file containing aligments to one or more reference sequences.
Reference IDs in RNAME column must match those found in the karyotype,
annotation, depth and slidingpid files.
=item B<-k|karyotype>
The karyotype file needs to conform to the Circos karyotype file format:
http://circos.ca/tutorials/lessons/ideograms/karyotypes/. For most
viruses, the file will contain one row to define the full length of the
viral genome. You can add rows to define segmented viruses. The 3rd
column (ID) must match the RNAME column (reference column) of the BAM
file
Example: Bunyaviridae have 3 segments
chr - NC_014397 L 0 6404 blue
chr - NC_014396 M 0 3885
chr - NC_014395 S 0 1690
=item B<-a|annot>
The annotation file needs to conform to the Circos text track file
format: http://circos.ca/documentation/tutorials/2d_tracks/text_1/
Example (two genes named FOO and BAR)
NC_001669 450 650 FOO
NC_001669 600 1200 BAR
=item B<-d|depth>
The file needs to conform to the Circos data file format:
http://circos.ca/documentation/tutorials/2d_tracks/histograms/
Example (base 1 has depth of 5 and base 2 depth of 10)
NC_001669 0 1 5
NC_001669 1 2 10
=item B<-s|slidingpid>
The file needs to conform to the Circos data file format above except
that value of the fourth column is the sliding window percent identity
Example (50bp sliding window where 1st 50bp is 88.5%, 2nd 50bp is 93.4%)
NC_001669 0 50 88.5
NC_001669 51 100 93.4
=item B<-help>
Print full documentation and exits.
=back
=head1 DESCRIPTION
B<This program> will do blah blah blah.
=head1 REQUIREMENTS
=over 4
=item circos
=item samtools
=item awk
=back
=cut
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use List::Util qw/sum max/;
use POSIX;
my %conf = (circos => "circos.conf",
);
my ($bam, $depth, $pid, $annot, $karyo);
my ($window, $overlap) = (50, 10);
my ($help, $man, $keep, $test) = (0, 0, 0, 0);
my ($circoslog) = "-";
my ($baselen) = 10000; # 10KB
GetOptions ('b|bam=s' => \$bam,
'd|depth=s' => \$depth,
's|slidingpid=s' => \$pid,
'w|window=i' => \$window,
'o|overlap=i' => \$overlap,
'a|annotations=s'=> \$annot,
'k|karyotype=s' => \$karyo,
'h|help' => \$help,
't|test' => \$test,
'l|circoslog=s' => \$circoslog,
'n|baselen=i' => \$baselen,
'keep' => \$keep,
) || pod2usage (-verbose => 0);
pod2usage(-verbose => 2) if ($help);
pod2usage(-verbose => 1) if (!$bam);
initialize (\%conf, $karyo, $annot, $pid, $depth, $bam, $window, $overlap, $baselen);
run(\%conf) if ($test == 0);
cleanup (\%conf, $keep);
exit 0;
################################################################################
sub initialize {
my ($conf, $karyotype, $annot, $pid, $depth, $bam, $window, $overlap, $baselen) = @_;
my $karyotemp = "temp.karyo.txt";
$karyotype //= $karyotemp;
if ($karyotype eq $karyotemp) {
create_karyo_file ($karyotemp, $bam);
}
my $annottemp = "temp.annot.txt";
$annot //= $annottemp;
if ($annot eq $annottemp) {
create_annot_file ($annottemp, $karyotype);
}
my $depthtemp = "temp.depth.txt";
$depth //= $depthtemp;
if ($depth eq $depthtemp) {
create_depth_file ($depthtemp, $bam, $baselen, $karyotype);
}
my $slidingpidtemp = "temp.slidingpid.txt";
$pid //= $slidingpidtemp;
if ($pid eq $slidingpidtemp) {
create_slidingpid_file($slidingpidtemp, $window, $overlap, $bam, $karyotype, $baselen);
}
my $text = circos_conf($karyotype, $annot, $depth, $pid, $baselen);
writefile($text,$conf->{circos});
}
sub run {
my ($conf) = @_;
my $circosconf = $conf->{circos};
my $command = "circos -conf $circosconf";
#log to the file if not a console
$command = "$command > $circoslog 2>&1" if ($circoslog ne "-");
print "Running: $command\n";
system("$command")==0 or die "Call to circos failed";
}
sub cleanup {
my ($conf, $keep) = @_;
if ($keep) {
print STDERR "Not deleting generated circos conf files\n";
}
else {
foreach my $conffile (%$conf) {
unlink ($conffile);
}
unlink ("temp.karyo.txt", "temp.annot.txt", "temp.depth.txt", "temp.slidingpid.txt");
}
}
sub create_karyo_file {
my ($karyofile, $bam) = @_;
open (my $bam_in, "samtools view -H $bam | ") or die ("Can't open pipe from samtools view of a bam file: $!\n");
open (my $kout, ">", $karyofile) or die ("Can't open $karyofile for writing: $!\n");
my $segnum = 0;
while (<$bam_in>) {
if ( /^\@SQ\s+SN:([^\s]+)\s+LN:([0-9]+)/ ) {
my $name = $1;
my $len = $2;
my $color_num = 1 + ($segnum++ % 23); # coloring used by circos. They map human chromosome numbers to colors.
print $kout "chr - $name $name 0 $len chr$color_num\n";
}
}
close $kout;
close $bam_in;
}
sub create_slidingpid_file {
my ($slidingpidfile, $window, $overlap, $bam, $karyotype, $baselen, $random) = @_;
my %vlen;
my %scale;
open (my $kin, "<", $karyotype) or die ("Can't open $karyotype: $!\n");
while (my $line = <$kin>) {
my (undef, undef, $id, $name, $start, $end) = split (/ /, $line);
$vlen{$id} = $end;
$scale{$id} = floor($vlen{$id}/$baselen);
$scale{$id} = 1 if ($scale{$id} == 0);
}
close $kin;
my $text = "";
if ($random) {
# calculate a random window percent identity
# my $w = 50;
# my $iter = $length / $w;
# my $s = 0;
# my $e = $w - 1;
# for (my $i = 0; $i < $iter; $i++) {
# my $line_s = $s + $w * $i;
# my $line_e = $e + $w * $i;
# my $x = 75 + int(rand(100 - 75));
# $text .= "$id $line_s $line_e $x\n";
# }
}
else {
# calculate sliding window percent identity
print "Running: samtools view $bam | \n";
open (my $bam_in, "samtools view $bam | ") or die ("Can't open pipe from samtools view of a bam file: $!\n");
my $bases; # 0th index in array will not be used.
while (<$bam_in>) {
my ($q, $f, $r, $p, $m, $c, undef, undef, undef, $seq, undef, @rest) = split(/\t/, $_);
my ($nm) = $_ =~ /NM:i:(\d+)/;
my $rlen = length($seq);
my $avg_nm_read = ($rlen - $nm) / $rlen;
my $end = $rlen + $p - 1;
for (; $p <= $end; $p++) {
push (@{$bases->{$r}->[$p]}, $avg_nm_read);
}
}
# calculate the mean of the average NM / base values for each base
foreach my $id (keys %$bases) {
for (my $i = 1; $i <= $#{$bases->{$id}}; $i++) {
my $mean=0;
if (defined $bases->{$id}->[$i]){
$mean=mean(@{ $bases->{$id}->[$i] });
#print "$i $mean",$/;
}
$bases->{$id}->[$i] = $mean;
}
}
foreach my $id (keys %$bases) {
next if( !exists $vlen{$id} );
my $contig_length = $vlen{$id};
my $align_len_max = $#{$bases->{$id}};
my $w = $window * $scale{$id};
my $o = $overlap * $scale{$id};
# The last aligned read could before the genome ends
# Fill bases with 0 beyon last aligned antry so window does not hit any undefined values .
for(my $i=$align_len_max+1; $i<=$align_len_max + ($w - 1); $i++){
$bases->{$id}->[$i] = 0;
}
for (my $i = 1; $i <= $align_len_max - ($w - 1) && $i <= $contig_length - ($w - 1); $i += ($w - $o) ) {
my $end = $i + ($w - 1);
my $m=mean(@{ $bases->{$id} }[$i .. $end]);
next if ($m==0.0);
$text .= "$id $i $end " . 100 * $m . "\n";
}
}
}
open (my $sout, ">", $slidingpidfile) or die ("Can't open $slidingpidfile for writing: $!\n");
print $sout $text;
close $sout;
}
sub create_annot_file {
my ($annotfile, $karyotype) = @_;
# get ID, start and end for each chromosome and create an annotation for each chromosome
open (my $kin, "<", $karyotype) or die ("Can't open $karyotype: $!\n");
open (my $aout, ">", $annotfile) or die ("Can't open $annotfile for writing: $!\n");
while (my $line = <$kin>) {
my (undef, undef, $id, $name, $start, $end) = split (/ /, $line);
print $aout "$id $start $end $id\n";
}
close $kin;
close $aout;
}
sub create_depth_file {
my ($depthfile, $bam, $baselen, $karyotype) = @_;
my $awk = q{awk -F "\t" '{print $1,$2-1,$2,$3}'};
my $command = join("", "samtools depth $bam | ", $awk, " > $depthfile");
print "Running: $command\n";
system($command)==0 or die "Command failed: $command";
# get lengths of chromosomes and calculate scale for each
open (my $KARYO, "<", $karyotype) or die ("Can't open $karyotype: $!\n");
my %scale;
while (<$KARYO>) {
my (undef, undef, $id, undef, undef, $length) = split(/ /, $_);
$scale{$id} = floor($length/$baselen);
$scale{$id} = 1 if ($scale{$id} == 0);
}
close ($KARYO);
# read depth file and bin based on ID
open (my $DF, "<", $depthfile) or die ("Can't open $depthfile for reading: $!\n");
my %depth;
while (<$DF>) {
chomp;
foreach my $id (keys %scale) {
if (/^$id/) {
push ( @{ $depth{$id} }, $_ );
}
}
}
close ($DF);
open (my $OUT, ">", $depthfile) or die ("Can't open $depthfile for writing: $!\n");
foreach my $id (keys %depth) {
my @depth = ();
my ($s, $e);
my $lineno = 0;
foreach ( @{ $depth{$id} } ) {
my $d;
($id, $s, $e, $d) = split(/ /, $_);
push (@depth, $d);
if (++$lineno % $scale{$id} == 0) {
my $avgd = mean(@depth);
@depth = ();
print $OUT "$id $s $e $avgd\n";
}
}
# there will be remaining depth values to be averaged if number of depth lines for ID is not a factor of SCALE
if (@depth) {
my $avgd = mean(@depth);
@depth = ();
print $OUT "$id $s $e $avgd\n";
}
}
close ($OUT);
}
sub circos_conf {
my ($karyotype, $annot, $depth, $pid, $baselen) = @_;
my $ticks_text = ticks_conf($karyotype, $baselen);
my $ideogram_text= ideogram_conf();
my $depth_text= depth_conf($depth);
my $slidingpid_text = "";
$slidingpid_text = slidingpid_conf($pid) if ($pid);
my $text = <<HEREDOC;
<<include etc/colors_fonts_patterns.conf>>
$ticks_text
$ideogram_text
karyotype = ./$karyotype
<image>
<<include etc/image.conf>>
</image>
<<include etc/housekeeping.conf>>
#chromosomes_units = 1000
#chromosomes_display_default = yes
<plots>
############
### ORFS ###
<plot>
type = tile
file = $annot
r1 = 1r
r0 = 0.85r
layers = 3
margin = 0.01u
thickness = 50
padding = 4
layers_overflow = hide
orientation = out
stroke_thickness = 1
stroke_color = grey
color = vlorange
#<rules>
#<rule>
#condition = var(size) < 1kb
#color = lgrey
#</rule>
#</rules>
</plot>
<plot>
type = text
file = $annot
r1 = 1r
r0 = 0.85r
show_links = yes
link_dims = 2p,2p,2p,2p,2p
link_thickness = 4p
link_color = red
#label_rotate = no
label_parallel = yes
label_size = 30p
label_font = bold
</plot>
################
### DEPTH ###
<plot>
$depth_text
</plot>
###################
### SLIDING PID ###
<plot>
$slidingpid_text
</plot>
</plots>
HEREDOC
return $text;
}
sub ideogram_conf {
my $text = <<HEREDOC;
### IDEOGRAM ###
<ideogram>
<spacing>
default = 0.01r
break = 0.5r
</spacing>
# ideogram position, fill and outline
radius = 0.9r
thickness = 50p # default = 20p
fill = yes
stroke_color = dgrey
stroke_thickness = 2p
# minimum definition for ideogram labels
show_label = yes
label_font = default
label_radius = 1r + 75p
label_size = 50
label_parallel = yes
</ideogram>
HEREDOC
return $text;
}
sub ticks_conf {
my ($karyotype, $baselen) = @_;
open (my $KARYO, "<", $karyotype) or die ("Can't open $karyotype: $!\n");
my @lengths = ();
while (<$KARYO>) {
my (undef, undef, $id, undef, undef, $length) = split(/ /, $_);
push (@lengths, $length);
}
close ($KARYO);
my $maxl = max (@lengths);
my $scale = 1;
if ($maxl < $baselen) { $scale = 1; }
elsif ($maxl < $baselen * 10) { $scale = 10; }
elsif ($maxl < $baselen * 100) { $scale = 100; }
else { $scale = 1000;}
my $small_spacing = 100 * $scale . "u";
my $large_spacing = 1000 * $scale . "u";
my $text = <<HEREDOC;
### TICKS ###
show_ticks = yes
show_tick_labels = yes
<ticks>
radius = 1r
color = black
thickness = 2p
# the tick label is derived by multiplying the tick position
# by 'multiplier' and casting it in 'format':
#
# sprintf(format,position*multiplier)
multiplier = 1
# %d - integer
# %f - float
# %.1f - float with one decimal
# %.2f - float with two decimals
#
# for other formats, see http://perldoc.perl.org/functions/sprintf.html
format = %d
<tick>
spacing = $small_spacing
size = 10p
</tick>
<tick>
spacing = $large_spacing
size = 25p
show_label = yes
label_size = 24p
label_offset = 8p
format = %d
</tick>
</ticks>
HEREDOC
return $text;
}
sub depth_conf {
my ($depthfile) = @_;
my $text = <<HEREDOC;
### COVERAGE ###
type = histogram
file = $depthfile
r1 = 0.80r
r0 = 0.70r
thickness = 0p
#AP: the line below erros out with circos-0.69-3 (looks like it expects just a single value)
#fill_color=red,green,blue,yellow # vdgrey
fill_color=red
HEREDOC
return $text;
}
sub slidingpid_conf {
my ($slidingpidfile) = @_;
my $text = <<HEREDOC;
### SLIDING PID ###
type = line
thickness = 10 # 2
max_gap = 1u
file = $slidingpidfile
color = chr15
#fill_color = chr17
min = 0
max = 100
r1 = 0.65r
r0 = 0.55r
<axes>
<axis>
color = lgrey
thickness = 1
spacing = 0.1r
</axis>
</axes>
HEREDOC
return $text;
}
sub writefile {
my ($text, $outfile) = @_;
open (my $out, ">", $outfile) or die ("Can't open $outfile for writing: $!\n");
print $out $text;
print STDERR "Wrote $outfile file\n";
close ($out);
}
sub mean {
if (@_) {
return sum(@_)/@_;
}
else {
return 0;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment