#!/usr/bin/perl
# parse_gff3.pl
# get the cds sequences from gff3 file.
# usage: perl parse_gff3_cds.pl gff3_input_file genome_sequence_directory >output.txt

use strict;
use warnings;

use Bio::Tools::GFF;
use Bio::DB::Fasta;

my $gff3_file=$ARGV[0];  #gff3 input file
my $genome=$ARGV[1]; #genome sequence directory
my $gffio = Bio::Tools::GFF -> new(-file =>$gff3_file , -gff_version => 3);
my $db    = Bio::DB::Fasta  -> new($genome);

my $i=0;
my $first=0;
my $strand=0;
my $seq='';
while(my $feature = $gffio->next_feature()) {
# Sometimes, the gff3 file format is a little with the standard format,
# So that the keys of the hashes are different.
# Use the following lines of masked code to see the keys of the hashes. Some Change may be needed.
#  $i++;
#  if ($i==2){
#    my @a=keys %{$feature};
#    print "@a\n";
#    my @b=keys %{$feature->{_location}};
#    print "@b\n";
#    print "$feature->{_primary_tag}\n";
#  }
  if($feature->{_primary_tag}=~/mrna/i){
    $first++;
    if($first==1){
      $seq='';
    }else{
      print_sequence($seq,80,$strand);
      $seq='';
    }
    print ">$feature->{_gsf_tag_hash}->{Name}->[0]|$feature->{_gsf_tag_hash}->{ID}->[0]\n";
  }elsif($feature->{_primary_tag}=~/cds/i){
    my $seq_temp=$db->seq($feature->{_gsf_seq_id}, $feature->{_location}->{_start}=>$feature->{_location}->{_end});
    if($feature->{_location}->{_strand}=~/-/){  # to know the strand
      $seq=$seq_temp.$seq;
      $strand=0;
    }else{
      $seq.=$seq_temp;
      $strand=1;
    }
  }
}
print_sequence($seq,80,$strand);

$gffio->close();

sub print_sequence {
  my($sequence, $length,$strand) = @_;
  if($strand==0){  # if the sequence is on the minus strand, get its reverse-complement counterpart
    $sequence=~tr/ACGTacgt/TGCAtgca/;
    $sequence=reverse $sequence;
  }
  for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) {
    print substr($sequence, $pos, $length),"\n";
  }
}