Skip to content

Instantly share code, notes, and snippets.

@zm-git-dev
Forked from liyupeng111/parse_gff3_cds.pl
Created July 25, 2018 01:10
Show Gist options
  • Save zm-git-dev/c70a66337bb9717c7e50b1daadde03a1 to your computer and use it in GitHub Desktop.
Save zm-git-dev/c70a66337bb9717c7e50b1daadde03a1 to your computer and use it in GitHub Desktop.
get cds sequences from gff3 file
#!/usr/bin/perl
# parse_gff3.pl
# get the cds sequences from gff3 file.
# usage: perl parse_gff3_cds.pl gff3_input_file genome_sequence_directory >output.txt
use strict;
use warnings;
use Bio::Tools::GFF;
use Bio::DB::Fasta;
my $gff3_file=$ARGV[0]; #gff3 input file
my $genome=$ARGV[1]; #genome sequence directory
my $gffio = Bio::Tools::GFF -> new(-file =>$gff3_file , -gff_version => 3);
my $db = Bio::DB::Fasta -> new($genome);
my $i=0;
my $first=0;
my $strand=0;
my $seq='';
while(my $feature = $gffio->next_feature()) {
# Sometimes, the gff3 file format is a little with the standard format,
# So that the keys of the hashes are different.
# Use the following lines of masked code to see the keys of the hashes. Some Change may be needed.
# $i++;
# if ($i==2){
# my @a=keys %{$feature};
# print "@a\n";
# my @b=keys %{$feature->{_location}};
# print "@b\n";
# print "$feature->{_primary_tag}\n";
# }
if($feature->{_primary_tag}=~/mrna/i){
$first++;
if($first==1){
$seq='';
}else{
print_sequence($seq,80,$strand);
$seq='';
}
print ">$feature->{_gsf_tag_hash}->{Name}->[0]|$feature->{_gsf_tag_hash}->{ID}->[0]\n";
}elsif($feature->{_primary_tag}=~/cds/i){
my $seq_temp=$db->seq($feature->{_gsf_seq_id}, $feature->{_location}->{_start}=>$feature->{_location}->{_end});
if($feature->{_location}->{_strand}=~/-/){ # to know the strand
$seq=$seq_temp.$seq;
$strand=0;
}else{
$seq.=$seq_temp;
$strand=1;
}
}
}
print_sequence($seq,80,$strand);
$gffio->close();
sub print_sequence {
my($sequence, $length,$strand) = @_;
if($strand==0){ # if the sequence is on the minus strand, get its reverse-complement counterpart
$sequence=~tr/ACGTacgt/TGCAtgca/;
$sequence=reverse $sequence;
}
for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) {
print substr($sequence, $pos, $length),"\n";
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment