-
-
Save zm-git-dev/c70a66337bb9717c7e50b1daadde03a1 to your computer and use it in GitHub Desktop.
get cds sequences from gff3 file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/perl | |
| # parse_gff3.pl | |
| # get the cds sequences from gff3 file. | |
| # usage: perl parse_gff3_cds.pl gff3_input_file genome_sequence_directory >output.txt | |
| use strict; | |
| use warnings; | |
| use Bio::Tools::GFF; | |
| use Bio::DB::Fasta; | |
| my $gff3_file=$ARGV[0]; #gff3 input file | |
| my $genome=$ARGV[1]; #genome sequence directory | |
| my $gffio = Bio::Tools::GFF -> new(-file =>$gff3_file , -gff_version => 3); | |
| my $db = Bio::DB::Fasta -> new($genome); | |
| my $i=0; | |
| my $first=0; | |
| my $strand=0; | |
| my $seq=''; | |
| while(my $feature = $gffio->next_feature()) { | |
| # Sometimes, the gff3 file format is a little with the standard format, | |
| # So that the keys of the hashes are different. | |
| # Use the following lines of masked code to see the keys of the hashes. Some Change may be needed. | |
| # $i++; | |
| # if ($i==2){ | |
| # my @a=keys %{$feature}; | |
| # print "@a\n"; | |
| # my @b=keys %{$feature->{_location}}; | |
| # print "@b\n"; | |
| # print "$feature->{_primary_tag}\n"; | |
| # } | |
| if($feature->{_primary_tag}=~/mrna/i){ | |
| $first++; | |
| if($first==1){ | |
| $seq=''; | |
| }else{ | |
| print_sequence($seq,80,$strand); | |
| $seq=''; | |
| } | |
| print ">$feature->{_gsf_tag_hash}->{Name}->[0]|$feature->{_gsf_tag_hash}->{ID}->[0]\n"; | |
| }elsif($feature->{_primary_tag}=~/cds/i){ | |
| my $seq_temp=$db->seq($feature->{_gsf_seq_id}, $feature->{_location}->{_start}=>$feature->{_location}->{_end}); | |
| if($feature->{_location}->{_strand}=~/-/){ # to know the strand | |
| $seq=$seq_temp.$seq; | |
| $strand=0; | |
| }else{ | |
| $seq.=$seq_temp; | |
| $strand=1; | |
| } | |
| } | |
| } | |
| print_sequence($seq,80,$strand); | |
| $gffio->close(); | |
| sub print_sequence { | |
| my($sequence, $length,$strand) = @_; | |
| if($strand==0){ # if the sequence is on the minus strand, get its reverse-complement counterpart | |
| $sequence=~tr/ACGTacgt/TGCAtgca/; | |
| $sequence=reverse $sequence; | |
| } | |
| for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) { | |
| print substr($sequence, $pos, $length),"\n"; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment