|
|
@@ -0,0 +1,215 @@ |
|
|
#!/usr/local/bin/perl |
|
|
|
|
|
# |
|
|
# Perl script treefam_genelosses.pl |
|
|
# Written by Avril Coghlan ([email protected]). |
|
|
# 28-Aug-06. |
|
|
# |
|
|
# For the TreeFam project. |
|
|
# |
|
|
# This perl script connects to the MYSQL database of |
|
|
# TreeFam families and prints out a list of gene losses from |
|
|
# fully sequenced genomes. This script uses Jean-Karim's TreeFam |
|
|
# API. Finds losses in TreeFam-A clean trees (not seed, as misses fully sequenced genomes, |
|
|
# and not full trees, as are sometimes wrong). xxx use TreeFam-B too? |
|
|
# |
|
|
# The command-line format is: |
|
|
# % perl <treefam_genelosses.pl> |
|
|
# |
|
|
#------------------------------------------------------------------# |
|
|
|
|
|
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: |
|
|
|
|
|
$num_args = $#ARGV + 1; |
|
|
if ($num_args != 0) |
|
|
{ |
|
|
print "Usage of treefam_genelosses.pl\n\n"; |
|
|
print "perl -w treefam_genelosses.pl\n"; |
|
|
print "For example, >perl -w treefam_genelosses.pl\n"; |
|
|
exit; |
|
|
} |
|
|
|
|
|
# DECLARE MYSQL USERNAME AND HOST: |
|
|
|
|
|
use strict; |
|
|
use Treefam::DBConnection; |
|
|
|
|
|
my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS. |
|
|
my $dbc = Treefam::DBConnection->new(); |
|
|
|
|
|
#------------------------------------------------------------------# |
|
|
|
|
|
# GET ALL THE TREEFAM-A FAMILIES IN THE CURRENT RELEASE: |
|
|
my $famh = $dbc->get_FamilyHandle(); # GET A FAMILY HANDLE. |
|
|
my @families = $famh->get_all_by_type('A'); # GET ALL TREEFAM-A FAMILIES. |
|
|
my $no_families = 0; |
|
|
foreach my $family(@families) |
|
|
{ |
|
|
my $AC = $family->ID(); # GET THE FAMILY ID. |
|
|
$no_families++; |
|
|
print "\n$no_families: reading family $AC...\n"; |
|
|
my $tree = $family->tree('clean'); # GET THE TREEFAM-A CLEAN TREE. xxx look at treefam-b too? |
|
|
# GET THE NAMES OF ALL THE GENES IN THIS TREE: |
|
|
my @leaves = $tree->get_leaf_nodes; |
|
|
foreach my $leaf(@leaves) |
|
|
{ |
|
|
my @tags = $leaf->get_all_tags(); |
|
|
my $is_gene_loss = 0; |
|
|
my $loss = "none"; |
|
|
foreach my $tag(@tags) |
|
|
{ |
|
|
# THE TAGS ARE: |
|
|
# G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237 |
|
|
# O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2 |
|
|
# S: SPECIES, eg. SCHP0 OR HUMAN |
|
|
# E: GENE LOSS |
|
|
# B: BOOTSTRAP VALUE |
|
|
for my $value ($leaf->get_tag_values($tag)) |
|
|
{ |
|
|
if ($tag eq 'E') |
|
|
{ |
|
|
$loss = $value; |
|
|
$is_gene_loss = 1; |
|
|
my @temp = split(/\$\-/,$loss); |
|
|
$loss = $temp[1]; |
|
|
if ($loss ne 'HUMAN' && $loss ne 'Homo') { goto NEXT_LEAF;} # xxx JUST LOOKING AT LOSES OF HUMAN GENES WHERE THERE IS A CHIMP GENE. |
|
|
# NOTE THAT IF CHIMP AND HUMAN WERE LOST, THEN $loss WOULD BE 'HUMAN,PANTR'. |
|
|
print "___ $AC: ID ",$leaf->id(),"\n"; # THIS IS SYMBOL."_".SPECIES, eg. CCNC_HUMAN |
|
|
print "___ GENE LOSS $loss (AC $AC)\n"; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
# IF THIS NODE $leaf->id() HAS A GENE LOSS ASSOCIATED WITH IT, CHECK WHETHER |
|
|
# THE CLADE THAT IT IS IN IS GOOD EVIDENCE FOR THE LOSS: |
|
|
my $parent = 'none'; |
|
|
if ($is_gene_loss == 1) |
|
|
{ |
|
|
#---------------------------------------------------------# |
|
|
FIND_PARENT: |
|
|
# FIND THE PARENT NODE OF NODE $leaf: |
|
|
if ($parent eq 'none') |
|
|
{ |
|
|
if (defined($leaf->ancestor)) { $parent = $leaf->ancestor; } |
|
|
else { goto ALL_PARENTS_FOUND; } |
|
|
} |
|
|
else # GET THE PARENT NODE OF NODE $parent: |
|
|
{ |
|
|
if (defined($parent->ancestor)){ $parent = $parent->ancestor;} |
|
|
else { goto ALL_PARENTS_FOUND; } |
|
|
} |
|
|
if ($VERBOSE == 1) { print "___ PARENT OF CURRENT NODE is", $parent->internal_id(),"\n";} |
|
|
# FIND THE DESCENDENTS OF NODE $parent, AND CHECK IF THE CLADE OF DESCENDENTS |
|
|
# IS GOOD EVIDENCE FOR THE GENE LOSS $loss: |
|
|
my %SPECIES = (); # HASH TABLE TO RECORD SPECIES IN THIS CLADE. |
|
|
for my $descendent ($parent->get_all_Descendents()) |
|
|
{ |
|
|
if (defined($descendent->id())) # IF THIS IS A LEAF. |
|
|
{ |
|
|
my $id = $descendent->id(); |
|
|
if ($VERBOSE == 1) { print "___ WHICH HAS DESCENDENT $id\n";} |
|
|
my @temp = split(/_/,$id); |
|
|
my $species = $temp[$#temp]; |
|
|
$SPECIES{$species} = 1; |
|
|
} |
|
|
} |
|
|
# CHECK WHETHER THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR THE GENE LOSS $loss: |
|
|
my $evidence_for_loss= &check_evidence_for_loss($loss,\%SPECIES); |
|
|
if ($VERBOSE == 1) { print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";} |
|
|
if ($evidence_for_loss == 1) { goto ALL_PARENTS_FOUND;} |
|
|
# FIND THE PARENT OF NODE $parent: |
|
|
goto FIND_PARENT; |
|
|
#---------------------------------------------------------# |
|
|
|
|
|
ALL_PARENTS_FOUND: |
|
|
if ($VERBOSE == 1) { print "___ ALL PARENTS FOUND\n\n";} |
|
|
# IF THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR GENE LOSS $loss, CHECK IF THERE |
|
|
# ARE ANY OTHER LOSSES IN THAT CLADE THAT MAY MAKE $loss LESS BELIEVABLE: |
|
|
print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n"; |
|
|
if ($evidence_for_loss == 1) |
|
|
{ |
|
|
for my $descendent ($parent->get_all_Descendents()) |
|
|
{ |
|
|
if (defined($descendent->id())) # IF THIS IS A LEAF. |
|
|
{ |
|
|
my $id = $descendent->id(); |
|
|
my ($loss2) = $descendent->get_tag_values('E'); |
|
|
if (defined($loss2)) |
|
|
{ |
|
|
my @temp2 = split(/\$-/,$loss2); |
|
|
my $species2= $temp2[1]; |
|
|
print "descendent $id -> loss $loss2\n"; |
|
|
if (($loss eq 'HUMAN' || $loss eq 'Homo') && ($species2 =~ /PANTR/ || $species2 =~ /Pan/)) # xxx add other cases |
|
|
{ |
|
|
$evidence_for_loss = 0; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
print "___ FINAL EVIDENCE FOR LOSS IN $loss: now $evidence_for_loss\n"; |
|
|
#---------------------------------------------------------# |
|
|
} |
|
|
NEXT_LEAF: # xxx |
|
|
} |
|
|
} |
|
|
print "\n\n"; |
|
|
|
|
|
print STDERR "FINISHED.\n"; |
|
|
|
|
|
|
|
|
#------------------------------------------------------------------# |
|
|
|
|
|
# SUBROUTINE TO CHECK WHETHER A CLADE OF GENES IS GOOD EVIDENCE FOR A GENE LOSS: |
|
|
|
|
|
sub check_evidence_for_loss |
|
|
{ |
|
|
my $loss = $_[0]; # THE SPECIES IN WHICH THE PUTATIVE GENE LOSS OCCURRED. |
|
|
my $SPECIES = $_[1]; # POINTER TO HASH TABLE OF SPECIES IN THE CLADE THAT SPECIES $species IS MISSING FROM. |
|
|
my $evidence = 1; # SAYS WHETHER THE EVIDENCE FOR GENE LOSS IS STRONG. |
|
|
my @all = ('HUMAN','PANTR','MACMU','BOVIN','CANFA','MOUSE','RAT','MONDO', |
|
|
'CHICK','XENTR','TETNG','FUGRU','BRARE','CIOIN','DROME','DROPS', |
|
|
'CAEEL','CAEBR','CAERE','APIME','ANOGA','YEAST','SCHPO','ARATH','SCHMA'); |
|
|
# ALL THE FULLY SEQUENCED SPECIES. |
|
|
my @expect; # THE SPECIES THAT WE EXPECT TO SEE IN THE CLADE. |
|
|
my $i; # |
|
|
my %EXPECT = (); # |
|
|
my $expecti; # |
|
|
my @expecti; # |
|
|
my $j; # |
|
|
my $found_expecti = 0; # |
|
|
my $loss2; # |
|
|
|
|
|
# CHECK WHETHER THERE IS STRONG EVIDENCE FOR A GENE LOSS: # xxx ADD MORE CASES |
|
|
# FOR EXAMPLE, WE CAN BE SURE OF A HUMAN GENE LOSS IF THE CLADE CONTAINS AT LEAST |
|
|
# ONE CHIMP OR ONE MACAQUE, AT LEAST ONE RAT OR ONE MOUSE, AT LEAST ONE COW, AND |
|
|
# AT LEAST ONE DOG GENE: |
|
|
if ($loss eq 'HUMAN') { @expect = ('PANTR','MACMU','MOUSE/RAT','BOVIN/CANFA'); } |
|
|
|
|
|
# CHECK THAT WE SEE ALL THE SPECIES THAT WE EXPECT IN THE CLADE, AND NO OTHER SPECIES: |
|
|
for ($i = 0; $i <= $#expect; $i++) |
|
|
{ |
|
|
$expecti = $expect[$i]; |
|
|
@expecti = split(/\//,$expecti); |
|
|
$found_expecti = 0; |
|
|
# CHECK WHETHER WE SEE AT LEAST ONE OF PANTR/MACMU FOR EXAMPLE: |
|
|
for ($j = 0; $j <= $#expecti; $j++) |
|
|
{ |
|
|
$EXPECT{$expecti[$j]} = 1; |
|
|
if ($SPECIES->{$expecti[$j]}) { $found_expecti = 1;} |
|
|
} |
|
|
if ($found_expecti == 0) { $evidence = 0;} # WE DON'T FIND A SPECIES THAT WE EXPECT TO SEE IN THE CLADE. |
|
|
} |
|
|
# CHECK IF WE SEE ANY SPECIES IN THE CLADE THAT WE DON'T EXPECT TO SEE: |
|
|
for ($i = 0; $i <= $#all; $i++) |
|
|
{ |
|
|
if (!($EXPECT{$all[$i]}) && $all[$i] ne $loss) |
|
|
{ |
|
|
if ($SPECIES->{$all[$i]}) { $evidence = 0;} # WE FIND A SPECIES THAT WE DON'T EXPECT TO SEE IN THE CLADE. |
|
|
} |
|
|
} |
|
|
|
|
|
return($evidence); |
|
|
} |
|
|
|
|
|
#------------------------------------------------------------------# |