#!/usr/local/bin/perl # # Perl script treefam_genelosses.pl # Written by Avril Coghlan (alc@sanger.ac.uk). # 28-Aug-06. # # For the TreeFam project. # # This perl script connects to the MYSQL database of # TreeFam families and prints out a list of gene losses from # fully sequenced genomes. This script uses Jean-Karim's TreeFam # API. Finds losses in TreeFam-A clean trees (not seed, as misses fully sequenced genomes, # and not full trees, as are sometimes wrong). xxx use TreeFam-B too? # # The command-line format is: # % perl # #------------------------------------------------------------------# # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: $num_args = $#ARGV + 1; if ($num_args != 0) { print "Usage of treefam_genelosses.pl\n\n"; print "perl -w treefam_genelosses.pl\n"; print "For example, >perl -w treefam_genelosses.pl\n"; exit; } # DECLARE MYSQL USERNAME AND HOST: use strict; use Treefam::DBConnection; my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS. my $dbc = Treefam::DBConnection->new(); #------------------------------------------------------------------# # GET ALL THE TREEFAM-A FAMILIES IN THE CURRENT RELEASE: my $famh = $dbc->get_FamilyHandle(); # GET A FAMILY HANDLE. my @families = $famh->get_all_by_type('A'); # GET ALL TREEFAM-A FAMILIES. my $no_families = 0; foreach my $family(@families) { my $AC = $family->ID(); # GET THE FAMILY ID. $no_families++; print "\n$no_families: reading family $AC...\n"; my $tree = $family->tree('clean'); # GET THE TREEFAM-A CLEAN TREE. xxx look at treefam-b too? # GET THE NAMES OF ALL THE GENES IN THIS TREE: my @leaves = $tree->get_leaf_nodes; foreach my $leaf(@leaves) { my @tags = $leaf->get_all_tags(); my $is_gene_loss = 0; my $loss = "none"; foreach my $tag(@tags) { # THE TAGS ARE: # G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237 # O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2 # S: SPECIES, eg. SCHP0 OR HUMAN # E: GENE LOSS # B: BOOTSTRAP VALUE for my $value ($leaf->get_tag_values($tag)) { if ($tag eq 'E') { $loss = $value; $is_gene_loss = 1; my @temp = split(/\$\-/,$loss); $loss = $temp[1]; if ($loss ne 'HUMAN' && $loss ne 'Homo') { goto NEXT_LEAF;} # xxx JUST LOOKING AT LOSES OF HUMAN GENES WHERE THERE IS A CHIMP GENE. # NOTE THAT IF CHIMP AND HUMAN WERE LOST, THEN $loss WOULD BE 'HUMAN,PANTR'. print "___ $AC: ID ",$leaf->id(),"\n"; # THIS IS SYMBOL."_".SPECIES, eg. CCNC_HUMAN print "___ GENE LOSS $loss (AC $AC)\n"; } } } # IF THIS NODE $leaf->id() HAS A GENE LOSS ASSOCIATED WITH IT, CHECK WHETHER # THE CLADE THAT IT IS IN IS GOOD EVIDENCE FOR THE LOSS: my $parent = 'none'; if ($is_gene_loss == 1) { #---------------------------------------------------------# FIND_PARENT: # FIND THE PARENT NODE OF NODE $leaf: if ($parent eq 'none') { if (defined($leaf->ancestor)) { $parent = $leaf->ancestor; } else { goto ALL_PARENTS_FOUND; } } else # GET THE PARENT NODE OF NODE $parent: { if (defined($parent->ancestor)){ $parent = $parent->ancestor;} else { goto ALL_PARENTS_FOUND; } } if ($VERBOSE == 1) { print "___ PARENT OF CURRENT NODE is", $parent->internal_id(),"\n";} # FIND THE DESCENDENTS OF NODE $parent, AND CHECK IF THE CLADE OF DESCENDENTS # IS GOOD EVIDENCE FOR THE GENE LOSS $loss: my %SPECIES = (); # HASH TABLE TO RECORD SPECIES IN THIS CLADE. for my $descendent ($parent->get_all_Descendents()) { if (defined($descendent->id())) # IF THIS IS A LEAF. { my $id = $descendent->id(); if ($VERBOSE == 1) { print "___ WHICH HAS DESCENDENT $id\n";} my @temp = split(/_/,$id); my $species = $temp[$#temp]; $SPECIES{$species} = 1; } } # CHECK WHETHER THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR THE GENE LOSS $loss: my $evidence_for_loss= &check_evidence_for_loss($loss,\%SPECIES); if ($VERBOSE == 1) { print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";} if ($evidence_for_loss == 1) { goto ALL_PARENTS_FOUND;} # FIND THE PARENT OF NODE $parent: goto FIND_PARENT; #---------------------------------------------------------# ALL_PARENTS_FOUND: if ($VERBOSE == 1) { print "___ ALL PARENTS FOUND\n\n";} # IF THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR GENE LOSS $loss, CHECK IF THERE # ARE ANY OTHER LOSSES IN THAT CLADE THAT MAY MAKE $loss LESS BELIEVABLE: print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n"; if ($evidence_for_loss == 1) { for my $descendent ($parent->get_all_Descendents()) { if (defined($descendent->id())) # IF THIS IS A LEAF. { my $id = $descendent->id(); my ($loss2) = $descendent->get_tag_values('E'); if (defined($loss2)) { my @temp2 = split(/\$-/,$loss2); my $species2= $temp2[1]; print "descendent $id -> loss $loss2\n"; if (($loss eq 'HUMAN' || $loss eq 'Homo') && ($species2 =~ /PANTR/ || $species2 =~ /Pan/)) # xxx add other cases { $evidence_for_loss = 0; } } } } } print "___ FINAL EVIDENCE FOR LOSS IN $loss: now $evidence_for_loss\n"; #---------------------------------------------------------# } NEXT_LEAF: # xxx } } print "\n\n"; print STDERR "FINISHED.\n"; #------------------------------------------------------------------# # SUBROUTINE TO CHECK WHETHER A CLADE OF GENES IS GOOD EVIDENCE FOR A GENE LOSS: sub check_evidence_for_loss { my $loss = $_[0]; # THE SPECIES IN WHICH THE PUTATIVE GENE LOSS OCCURRED. my $SPECIES = $_[1]; # POINTER TO HASH TABLE OF SPECIES IN THE CLADE THAT SPECIES $species IS MISSING FROM. my $evidence = 1; # SAYS WHETHER THE EVIDENCE FOR GENE LOSS IS STRONG. my @all = ('HUMAN','PANTR','MACMU','BOVIN','CANFA','MOUSE','RAT','MONDO', 'CHICK','XENTR','TETNG','FUGRU','BRARE','CIOIN','DROME','DROPS', 'CAEEL','CAEBR','CAERE','APIME','ANOGA','YEAST','SCHPO','ARATH','SCHMA'); # ALL THE FULLY SEQUENCED SPECIES. my @expect; # THE SPECIES THAT WE EXPECT TO SEE IN THE CLADE. my $i; # my %EXPECT = (); # my $expecti; # my @expecti; # my $j; # my $found_expecti = 0; # my $loss2; # # CHECK WHETHER THERE IS STRONG EVIDENCE FOR A GENE LOSS: # xxx ADD MORE CASES # FOR EXAMPLE, WE CAN BE SURE OF A HUMAN GENE LOSS IF THE CLADE CONTAINS AT LEAST # ONE CHIMP OR ONE MACAQUE, AT LEAST ONE RAT OR ONE MOUSE, AT LEAST ONE COW, AND # AT LEAST ONE DOG GENE: if ($loss eq 'HUMAN') { @expect = ('PANTR','MACMU','MOUSE/RAT','BOVIN/CANFA'); } # CHECK THAT WE SEE ALL THE SPECIES THAT WE EXPECT IN THE CLADE, AND NO OTHER SPECIES: for ($i = 0; $i <= $#expect; $i++) { $expecti = $expect[$i]; @expecti = split(/\//,$expecti); $found_expecti = 0; # CHECK WHETHER WE SEE AT LEAST ONE OF PANTR/MACMU FOR EXAMPLE: for ($j = 0; $j <= $#expecti; $j++) { $EXPECT{$expecti[$j]} = 1; if ($SPECIES->{$expecti[$j]}) { $found_expecti = 1;} } if ($found_expecti == 0) { $evidence = 0;} # WE DON'T FIND A SPECIES THAT WE EXPECT TO SEE IN THE CLADE. } # CHECK IF WE SEE ANY SPECIES IN THE CLADE THAT WE DON'T EXPECT TO SEE: for ($i = 0; $i <= $#all; $i++) { if (!($EXPECT{$all[$i]}) && $all[$i] ne $loss) { if ($SPECIES->{$all[$i]}) { $evidence = 0;} # WE FIND A SPECIES THAT WE DON'T EXPECT TO SEE IN THE CLADE. } } return($evidence); } #------------------------------------------------------------------#