Skip to content

Instantly share code, notes, and snippets.

@AswinSSoman
Forked from avrilcoghlan/treefam_gene_losses.pl
Created November 10, 2022 06:42
Show Gist options
  • Save AswinSSoman/b54aa248a8d43b31c047c19313f99d53 to your computer and use it in GitHub Desktop.
Save AswinSSoman/b54aa248a8d43b31c047c19313f99d53 to your computer and use it in GitHub Desktop.

Revisions

  1. @avrilcoghlan avrilcoghlan created this gist Mar 1, 2013.
    215 changes: 215 additions & 0 deletions treefam_gene_losses.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,215 @@
    #!/usr/local/bin/perl

    #
    # Perl script treefam_genelosses.pl
    # Written by Avril Coghlan ([email protected]).
    # 28-Aug-06.
    #
    # For the TreeFam project.
    #
    # This perl script connects to the MYSQL database of
    # TreeFam families and prints out a list of gene losses from
    # fully sequenced genomes. This script uses Jean-Karim's TreeFam
    # API. Finds losses in TreeFam-A clean trees (not seed, as misses fully sequenced genomes,
    # and not full trees, as are sometimes wrong). xxx use TreeFam-B too?
    #
    # The command-line format is:
    # % perl <treefam_genelosses.pl>
    #
    #------------------------------------------------------------------#

    # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

    $num_args = $#ARGV + 1;
    if ($num_args != 0)
    {
    print "Usage of treefam_genelosses.pl\n\n";
    print "perl -w treefam_genelosses.pl\n";
    print "For example, >perl -w treefam_genelosses.pl\n";
    exit;
    }

    # DECLARE MYSQL USERNAME AND HOST:

    use strict;
    use Treefam::DBConnection;

    my $VERBOSE = 0; # IF $VERBOSE==1, PRINT OUT EXTRA DETAILS.
    my $dbc = Treefam::DBConnection->new();

    #------------------------------------------------------------------#

    # GET ALL THE TREEFAM-A FAMILIES IN THE CURRENT RELEASE:
    my $famh = $dbc->get_FamilyHandle(); # GET A FAMILY HANDLE.
    my @families = $famh->get_all_by_type('A'); # GET ALL TREEFAM-A FAMILIES.
    my $no_families = 0;
    foreach my $family(@families)
    {
    my $AC = $family->ID(); # GET THE FAMILY ID.
    $no_families++;
    print "\n$no_families: reading family $AC...\n";
    my $tree = $family->tree('clean'); # GET THE TREEFAM-A CLEAN TREE. xxx look at treefam-b too?
    # GET THE NAMES OF ALL THE GENES IN THIS TREE:
    my @leaves = $tree->get_leaf_nodes;
    foreach my $leaf(@leaves)
    {
    my @tags = $leaf->get_all_tags();
    my $is_gene_loss = 0;
    my $loss = "none";
    foreach my $tag(@tags)
    {
    # THE TAGS ARE:
    # G: GENE ID, eg. SPBC12D12.06 OR ENSG00000112237
    # O: TREEFAM ID. eg. SPBC12D12.06 OR ENST00000326298.2
    # S: SPECIES, eg. SCHP0 OR HUMAN
    # E: GENE LOSS
    # B: BOOTSTRAP VALUE
    for my $value ($leaf->get_tag_values($tag))
    {
    if ($tag eq 'E')
    {
    $loss = $value;
    $is_gene_loss = 1;
    my @temp = split(/\$\-/,$loss);
    $loss = $temp[1];
    if ($loss ne 'HUMAN' && $loss ne 'Homo') { goto NEXT_LEAF;} # xxx JUST LOOKING AT LOSES OF HUMAN GENES WHERE THERE IS A CHIMP GENE.
    # NOTE THAT IF CHIMP AND HUMAN WERE LOST, THEN $loss WOULD BE 'HUMAN,PANTR'.
    print "___ $AC: ID ",$leaf->id(),"\n"; # THIS IS SYMBOL."_".SPECIES, eg. CCNC_HUMAN
    print "___ GENE LOSS $loss (AC $AC)\n";
    }
    }
    }

    # IF THIS NODE $leaf->id() HAS A GENE LOSS ASSOCIATED WITH IT, CHECK WHETHER
    # THE CLADE THAT IT IS IN IS GOOD EVIDENCE FOR THE LOSS:
    my $parent = 'none';
    if ($is_gene_loss == 1)
    {
    #---------------------------------------------------------#
    FIND_PARENT:
    # FIND THE PARENT NODE OF NODE $leaf:
    if ($parent eq 'none')
    {
    if (defined($leaf->ancestor)) { $parent = $leaf->ancestor; }
    else { goto ALL_PARENTS_FOUND; }
    }
    else # GET THE PARENT NODE OF NODE $parent:
    {
    if (defined($parent->ancestor)){ $parent = $parent->ancestor;}
    else { goto ALL_PARENTS_FOUND; }
    }
    if ($VERBOSE == 1) { print "___ PARENT OF CURRENT NODE is", $parent->internal_id(),"\n";}
    # FIND THE DESCENDENTS OF NODE $parent, AND CHECK IF THE CLADE OF DESCENDENTS
    # IS GOOD EVIDENCE FOR THE GENE LOSS $loss:
    my %SPECIES = (); # HASH TABLE TO RECORD SPECIES IN THIS CLADE.
    for my $descendent ($parent->get_all_Descendents())
    {
    if (defined($descendent->id())) # IF THIS IS A LEAF.
    {
    my $id = $descendent->id();
    if ($VERBOSE == 1) { print "___ WHICH HAS DESCENDENT $id\n";}
    my @temp = split(/_/,$id);
    my $species = $temp[$#temp];
    $SPECIES{$species} = 1;
    }
    }
    # CHECK WHETHER THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR THE GENE LOSS $loss:
    my $evidence_for_loss= &check_evidence_for_loss($loss,\%SPECIES);
    if ($VERBOSE == 1) { print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";}
    if ($evidence_for_loss == 1) { goto ALL_PARENTS_FOUND;}
    # FIND THE PARENT OF NODE $parent:
    goto FIND_PARENT;
    #---------------------------------------------------------#

    ALL_PARENTS_FOUND:
    if ($VERBOSE == 1) { print "___ ALL PARENTS FOUND\n\n";}
    # IF THE CLADE OF DESCENDENTS IS GOOD EVIDENCE FOR GENE LOSS $loss, CHECK IF THERE
    # ARE ANY OTHER LOSSES IN THAT CLADE THAT MAY MAKE $loss LESS BELIEVABLE:
    print "___ EVIDENCE FOR LOSS IN $loss: $evidence_for_loss\n";
    if ($evidence_for_loss == 1)
    {
    for my $descendent ($parent->get_all_Descendents())
    {
    if (defined($descendent->id())) # IF THIS IS A LEAF.
    {
    my $id = $descendent->id();
    my ($loss2) = $descendent->get_tag_values('E');
    if (defined($loss2))
    {
    my @temp2 = split(/\$-/,$loss2);
    my $species2= $temp2[1];
    print "descendent $id -> loss $loss2\n";
    if (($loss eq 'HUMAN' || $loss eq 'Homo') && ($species2 =~ /PANTR/ || $species2 =~ /Pan/)) # xxx add other cases
    {
    $evidence_for_loss = 0;
    }
    }
    }
    }
    }
    print "___ FINAL EVIDENCE FOR LOSS IN $loss: now $evidence_for_loss\n";
    #---------------------------------------------------------#
    }
    NEXT_LEAF: # xxx
    }
    }
    print "\n\n";

    print STDERR "FINISHED.\n";


    #------------------------------------------------------------------#

    # SUBROUTINE TO CHECK WHETHER A CLADE OF GENES IS GOOD EVIDENCE FOR A GENE LOSS:

    sub check_evidence_for_loss
    {
    my $loss = $_[0]; # THE SPECIES IN WHICH THE PUTATIVE GENE LOSS OCCURRED.
    my $SPECIES = $_[1]; # POINTER TO HASH TABLE OF SPECIES IN THE CLADE THAT SPECIES $species IS MISSING FROM.
    my $evidence = 1; # SAYS WHETHER THE EVIDENCE FOR GENE LOSS IS STRONG.
    my @all = ('HUMAN','PANTR','MACMU','BOVIN','CANFA','MOUSE','RAT','MONDO',
    'CHICK','XENTR','TETNG','FUGRU','BRARE','CIOIN','DROME','DROPS',
    'CAEEL','CAEBR','CAERE','APIME','ANOGA','YEAST','SCHPO','ARATH','SCHMA');
    # ALL THE FULLY SEQUENCED SPECIES.
    my @expect; # THE SPECIES THAT WE EXPECT TO SEE IN THE CLADE.
    my $i; #
    my %EXPECT = (); #
    my $expecti; #
    my @expecti; #
    my $j; #
    my $found_expecti = 0; #
    my $loss2; #

    # CHECK WHETHER THERE IS STRONG EVIDENCE FOR A GENE LOSS: # xxx ADD MORE CASES
    # FOR EXAMPLE, WE CAN BE SURE OF A HUMAN GENE LOSS IF THE CLADE CONTAINS AT LEAST
    # ONE CHIMP OR ONE MACAQUE, AT LEAST ONE RAT OR ONE MOUSE, AT LEAST ONE COW, AND
    # AT LEAST ONE DOG GENE:
    if ($loss eq 'HUMAN') { @expect = ('PANTR','MACMU','MOUSE/RAT','BOVIN/CANFA'); }

    # CHECK THAT WE SEE ALL THE SPECIES THAT WE EXPECT IN THE CLADE, AND NO OTHER SPECIES:
    for ($i = 0; $i <= $#expect; $i++)
    {
    $expecti = $expect[$i];
    @expecti = split(/\//,$expecti);
    $found_expecti = 0;
    # CHECK WHETHER WE SEE AT LEAST ONE OF PANTR/MACMU FOR EXAMPLE:
    for ($j = 0; $j <= $#expecti; $j++)
    {
    $EXPECT{$expecti[$j]} = 1;
    if ($SPECIES->{$expecti[$j]}) { $found_expecti = 1;}
    }
    if ($found_expecti == 0) { $evidence = 0;} # WE DON'T FIND A SPECIES THAT WE EXPECT TO SEE IN THE CLADE.
    }
    # CHECK IF WE SEE ANY SPECIES IN THE CLADE THAT WE DON'T EXPECT TO SEE:
    for ($i = 0; $i <= $#all; $i++)
    {
    if (!($EXPECT{$all[$i]}) && $all[$i] ne $loss)
    {
    if ($SPECIES->{$all[$i]}) { $evidence = 0;} # WE FIND A SPECIES THAT WE DON'T EXPECT TO SEE IN THE CLADE.
    }
    }

    return($evidence);
    }

    #------------------------------------------------------------------#