Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save avrilcoghlan/883d63f327d34b6bb57d8453d5497cd0 to your computer and use it in GitHub Desktop.

Select an option

Save avrilcoghlan/883d63f327d34b6bb57d8453d5497cd0 to your computer and use it in GitHub Desktop.

Revisions

  1. avrilcoghlan created this gist Mar 4, 2022.
    35 changes: 35 additions & 0 deletions format_blastp_output_for_chembl_singleproteintargetsonly.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,35 @@
    import os
    import sys
    from collections import defaultdict
    import FiftyHG_Chembl

    #====================================================================#

    def main():
        """Reformat every BLAST output file (*.txt2) in the current directory.

        For each file named '<species>.txt2' in the current working directory
        (eg. schistosoma_mansoni.txt2), write '<species>.txt2b' containing only
        the BLAST matches to single-protein ChEMBL targets. Output files that
        already exist are left untouched.

        The UniProt ids of the single-protein targets are read from the file
        'chembl_single_protein_targets_uniprot_ids' in the same directory.

        Raises:
            FileNotFoundError: if there is at least one *.txt2 file to process
                but the single-protein target file is missing.
        """

        suffix = '.txt2'
        mydir = os.getcwd() # the current directory
        singleproteintargetfile = os.path.join(mydir, 'chembl_single_protein_targets_uniprot_ids')

        # the target list is the same for every species, so it is read only once,
        # and only if there is at least one blast output file to process:
        singleproteintargets = None

        for myfile in os.listdir(mydir):
            if not myfile.endswith(suffix): # eg. schistosoma_mansoni.txt2
                continue

            # find the species name (the text before the first '.txt2'):
            species = myfile.partition(suffix)[0] # eg. schistosoma_mansoni

            # read in the list of uniprot ids in chembl single-protein targets:
            if singleproteintargets is None:
                # raise explicitly rather than 'assert', which is stripped under 'python -O':
                if not os.path.exists(singleproteintargetfile):
                    raise FileNotFoundError('Cannot find single-protein target file %s' % singleproteintargetfile)
                # per the original comment, this returns a set of uniprot ids in targets:
                singleproteintargets = FiftyHG_Chembl.read_single_protein_targets(singleproteintargetfile)

            # parse and format this blast output file, to just take the blast matches to single-protein chembl targets:
            input_file = os.path.join(mydir, myfile)
            output_file = '%sb' % input_file
            # NOTE(review): this message is printed even when the output file already exists and is skipped.
            print('Making file',output_file)
            if not os.path.exists(output_file):
                FiftyHG_Chembl.reformat_blast_output_singleproteintargetsonly(input_file,species,singleproteintargets,output_file)

    #====================================================================#

    # run the reformatting only when this file is executed as a script, not when it is imported:
    if __name__ == "__main__":
        main()

    #====================================================================#