# This is not presently all encompassing as it was started well after my sequence work repo # at https://github.com/fomightez/sequencework , where much of this related code is. # For making FASTA files/entriees out of dataframes, see 'specific dataframe contents saved as formatted text file example' # in my useful pandas snippets gist https://gist.github.com/fomightez/ef57387b5d23106fabd4e02dab6819b4 # For reading in FASTA files/entries # -See `mine_mito_features` function in `mine_mito_features_from_transcriptome.py` # https://github.com/fomightez/sequencework/blob/master/Extract_Details_or_Annotation/mine_mito_features_from_transcriptome.py # -See `get_fasta_seq` and `get_seq_from_URL` functions in `find_sequence_element_occurrences_in_sequence.py` in # the https://github.com/fomightez/sequencework/tree/master/FindSequence repo # (direct url: https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py ) # From `patmatch-binder`, `specifically PatMatch nucleic handling flags demystified.ipynb`, and # highly based on similar handling in `find_sequence_element_occurrences_in_sequence.py` # (https://github.com/fomightez/sequencework/blob/master/FindSequence/find_sequence_element_occurrences_in_sequence.py) & # now adapted into a proper (slightly better) script with command line provideable arguments at # https://github.com/fomightez/sequencework/blob/master/ConvertSeq/convert_fasta_to_reverse_complement.py : ## READ IN FASTA FILE AND CONVERT TO REVERSE COMPLEMENT USING BIOPYTHON ** from Bio import SeqIO from Bio.Seq import Seq # for reverse complement def get_seq_from_file(file_name): ''' takes a file name and gets the sequence RECORD. ''' fasta_iterator = SeqIO.parse(file_name, "fasta") # Use of `next()` on next line to get first FASTA -formatted sequence is # based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html # I think difference from `SeqIO.read()` in this approach is that it won't # give an error if more than one entry is in the file. record = next(fasta_iterator) #return record.seq #return record.id return record # Read FASTA file input_file_name = "chr15.fsa" sequence_record = get_seq_from_file(input_file_name) #print(sequence_record) #FOR DEBUGGING # Read multi FASTA file ((in other words single FASTA file with multiple sequences/records multiFASTA) records = [] for record in SeqIO.parse("TAIR9_pep_20090619.fa", "fasta"): records.append(record) thale_cress_prot_length = [] for record in records: #print (len(record.seq)) thale_cress_prot_length.append(len(record.seq)) # Get reverse complement seq_rev_compl_record = sequence_record.reverse_complement() seq_rev_compl_record.id = sequence_record.id #record needs id for writing FASTA # or even easier, see https://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html#reverse_complement # and use argruments to keep id and description when reverse complementing seq_rev_compl_record = sequence_record.reverse_complement(id=True,description=True) #print(seq_rev_compl_record) #FOR DEBUGGING # Save FASTA file for reverse complement output_file_name = "chr15.revcompl.fa" SeqIO.write(seq_rev_compl_record, output_file_name, "fasta"); # Save multi fasta (in other words single FASTA file with multiple sequences/records) SeqIO.write(records,file_name, "fasta"); # based on https://www.biostars.org/p/48797/#48801 "If that's the case then # note SeqIO.write() can take a list or a generator of SeqRecords so you should pass one of those" # related to above , but dealing with URL source or file too: def get_seq_from_URL(url): ''' takes a URL and gets the sequence ''' try: from StringIO import StringIO except ImportError: from io import StringIO # Getting html originally for just Python 3, adapted from # https://stackoverflow.com/a/17510727/8508004 and then updated from to # handle Python 2 and 3 according to same link. try: # For Python 3.0 and later from urllib.request import urlopen except ImportError: # Fall back to Python 2's urllib2 from urllib2 import urlopen html = urlopen(url) fasta_iterator = SeqIO.parse(StringIO( html.read().decode(encoding='UTF-8')), "fasta") # Use of `next()` on next line to get first FASTA -formatted sequence is # based on http://biopython.org/DIST/docs/api/Bio.SeqIO-module.html # I think difference from `SeqIO.read()` in this approach is that it won't # give an error if more than one entry is in the html. # I found I needed `StringIO()` or got issues with trying to handle long file name. record = next(fasta_iterator) return record.seq def get_fasta_seq(source): ''' Takes a source URL or filepath/ file name and gets sequence if it is in FASTA format. It won't return anything if what is provided isn't FASTA format because Biopython handles both trying to extract FASTA from URL and from file. See https://stackoverflow.com/a/44294079/8508004. Placing it in a function it easy to then check and provide some feedback. ''' if source.lower().startswith("http"): return get_seq_from_URL(source) else: # Read sequence, treating source as a filepath. # Use of `with` on next line based on http://biopython.org/wiki/SeqIO , # under "Sequence Input". Otherwise, backbone based on # https://www.biostars.org/p/209383/, and fact `rU` mode depecated. with open(source, "r") as handle: for record in SeqIO.parse(handle, "fasta"): # print(record.seq) # for debugging return record.seq # remove gaps and make a string into a SeqRecord that can be written to FASTA with `SeqIO.write(records,file_name, "fasta")` from Bio.Alphabet import generic_dna from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord new_record = SeqRecord(Seq(text_string, generic_dna).ungap("-"), id= id_, description=id_descript))# based # on https://www.biostars.org/p/48797/ and `.ungap()` method, see # https://github.com/biopython/biopython/issues/1511 , and `description` # from what I've seen for `id` plus https://biopython.org/wiki/SeqIO # Note that you can leave out `id=` part and just put the necessary string in that # position in function call because it is a required positoinal argument #If the 'sequence' is already a Bio.Seq.Seq object, it can be made into a record with an id, or optionnally also a description new_record = SeqRecord(seq_obj, "XS56782")) #-or- new_record = SeqRecord(seq_obj, id= "XS56782", description=id_descript)) #note that a `record.decscription` will also contain the `record.id`; can be removed with `description_wo_id=record.description.split(record.id,1)[1]` so it won't show up twice # Editing description line of FASTA from within a jupyter notebooks cell # add identifiers to each `chr` so results for each strain clear later chromosome_id_prefix = "chr" def add_strain_id_to_description_line(file,strain_id): ''' Takes a file and edits every description line to add strain_id after the caret. Saves the fixed file ''' import sys output_file_name = "temp.txt" # prepare output file for saving so it will be open and ready with open(output_file_name, 'w') as output_file: # read in the input file with open(file, 'r') as input_handler: # prepare to give feeback later or allow skipping to certain start lines_processed = 0 for line in input_handler: lines_processed += 1 if line.startswith(">"): rest_o_line = line.split(">") new_line = ">"+strain_id + rest_o_line[1] else: new_line = line # Send text to output output_file.write(new_line) # replace the original file with edited !mv temp.txt {file} # Feedback sys.stderr.write("\n{} chromosome identifiers tagged.".format(file)) for s in yue_et_al_strains: add_strain_id_to_description_line(s+".genome.fa",s) #function to get description line text from a FASTA file def get_descriptionline(filename): ''' Takes the name of file in FASTA format and gets the first description line, i.e., the text after the caret on the first line. Returns the text after the '>' symbol. ''' import sys output_file_name = "temp.txt" # prepare output file for saving so it will be open and ready with open(output_file_name, 'w') as output_file: # read in the input file with open(filename, 'r') as input_handler: # prepare to give feeback later or allow skipping to certain start lines_processed = 0 for line in input_handler: lines_processed += 1 if lines_process == 1 and line.startswith(">"): return line.split(">")[0].strip() # save the protein sequence as FASTA chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to # multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does) # Note that I generalized the chunking to a function that can handle any type of # residue in `get_seq_from_multiFASTA_with_match_in_description.py` prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range( 0, len(prot_seq),chunk_size)] prot_seq_fa = ">" + prot_descr + "\n"+ "\n".join(prot_seq_chunks) p_output_file_name = strain_id +"_" + gene_name + "_protein_ortholog.fa" with open(p_output_file_name, 'w') as output: output.write(prot_seq_fa) prot_seqs_fn_list.append(p_output_file_name) sys.stderr.write("\n\nProtein sequence saved as " "`{}`.".format(p_output_file_name)) # above code made into function for working with pyfaidx; used in `blast-binder` repo to make it easier for users # who already had a multi-FASTA sequence file of sequeneces of interest to plug into a workflow at a later # point def seq_to_file(prot_seq, description, name_suffix_to_use): ''' function to save a sequence as a file in FASTA-format ''' # save the protein sequence as FASTA chunk_size = 70 #<---amino acids per line to have in FASTA (note this chunking to # multiple lines is the opposite of what PatMatch's `unjustify_fasta.pl` does) # Note that I generalized the chunking to a function that can handle any type of # residue in `get_seq_from_multiFASTA_with_match_in_description.py` prot_seq_chunks = [prot_seq[i:i+chunk_size] for i in range( 0, len(prot_seq),chunk_size)] prot_seq_fa = ">" + description + "\n"+ "\n".join(prot_seq_chunks) p_output_file_name = description.split()[0] + name_suffix_to_use with open(p_output_file_name, 'w') as output: output.write(prot_seq_fa) sys.stderr.write("\n\nProtein sequence saved as " "`{}`.".format(p_output_file_name)) from pyfaidx import Fasta records = Fasta(sequence) for seq in records: # make individual file (also see `!faidx --split-files {seq_file}` below) seq_to_file(str(seq), seq.long_name, "_protein_ortholog.fa") # I am using `seq.long_name` here because it give more options for adapting the code to make a file name one prefers; however, in developing some other code I became aware that if the FASTA files are non-standard and have an empty line above the description line, that `seq.long_name` will included the caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name, just change here to `seq.name` # do SOMETHING WITH THAT FILE ''' BLOCK TO DO SOMETHING HERE ''' # One metric to assess is count all the letters present (because shows number of Ns too) # [BASIC VERSION] from pyfaidx import Fasta import collections for g in genomes: concatenated_seqs = "" chrs = Fasta(g) for x in chrs: #print(x.name) concatenated_seqs += str(x) print(g,":",collections.Counter(concatenated_seqs)) # ['EXPANDED' VERSION] from pyfaidx import Fasta import pandas as pd import collections nt_counts = {} for g in genomes: strain_id = g.split(".genome.fa")[0] concatenated_seqs = "" chrs = Fasta(g) for x in chrs: #print(x.name) concatenated_seqs += str(x) nt_counts[strain_id] = collections.Counter(concatenated_seqs) nt_count_df = pd.DataFrame.from_dict(nt_counts, orient='index').fillna(0) nt_count_df["Total_nts"] = nt_count_df.sum(1) def percent_calc(items): ''' takes a list of two items and calculates percentage of first item within total (second item) ''' return items[0]/items[1] nt_count_df['% N'] = nt_count_df[['N','Total_nts']].apply(percent_calc, axis=1) nt_count_df = nt_count_df.style.format({'Total_nts':'{:.2E}','% N':'{:.2%}'}) # Alternatives for handling description lines, i.e.,lines before sequence beginning with `>`, with pyfaidx: # By default, `.name` attribute just goes to first encountered space in entry description line. You can access full # description with `.long_name` attribute. <===NOTE while developing some other code I became aware that if the FASTA # files are non-standard and have an empty line above the description line, that `seq.long_name` will included the # caret symbol whereas `seq.name` won't for some reason. So if the less-than-symbol is getting added into name, # it may be easiest to just change to `seq.name` or strip off `>` if the starts with `>` because the first one won't have it. from pyfaidx import Fasta genes = Fasta('tests/data/genes.fasta') for record in genes: print(record.name) print(record.long_name) # new in v0.4.9, you can specify the `.name` attribute be the full description line if you specify `read_long_names=True`. from pyfaidx import Fasta genes = Fasta('tests/data/genes.fasta', read_long_names=True) for record in genes: print(record.name) # You can also filter out based on the contents of the description line using 'filt_function'. The have # simple example in the documentation now that filters on first letter in that line. Below is fancier # where filters records where description line ends in patterns like '0-0' or '4182-4182'. seqrecords = Fasta(fn, read_long_names=True, filt_function = lambda x: x.split( "extracted_coordinates: ")[1].split("-")[0] != x.split( "extracted_coordinates: ")[1].split("-")[1]) # Note that in this case I had details on the sequence in the description line. This won't always # be the case so I could just use an if statement after that to conditionally skip those where # sequence in the record isn't greater than 1, example `if len(record) > 1:`. I wanted to avoid # adding yet another if conditional to the processing I was doing where I chose to use `filt_function` to # filter. It seems I could also interate on the `Fasta` # object only collecting to a new list the records where the length of the sequence was greater than 1 and then # iterate over that list of records. Those should be fine for iterating even if I haven't figured out # how to cast the list of records (each ``) back to the the `class 'pyfaidx.Fasta'` # count frequency of blocks of Ns in a string (presumably sequence) import re from collections import defaultdict t = "NaaNNNhcTCaaNANANDANNNNNNAANNANNANNNNNNNNANANANNANNNNNN" matches = [] len_match_dict = defaultdict(int) min_number_Ns_in_row_to_collect = 1 pattern_obj = re.compile("N{{{},}}".format(min_number_Ns_in_row_to_collect), re.I) # adpated from # code worked out in `collapse_large_unknown_blocks_in_DNA_sequence.py`, which relied heavily on # https://stackoverflow.com/a/250306/8508004 for m in pattern_obj.finditer(t): len_match_dict[len(m.group())] += 1 matches.append(m.group()) print(len_match_dict) print(collections.Counter(matches)) # Split a file using pyfaidx command line utility that is available in Jupyter when install pyfaidx module (note this may not split every fasta entry in a multifasta because it seems to keep regions together, like faSplit seems to maybe do, too. See https://github.com/fomightez/Fasta2Structure-cli/blob/4bbc18dff4d487ca3eeaeaad9511a1767a6eea4d/binder/postBuild) seq_file = "SGDs288CplusPacBio_ADJUSTEDplusWoltersnW303forALIGNERS.fa" !faidx --split-files {seq_file} # Package up a lot of sequence files (FASTA) into one archive for easy downloading , PLUS DEMONSTRATES # merging a lot of inidvidual sequence files into one FASTA file (for Jupyter session) #Archive the sequences (FASTA format) collected and any dataframes made # Pickle each dataframe and also save as `tsv` for possible use elsewhere strd_dataframes_fn_list = [] def pickle_df_and_store_as_table(dataframe, prefix): ''' Take a dataframe and a filename prefix and save a pickled form of that dataframe and a text tablular data version (tab-sepearated values). Returns the name of the pickled and text file. ''' dataframe.to_pickle(prefix + ".pkl") dataframe.to_csv(prefix + ".tsv", sep='\t',index = False) return prefix + ".pkl", prefix + ".tsv" # To automate the dataframe handling, make a dictionary for each dataframe name string as key and filename prefix # associated as the value df_n_fn_dict = { "CTD_seq_of_protein_orthologs": CTD_seq_df, "first_heptad_of_protein_orthologs": first_7_df, "heptads_ofCTD_seq_of_protein_orthologs": repeat_df, "main_heptads_ofCTD_seq_of_protein_orthologs": repeat_wo_first_df, "fraction_matching_consensus_per_CTD": fraction_consensus_df, } import pandas as pd for prefix, dataframe in df_n_fn_dict.items(): #pkl_fn, text_table_fn = pickle_df_and_store_as_table(dataframe, prefix) strd_dataframes_fn_list.extend(pickle_df_and_store_as_table(dataframe, prefix)) # store `CTD_seqs_fn_list` as json since lighter-weight and more portable than pickling CTD_seqs_fn_list_storedfn = "CTD_seqs_fn_list.json" import json with open(CTD_seqs_fn_list_storedfn, 'w') as f: json.dump(CTD_seqs_fn_list, f) #for ease in aligning or other uses later save the all the CTDs as a concatenated file cat_fasta_fn = "CTD_seq_of_protein_orthologs.fa" !cat {" ".join(CTD_seqs_fn_list)} > {cat_fasta_fn} archiving_fn_list = CTD_seqs_fn_list + strd_dataframes_fn_list + [CTD_seqs_fn_list_storedfn , cat_fasta_fn] archive_file_name = gene_name+"_orthologs_extracted_CTDs.tar.gz" !tar czf {archive_file_name} {" ".join(archiving_fn_list)} # use the list for archiving command sys.stderr.write("\nCollected CTD sequences" " and tables of details gathered and saved as " "`{}`.".format(archive_file_name)) # Generate / create/ make multi-record FASTA file (multiFASTA. i.e., single FASTA file with multiple sequence entries) with mock / fake / simulate/ random sequence #!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/nullseq.py #!curl -O https://raw.githubusercontent.com/amarallab/NullSeq/master/NullSeq_Functions.py #%run nullseq.py -l 436 !pip install fire !curl -O https://raw.githubusercontent.com/mauriceling/bactome/master/randomseq.py #!python randomseq.py FLS -- --help #???<-- That didn't match typical Python help syntax I was familar with but it is noted at https://github.com/google/python-fire !python randomseq.py FLS --length=706 --n=3 > mock_seqs.fa # store FASTA seqeunces as strings within Python scripts but get the sequence back for length # or sequence as a string def stringFASTA2seq(s): ''' Takes a FASTA file contents that are currently as a Python string and converts it to the sequence string. In other words, it removes the description line and the line breaks and just makes a sequence string. ''' l = s.split("\n") return "".join(l[1:]) example_sequence = '''>OMEGAminusHEG S.cerevisiae omega with homing endonuclease gene and associated DNA removedSPECULATIVE ATTTACCCCCTTGTCCCATTATATTGAAAAATATAATTATTCAATTAATTATTTAATTGA AGTAAATTGGGTGAATTGCTTAGATATCCATATAGATAAAAATAATGGACAATAAGCAGC GAAGCTTATAACAACTTTCATATATGTATATATACGGTTATAAGAACGTTCAACGACTAG ATGATGAGTGGAGTTAACAATAATTCATCCACGAGCGCCCAATGTCGAATAAATAAAATA TTAAATAAATTTTTATTTGATAATGATATAGTCTGAACAATATAGTAATATATTGAAGTA ATTATTTAAATGTAATTACGATAACAAAAAATTTGA ''' length_example_sequence = len(stringFASTA2seq(example_sequence)) # see my file `about parsing fasta formatted sequence data in biopython.md`