#PyPI is the module repository in Python (the equivalent to CPAN in Perl) // *Initializing 2 variables at the same time: a=b=0 or a=b=0.0 // #Python shebang: #!/usr/bin/env python (for python 2.7 latest) #!/usr/bin/env python3 (for python 3.latest) # Python style guide: https://www.python.org/dev/peps/pep-0008/ *Class Names: Class names should normally use the CapWords convention *Function Names: Function names should be lowercase, with words separated by underscores as necessary to improve readability. *Function params: The most followed convention is the one used in NumPy (described at https://github.com/numpy/numpy/blob/master/doc/HOWTO_DOCUMENT.rst.txt) Parameters ---------- x : type Description of parameter `x`. y Description of parameter `y` (with type not specified) #to print print "hello"; / #python naming conventions -Class names start with an uppercase letter. All other identifiers start with a lowercase letter. -Starting an identifier with a single leading underscore indicates that the identifier is private. - Starting an identifier with two leading underscores indicates a strongly private identifier. - If the identifier also ends with two trailing underscores, the identifier is a language-defined special name. #assign values to variables counter = 100 # An integer assignment miles = 1000.0 # A floating point name = "John" # A string // #Object introspection: my_list = [1, 2, 3] dir(my_list) #gives us the names of all the methods of a list. This can be handy when you are not able to recall a method name. / #using type: print(type('')) # Output: <type 'str'> (<class 'str'> in Python 3) / #Using Inspect: import inspect print(inspect.getmembers(str)) #multiple assignment a = b = c = 1 a, b, c = 1, 2, "john" #increment operator a+=2 #print variables print counter print miles print name #Standard Data Types Numbers String List Tuple Dictionary / #knowing the data type of some object: type(whatever) #Python strings: str = 'Hello World!'
print str[0]; #print H print str[1:3]; #print el print str[2:]; #print llo World! (everything from the 3rd character) print str*2; #prints Hello World!Hello World! print str1+str2 #concatenating 2 strings #concatenating a string and a number, where b is the number: str=a+str(b) #python lists: list = [ 'abcd', 786 , 2.23, 'john', 70.2 ] print list # Prints complete list print len(list) #print length of list print list[0] # Prints first element of the list print list[1:3] # Prints elements from the 2nd to the 3rd print list[2:] # Prints elements starting from 3rd element print tinylist * 2 # Prints list two times print list + tinylist # Prints concatenated lists #adding a prefix to each element on a list: alist = ['foo','spam', 'bar'] prefix='pref' newlist=[prefix+elt for elt in alist] #print join a list mylist = ['spam', 'ham', 'eggs'] print ','.join(mylist) *Remove an element's first occurrence in a list >>> a = ['a', 'b', 'c', 'd'] >>> a.remove('b') >>> print a ['a', 'c', 'd'] #remove duplicated elements from list mylist = ['spam', 'ham', 'eggs'] set(mylist) // #sets in python. #read list of words in a file into a set: my_set = set(open('all_runs.ega.txt')) / #this will contain a trailing \n, so we need to do the following: set(line.strip() for line in open('filename.txt')) // #A good way of checking which elements are in one list and not in the other, and vice versa: To find the intersection (items that are in both sets): >>> a = set([1, 2, 3, 4, 5, 6]) >>> b = set([4, 5, 6, 7, 8, 9]) >>> a & b set([4, 5, 6]) To find the difference (items that are only in one set): >>> a = set([1, 2, 3, 4, 5, 6]) >>> b = set([4, 5, 6, 7, 8, 9]) >>> a - b set([1, 2, 3]) >>> b - a set([7, 8, 9]) To find the symmetric difference (items that are in one or the other, but not both): >>> a = set([1, 2, 3, 4, 5, 6]) >>> b = set([4, 5, 6, 7, 8, 9]) >>> a ^ b set([1, 2, 3, 7, 8, 9]) / *Adding elements to a set >>> a.add(7) // #python tuples: A tuple is another sequence data type that is similar to the list.
A tuple consists of a number of values separated by commas. Unlike lists, however, tuples are enclosed within parentheses. Tuples can be thought of as read-only lists. tuple = ( 'abcd', 786 , 2.23, 'john', 70.2 ) tinytuple = (123, 'john') print tuple # Prints complete tuple print tuple[0] # Prints first element of the tuple print tuple[1:3] # Prints elements from the 2nd to the 3rd print tuple[2:] # Prints elements starting from 3rd element print tinytuple * 2 # Prints tuple two times print tuple + tinytuple # Prints concatenated tuples // #list of tuples my_list = [ ('a', 1), ('b', 2), ('c', 3), ('d', 4)] // #iterating over list of tuples: for j,k in my_list: ... print j ... print k / *named tuples: from collections import namedtuple Point = namedtuple('Point', 'x y') pt1 = Point(1.0, 5.0) pt2 = Point(2.5, 1.5) from math import sqrt line_length = sqrt((pt1.x-pt2.x)**2 + (pt1.y-pt2.y)**2) // #Python Dictionary: Python's dictionaries are a kind of hash table. They work like associative arrays or hashes found in Perl and consist of key-value pairs.
tinydict = {'name': 'john','code':6734, 'dept': 'sales'} print dict['one'] # Prints value for 'one' key print dict[2] # Prints value for 2 key print tinydict # Prints complete dictionary print tinydict.keys() # Prints all the keys print tinydict.values() # Prints all the values // #compare 2 dictionaries Make Two Dictionaries importers = {'El Salvador' : 1234, 'Nicaragua' : 152, 'Spain' : 252 } exporters = {'Spain' : 252, 'Germany' : 251, 'Italy' : 1563 } Find Duplicate Keys # Find the intersection of importers and exporters importers.keys() & exporters.keys() {'Spain'} Find Difference In Keys # Find the difference between importers and exporters importers.keys() - exporters.keys() {'El Salvador', 'Nicaragua'} Find Key, Values Pairs In Common # Find countries where the amount of exports matches the amount of imports importers.items() & exporters.items() {('Spain', 252)} // # Merge 2 dictionaries (dicts) >>> x = {'a': 1, 'b': 2} >>> y = {'b': 3, 'c': 4} >>> z = {**x, **y} >>> z {'c': 4, 'a': 1, 'b': 3} #When there are 2 overlapping keys then the right-hand key has precedence // #if if expression: statement(s) #else if expression: statement(s) else: statement(s) #elif if expression1: statement(s) elif expression2: statement(s) elif expression3: statement(s) else: statement(s) #and operator if (expression1 and expression2): statement(s) #while loop while expression: statement(s) #for statement for iterating_var in sequence: statements(s) ex: l=[1,2,3,4,5,6] for i in l: print i ex: for i in 'caca': print i #break statement for letter in 'Python': # First Example if letter == 'h': break print 'Current Letter :', letter var = 10 # Second Example while var > 0: print 'Current variable value :', var var = var -1 if var == 5: break print "Good bye!" 
#continue statement for letter in 'Python': # First Example if letter == 'h': continue print 'Current Letter :', letter var = 10 # Second Example while var > 0: print 'Current variable value :', var var = var -1 if var == 5: continue print "Good bye!" // ''' Iterate over the list using while loop ''' i = 0 sizeofList = len(wordList) while i < sizeofList : print(wordList[i]) i += 1 // #defining a function and calling function #!/usr/bin/python # Function definition is here def printme( str ): "This prints a passed string into this function" print str; return; # Now you can call printme function printme("I'm first call to user defined function!"); printme("Again second call to the same function"); #opening a file for writing >>>f = open('workfile', 'w') >>>print f #opening a file for reading and printing lines f=open('/Users/ernesto/supercont1.1v4.gff','r'); for line in f: print line, // # open a file if provided, if not then write to STDOUT import sys import contextlib @contextlib.contextmanager def smart_open(filename=None): if filename and filename != '-': fh = open(filename, 'w') else: fh = sys.stdout try: yield fh finally: if fh is not sys.stdout: fh.close() Use it like this: # writes to some_file with smart_open('some_file') as fh: print >>fh, 'some output' # writes to stdout with smart_open() as fh: print >>fh, 'some output' # writes to stdout with smart_open('-') as fh: print >>fh, 'some output' // #opening a file and writing something f=open('text.txt','w'); f.write("hello"); f.close(); / #adding a newline: f.write(your_string+"\n"); / #append a text f=open('text.txt','a'); f.write("hello"); f.close(); #create a dir import os; os.mkdir("newdir"); #change into a dir os.chdir("newdir"); #split a string >>> str="hello,bye" >>> str.split(',') ['hello', 'bye'] #split string into elms using tab separators elms=line.split("\t") #replace text in a string >>> s = '100 NORTH MAIN ROAD' >>> s.replace('O','U') '100 NURTH MAIN RUAD' #regex #1st ex: import re; m =
re.search("(\d+)","hello 12 bye caca") if m: print m.groups()[0] >>>12 #combining a pattern and a variable m1=re.search(r'>+%s' %variablename,line) #finding all occurrences of a pattern >>> import re; >>> p = re.compile('\d+'); >>> p.findall('12 drummers drumming, 11 pipers piping, 10 lords a-leaping') ['12', '11', '10'] #finding all declaring the pattern in the same line >>> import re; >>> re.findall('\d+','12 drummers drumming, 11 pipers piping, 10 lords a-leaping') #checking if a string is empty if not myString: do something.... #checking if a string starts with '>' import re; p = re.compile( '^>' ) m = p.match( '>hello' ) if m: print 'Match found: ', m.group() else: print 'No match' >>>Match found: > Another simpler way: m=re.search("^>",line) if m: print 'Match found: ', m.group() #matching more than one pattern on the same string m=re.search("^>|^#",line) if m: print 'Match found: ', m.group() #detecting any non-ACGT base in a DNA sequence: import re dna = "ATCGCGAZZZTTCAA" if re.search(r"[^ATGC]", dna): print("non-ACGT base found!") // #Counting longest occurrence of repeated sequence in Python import re my_str = "abcdefgfaabbbffbbbbbbfgbb" length=len(max(re.compile("(b+b)*").findall(my_str))) print(length) / #negating a specific pattern patt = re.compile('(?!_NON_REF)_GENOTYPE_CONCORDANCE' ) // #exit from a program sys.exit(0) #accessing the command line args import sys print 'Number of arguments:', len(sys.argv), 'arguments.'
print sys.argv[0] #print script name print sys.argv[1] #printing first arg #exiting if number of args is incorrect if (len(sys.argv)<3): sys.exit("[USAGE] python test_1.py ") // #most efficient way of parsing command line args: Good tutorial at: https://mkaz.tech/python-argparse-cookbook.html / #First example using a single verbose import sys import argparse parser = argparse.ArgumentParser(description='Demo') parser.add_argument('--verbose', action='store_true', help='verbose flag' ) args = parser.parse_args() if args.verbose: print("~ Verbose!") else: print("~ Not so verbose") #This is run by: $ python test.py ~ Not so verbose $ python test.py --verbose ~ Verbose! #if you run it with python test.py --help #you will get: usage: generate_expPer_donor.py [-h] [--verbose] Demo optional arguments: -h, --help show this help message and exit --verbose verbose flag / #required arg: parser = argparse.ArgumentParser() parser.add_argument('--limit', required=True, type=int) args = parser.parse_args() / #Parsing a file argument: parser = argparse.ArgumentParser() parser.add_argument('f', type=argparse.FileType('r')) args = parser.parse_args() for line in args.f: print( line.strip() ) // #validating arguments def check_positive(value): ivalue = int(value) if ivalue <= 0: raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value) return ivalue parser = argparse.ArgumentParser(...) parser.add_argument('foo', type=check_positive) // #adding a default value for an option: parser.add_argument('--limit', default=5, type=int) // #removing \n in python: 'test string \n'.rstrip('\n') or line=line.rstrip('\n') // #using a list as an option for argparse: parser.add_argument('-l','--list', nargs='+', help=' Set flag', required=True) # Use like: # python arg.py -l 1234 2345 3456 4567 # TL;DR Use the nargs option or the 'append' setting of the action option (depending on how you want the user interface to behave). 
nargs parser.add_argument('-l','--list', nargs='+', help=' Set flag', required=True) # Use like: # python arg.py -l 1234 2345 3456 4567 nargs='+' takes 1 or more arguments, nargs='*' takes zero or more. // #OOP in Python // A Python file is called a "module" and it's one way to organize your software so that it makes "sense". Another is a directory, called a "package". A module is a distinct thing that may have one or two dozen closely-related classes. The trick is that a module is something you'll import, and you need that import to be perfectly sensible to people who will read, maintain and extend your software. The rule is this: a module is the unit of reuse. // #!/usr/bin/python class Employee: 'Common base class for all employees' empCount = 0 def __init__(self, name, salary): self.name = name self.salary = salary Employee.empCount += 1 def displayCount(self): print "Total Employee %d" % Employee.empCount def displayEmployee(self): print "Name : ", self.name, ", Salary: ", self.salary "This would create first object of Employee class" emp1 = Employee("Zara", 2000) "This would create second object of Employee class" emp2 = Employee("Manni", 5000) emp1.displayEmployee() emp2.displayEmployee() print "Total Employee %d" % Employee.empCount / #Inheritance: class MinimumBalanceAccount(BankAccount): def __init__(self, minimum_balance): BankAccount.__init__(self) self.minimum_balance = minimum_balance def withdraw(self, amount): if self.balance - amount < self.minimum_balance: print 'Sorry, minimum balance must be maintained.' 
else: BankAccount.withdraw(self, amount) #Where BankAccount is the parent class and withdraw overrides the withdraw method in the parent class / *Another example of inheritance: class Person: def __init__(self, first, last): self.firstname = first self.lastname = last def __str__(self): return self.firstname + " " + self.lastname class Employee(Person): def __init__(self, first, last, staffnum): super().__init__(first, last) self.staffnumber = staffnum x = Person("Marge", "Simpson") y = Employee("Homer", "Simpson", "1007") print(x) print(y) / #class built-in methods __doc__ #class documentation, for examples; print Employee.__doc__ / #inheritance #!/usr/bin/python class Parent: # define parent class parentAttr = 100 def __init__(self): print "Calling parent constructor" def parentMethod(self): print 'Calling parent method' def setAttr(self, attr): Parent.parentAttr = attr def getAttr(self): print "Parent attribute :", Parent.parentAttr class Child(Parent): # define child class def __init__(self): print "Calling child constructor" def childMethod(self): print 'Calling child method' / *Set attribute: setattr(x, attr, 'magic') / *Initializing an object from a dict: for k,v in c.items(): setattr(self,k,v) print "h" / # Private methods and attributes in Python: # The information below is extracted from https://www.bogotobogo.com/python/python_private_attributes_methods.php # # Example of a class: # p.py class P: def __init__(self, name, alias): self.name = name # public self.__alias = alias # private def who(self): print('name : ', self.name) print('alias : ', self.__alias) # When we create an instance of P and we try to access its attributes: >>> from p import P >>> x = P(name='Alex', alias='amen') >>> x.name 'Alex' >>> x.alias Traceback (most recent call last): File "", line 1, in AttributeError: P instance has no attribute 'alias' # We can't access alias using double underscore: >>> x.__alias Traceback (most recent call last): File "", line 1, in AttributeError: P 
instance has no attribute '__alias' # But we can access by using a single underscore: >>> x._P__alias 'amen' # We can also have Private functions. An example of a class: # p2.py class P: def __init__(self, name, alias): self.name = name # public self.__alias = alias # private def who(self): print('name : ', self.name) print('alias : ', self.__alias) def __foo(self): # private method print('This is private method') def foo(self): # public method print('This is public method') self.__foo() # We can instantiate the class, but when we try to access the # private function: >>> from p2 import P >>> x = P('Alex', 'amem') >>> x.__foo() Traceback (most recent call last): File "", line 1, in AttributeError: P instance has no attribute '__foo' # But we can access this function by doing: >>> x._P__foo() This is private method / #debugging in python # epdb1.py -- experiment with the Python debugger, pdb import pdb a = "aaa" pdb.set_trace() b = "bbb" c = "ccc" final = a + b + c print final Then, run the script and it will stop when it reaches the pdb.set_trace() line. After that press 'n+enter' to advance, Also use 'p variable' to print the variable Use 'l' to list where you are Use b 48 (to set a breakpoint at line 48) Use c (go to the next breakpoint) cl or clear #to clear all breakpoints // pretty printing a data structure in python (similar to Data::Dumper) >>> import pprint >>> stuff = ['spam', 'eggs', 'lumberjack', 'knights', 'ni'] >>> stuff.insert(0, stuff[:]) >>> pp = pprint.PrettyPrinter(indent=4) >>> pp.pprint(stuff) [ ['spam', 'eggs', 'lumberjack', 'knights', 'ni'], 'spam', 'eggs', 'lumberjack', 'knights', 'ni'] // #Object instrospection, using a customized way of printing the object: >>> class Test: ... def __repr__(self): ... return "Test()" ... def __str__(self): ... return "member of Test" ... 
>>> t = Test() >>> t Test() >>> print t member of Test / One example of its real use: class Test: def __init__(self, a, b): self.a = a self.b = b def __repr__(self): return "<Test a:%s b:%s>" % (self.a, self.b) def __str__(self): return "From str method of Test: a is %s, b is %s" % (self.a, self.b) // *print all the attributes of an object def __str__(self): sb = [] for key in self.__dict__: sb.append("{key}='{value}'".format(key=key, value=self.__dict__[key])) return ', '.join(sb) def __repr__(self): return self.__str__() // #printing contents of a Python dict within a debugger: from pprint import pprint pprint (vars(your_object)) // #escaping in python: print(r"\t\n") #r stands for raw, this statement would literally print: \t\n #count characters within a string dna="AAAAGGGGG" dna.count("A") dna.count("G") #representation of an object repr(objectname) // Initializing 2 variables at the same time v1,v2=1,2 // # # #debugger# # # python -m pdb script.py #run python debugger p variable #print variable contents n #next statement b 2 #create a breakpoint at line 2 / if we press enter on an empty prompt, the last command is repeated / l #lists the area of my program that is currently being executed / s #step into a subroutine / q #exit from the debugger / c #continue until the next breakpoint is hit // print 'hello' // >>>print 1,2 #print adding a space between 1 2 // print 1,2, #print without newline at end of text // >>>a="ccc" >>>print a.upper() CCC // L1=[2,3,4] #create list (array) print L1[0] #printing element at index=0 // L1[-2] #accessing element but counting from the right // L1[2:] #slicing a list (from index at 2 until the last index) // values = [100, 200, 300, 400, 500] # Slice from the third element (index 2) up to, but not including, the last element.
slice = values[2:-1] print(slice) Output [300, 400] // # different examples of slicing: # Let us first create a list to demonstrate slicing # lst contains all numbers from 1 to 10 lst = range(1, 11) print lst # in Python 3 we get: range(1, 11), which can be unpacked to: lst = [*range(1, 11)] # below list has numbers from 2 to 5 lst1_5 = lst[1 : 5] print lst1_5 # below list has numbers from 6 to 8 lst5_8 = lst[5 : 8] print lst5_8 # below list has numbers from 2 to 10 lst1_ = lst[1 : ] print lst1_ # below list has numbers from 1 to 5 lst_5 = lst[: 5] print lst_5 # below list has numbers from 2 to 8 in step 2 lst1_8_2 = lst[1 : 8 : 2] print lst1_8_2 # below list has numbers from 10 to 1 lst_rev = lst[ : : -1] print lst_rev # below list has numbers from 10 to 6 in step 2 lst_rev_9_5_2 = lst[9 : 4 : -2] print lst_rev_9_5_2 // # get the last N elements of a list in python >>> a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] >>> a [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] >>> a[-9:] [4, 5, 6, 7, 8, 9, 10, 11, 12] // L1=["c","a","c","a"] #create list with strings // enumerate function, how it works: seasons = ['Spring', 'Summer', 'Fall', 'Winter'] for i, item in enumerate(seasons): ... print i ...
print item 0 Spring 1 Summer 2 Fall 3 Winter // #split a string into characters >>>print list("hello") ['h', 'e', 'l', 'l', 'o'] // #indexing strings >>>h="hola" >>>print h[0] h or >>>print h[0:2] ho // a="hola" print len(a) #print out length of a // #concatenate a="cac" b="otas" print a+b // >>>a="1" >>>print a.isdigit() True \ >>> b="ahola" >>> print b.isdigit() False isdigit() returns True if all characters in a string are digits // #print a list as a string >>>L1=["c","a","c","a"] >>>print','.join(L1) c,a,c,a // #join in python 3 >>> ".".join(("a","b","c")) 'a.b.c' // #join in for loop: url = 'http://ergast.com/api/f1/{0}/qualifying?limit=10000' print('\n'.join(url.format(year) for year in range(2000, 2016))) # http://ergast.com/api/f1/2000/qualifying?limit=10000 # http://ergast.com/api/f1/2001/qualifying?limit=10000 # ... # http://ergast.com/api/f1/2015/qualifying?limit=10000 // #print argv passed to the script (sys.argv is a list) import sys print sys.argv print len(sys.argv) #length of the list passedFile=sys.argv[1] // print 3 in [1,2,3] #check membership 'TRUE' // for x in [1,2,3]: print x #iterate list // #to iterate over a certain range(from,to,interval) >>>for i in range(0,3,1): >>> print i // Generate list of range tuples with given boundaries in python ranges = [(n, min(n+step, stop)) for n in xrange(start, stop, step)] Will produce something like: [(100, 110), (110, 120), (120, 130), (130, 140)] // #iterave over a list >>>list=[1,2,3,4,5] >>>for i in range(len(list)): ... 
print list[i] 1 2 3 4 5 // #append an elements into a list list=[1,2,3] list.append(4) // #append several elements onto list list=[1,2,3] list.extend([4,5,6]) // #delete and print last element(item) list=[1,2,3] print list.pop() // #getting the last element of a list: some_list[-1] // #reverse a list # See methods at https://www.geeksforgeeks.org/python-reversing-list/ list.reverse() // #initializing an empty dict newdict=dict() // #create dictionary (hash) d1={'spam':2,'ham':1,'eggs':3} // #create dict from 2 lists: keys = ['a','b','c','d'] values = [1,2,3,4] d = dict(zip(keys, values)) // #fetch value by key d1['eggs'] // #key membership test >>> dict={'a':1,'b':2,'c':3} >>> 'a' in dict True >>> 'd' in dict False >>> if not 'f' in D: print('missing') // #get the value for a specific key if it exists, if not it will assign a default value: >>> dict={'a':1,'b':2,'c':3} >>> value=dict.get('c',0) #value will be 3 >>> value=dict.get('d',0) #value will be 0 because 'd' does not exist // #check if key exists in a 2 nested dictionary d.get('key1',{}).get('key2') // #returns list with keys >>> d1.keys() ['eggs', 'ham', 'spam'] // #iterating through key/values simultaneously >>>for k,v in d1.items(): >>> print k,v // #another example of the use of a dict to organize the cities by state >>> from collections import defaultdict >>> city_list = [('TX','Austin'), ('TX','Houston'), ('NY','Albany'), ('NY', 'Syracuse'), ('NY', 'Buffalo'), ('NY', 'Rochester'), ('TX', 'Dallas'), ('CA','Sacramento'), ('CA', 'Palo Alto'), ('GA', 'Atlanta')] >>> >>> cities_by_state = defaultdict(list) >>> for state, city in city_list: ... cities_by_state[state].append(city) ... for state, cities in cities_by_state.iteritems(): ... print state, ', '.join(cities) ... 
NY Albany, Syracuse, Buffalo, Rochester CA Sacramento, Palo Alto GA Atlanta TX Austin, Houston, Dallas // # better way of performing autovivification class AutoVivification(dict): """Implementation of perl's autovivification feature.""" def __getitem__(self, item): try: return dict.__getitem__(self, item) except KeyError: value = self[item] = type(self)() return value a = AutoVivification() a[1][2][3] = 4 a[1][3][3] = 5 a[1][2]['test'] = 6 print(a) // #creating a nested dictionary defining a priori the number of levels and the final value: from collections import defaultdict def autovivify(levels=1, final=dict): return (defaultdict(final) if levels < 2 else defaultdict(lambda: autovivify(levels - 1, final))) words = autovivify(3, list) words["sam"][2012][5].append('a') words["sam"][2012][5].append('b') #or: words = autovivify(5, int) words["sam"][2012][5][25]["hello"] += 1 words["sue"][2012][5][24]["today"] += 1 // #create nested dictionary(dictionary of dictionaries) >>>d1={} >>>d1["key1"]={ >>> "keyA":0, >>> "keyB":1, >>> "keyC":2, >>> } // #checking for the existence of a key in a dictionary: if key in d: d[key] += 1 else: d[key] = 1 // #print out results >>>for key in d1.keys(): >>> for key1 in d1[key]: >>> print key1 keyC keyB keyA // #accessing a nested dictionary >>>print d1["key1"]["keyA"] 0 // #autovivification in python def rec_dd(): return defaultdict(rec_dd) >>> x = rec_dd() >>> x['a']['b']['c']['d'] defaultdict(<function rec_dd at 0x...>, {}) // dict.items() #This method returns a list of tuple pairs. dict = {'Name': 'Zara', 'Age': 7} print "Value : %s" % dict.items() Value : [('Age', 7), ('Name', 'Zara')] // #deleting a key from dict: >>> dict={'a':1, ... 'b':2, ... 'c':3} >>> del dict['a'] >>> dict {'c': 3, 'b': 2} // #creating a dictionary from list with indices as keys a = [51,27,13,56] b = dict(enumerate(a)) print(b) will produce: {0: 51, 1: 27, 2: 13, 3: 56} // #defaultdict.
A defaultdict works exactly like a normal dict, but it is initialized with a function (“default factory”) that takes no arguments and provides the default value for a nonexistent key. >>> from collections import defaultdict >>> ice_cream = defaultdict(lambda: 'Vanilla') >>> >>> ice_cream = defaultdict(lambda: 'Vanilla') >>> ice_cream['Sarah'] = 'Chunky Monkey' >>> ice_cream['Abdul'] = 'Butter Pecan' >>> print ice_cream['Sarah'] Chunky Monkey >>> print ice_cream['Joe'] Vanilla >>> / #Implementing a counter with defaultdict from collections import defaultdict string="a a b b b a a a b b b" letters=string.split() letter_count=defaultdict(int); # default value of int is 0 for letter in letters: letter_count[letter] +=1 # increment element's value by 1 // # is used for comments // v1="hola" v2="caracola" // #initializing 2 variables at the same time v1,v2=1,2 // # # #debugger# # # pdb script.py #run python debugger p variable #print variable contents n #next statement b 2 #create a breakpoint at line 2 run #restart the debugger // #creating a breakpoint within a function that is in a class: b VcfQC.run_CollectVariantCallingMetrics #it has to be set after being imported // print 'hello' // >>>print 1,2 #print adding a space between 1 2 // print 1,2, #print without newline at end of text // >>>a="ccc" >>>print a.upper() CCC // L1=[2,3,4] #create list (array) print L1[0] #printing element at index=0 // L1[-2] #accessing element but counting from the right // L1[2:] #slicing a list (from index at 2 until the last index) // L1=["c","a","c","a"] #create list with strings // #indexing strings >>>h="hola" >>>print h[0] h or >>>print h[0:2] ho // a="hola" print len(a) #print out length of a // #concatenate a="cac" b="otas" print a+b // #string+int concatenation >>>a=1 >>>b="hola" >>>c=str(a)+b 1hola #a second way of doing this concatenation would be to use backticks >>>a=1 >>>b="hola" >>>c=`a`+b // #print a string and an int >>>a=1 >>>print "caramelos=",a // convert string into 
integer start="1000" end=int(start)+1 // >>>L1=["c","a","c","a"] >>>print','.join(L1) c,a,c,a // #to replace characters in a string newstringObject=stringObject.replace(old,new) // #removing spaces from a string (tabs are kept) >>> s=" I am learning\tpython" >>> s.replace(' ','') 'Iamlearning\tpython' // #replacing every run of whitespace by a single \t >>> s="hola adios" >>> re.sub('\s+','\t',s) 'hola\tadios' // #using regex when replacing >>>s='100 NORTH 100' >>>re.sub('^100','200',s) '200 NORTH 100' // #delete a fragment of a string import re url = 'abcdc.com' url = re.sub('\.com$', '', url) // #Referencing the stdout: import sys sys.stdout // #print argv passed to the script import sys print sys.argv // print 3 in [1,2,3] #check membership, prints True // for x in [1,2,3]: print x #iterate list // #to iterate over a certain range(from,to,interval) >>>for i in range(0,3,1): >>> print i // #iterate over a list >>>list=[1,2,3,4,5] >>>for i in range(len(list)): ... print list[i] 1 2 3 4 5 // # append integer to the beginning of a list >>> a = 5 >>> li = [1, 2, 3] >>> [a] + li # Don't use 'list' as variable name. [5, 1, 2, 3] // #append an element to a list list=[1,2,3] list.append(4) // #append several elements onto list list=[1,2,3] list.extend([4,5,6]) // #delete and print last element(item) >>>list=[1,2,3] >>>print list.pop() 3 // #delete an element in the list >>>list=[1,2,3] >>>del list[0] >>>print list [2, 3] // #reverse a list list.reverse() // #using lists as stacks #a stack is a data structure where the last element added is the first element retrieved("last-in,first-out"): >>>stack=[3,4,5] >>>stack.append(6) >>>stack.append(7) >>> stack [3, 4, 5, 6, 7] >>> stack.pop() 7 >>> stack [3, 4, 5, 6] >>> stack.pop() 6 >>> stack.pop() 5 >>> stack [3, 4] // #Using Lists as Queues You can also use a list conveniently as a queue, where the first element added is the first element retrieved (“first-in, first-out”).
To add an item to the back of the queue, use append(). To retrieve an item from the front of the queue, use pop() with 0 as the index. For example: >>> queue = ["Eric", "John", "Michael"] >>> queue.append("Terry") # Terry arrives >>> queue.append("Graham") # Graham arrives >>> queue.pop(0) 'Eric' >>> queue.pop(0) 'John' >>> queue ['Michael', 'Terry', 'Graham'] // #create dictionary (hash) d1={'spam':2,'ham':1,'eggs':3} // #fetch value by key d1['eggs'] // #key membership test >>> d1.has_key('ham') True // #returns list with keys >>> d1.keys() ['eggs', 'ham', 'spam'] // #iterating through key/values simultaneously >>>for k,v in d1.items(): >>> print k,v // #create nested dictionary(dictionary of dictionary) >>>d1={} >>>d1["key1"]={ >>> "keyA":0, >>> "keyB":1, >>> "keyC":2, >>> } #print out results >>>for key in d1.keys(): >>> for key1 in d1[key]: >>> print key1 keyC keyB keyA // #iterate over this is sorted order by the key >>> steps = {1:"val1", 5:"val2", 2:"val3"} >>> for key in sorted(steps): ... print steps[key] ... 
val1 val3 val2 // #accessing a nested dictionary >>>print d1["key1"]["keyA"] 0 // #sorting a dictionary by its keys def sortedDictValues(adict): keys = adict.keys( ) keys.sort( ) return [adict[key] for key in keys] // #In Python2.7, an OrderedDict can be used to sort a dictionary by its keys: sd = OrderedDict(sorted(d.items())) #where d is the dictionary to sort // #looping through a sorted dict: python_words = {'list': 'A collection of values that are not connected, but have an order.', 'dictionary': 'A collection of key-value pairs.', 'function': 'A named set of instructions that defines a set of actions in Python.', } for word in sorted(python_words.keys()): print(word) // #sorting a dictionary by its values >>>d1={ >>> 'a':2, >>> 'b':4, >>> 'c':3, >>> 'd':1 >>> } >>> >>>sortedKeys=sorted(d1.items(), key=lambda(k,v):(v,k)) >>> >>>for thisKey in sortedKeys: >>> print thisKey[0],d1[thisKey[0]] d 1 a 2 c 3 b 4 // #counting characters in a string >>>string="aaabbb" >>>charCount={} >>>for char in string: >>> charCount[char]=charCount.get(char,0)+1 >>>print charCount {'a': 3, 'b': 3} // myfile=open('myfile','w') #open for output (creates) myfile.write('hello text file\n') #write a line of text myfile.close() // >>> myfile=open('myfile','r') #open for input >>> myfile.readline() #read the line back 'hello text file\n' // #open a file and throws an error if file does not exist file="filename" #check if file exists if os.path.isfile(file) == False: raise Exception("File does not exist") #getting the dir for a certain file: dir=os.path.dirname(os.path.abspath(file)) #check if dir exists import os os.path.isdir('./dir') // #better way of opening a file (it closes the file when there is an error) #using with with open("x.txt") as f: data = f.read() do something with data / #or reading line per line with open("x.txt") as f: for line in f: print line, // #os module // *get the file size of a certain file import os os.path.getsize('C:\\Python27\\Lib\\genericpath.py') // 
#getting the current working directory >>>import os >>> print os.getcwd() /data/scratch/ernesto/454/SCD1/SNPs // #return all files in a directory as a list >>>os.listdir(os.getcwd()) or >>>os.listdir('.') // #getting extension of a file >>>import os >>> os.path.splitext('caca.txt') ('caca', '.txt') // >>> filename='/Users/ernesto/projects/IGSR/files/testABC.pdf' >>> os.path.basename(filename).split('.')[0] 'testABC' // #splitting an absolute path into directory and file name >>>import os >>> os.path.split('/data/genomes/human36/chr1.fa') ('/data/genomes/human36', 'chr1.fa') // #globbing or reading files from a directory import glob glob.glob("/data/scratch/ernesto/454/SCD1/againstGenes/snps/*txt") // #globbing import glob for file in glob.glob("*txt"): print file / # globbing and filtering the resulting files based on a pattern: import glob res = [f for f in glob.glob("*.txt") if "abc" in f or "123" in f or "a1b" in f] for f in res: print(f) // #delete a file os.remove('afile') // #renaming a file os.rename(filename,newfilename) // #if statement >>>x=2; >>>if x==1: >>> print "hello" >>>elif x==2: >>> print "caca" >>>else: >>> print "cacotas" // #negating ifs if not x==1: print "hello" // #while loop >>>a=0; b=10 >>> >>>while a < b: >>> print a, >>> a+=1 0 1 2 3 4 5 6 7 8 9 // *args and **kwargs in function definitions are used for passing lists of arguments and dictionaries of arguments, respectively. 
So if I had a function this: def printlist(*args): for x in args: print x printlist(1, 2, 3, 4, 5) # or as many more arguments as I'd like def printdict(**kwargs): print repr(kwargs) printdict(john=10, jill=12, david=15) // #using a default value for an argument that can be passed through the **kwargs dictionary def printdict(a,b=5,**kwargs): print("b="+str(b)) printdict(a=1,john=10, d=12, david=15) #will print b=5, but if b is passed, then the new value of b will be printed // #passing a dictionary to a function having **kwargs def printdict(a,b=3,**kwargs): print("a="+str(a)) print("b="+str(b)) print repr(kwargs) d={'john':1,'b':2,'david':3} printdict(a=1,**d) // * setting class attributes dynamically with variable number of arguments and kwargs: class Bar(object): def __init__(self, **kwargs): self.__dict__.update(kwargs) bar = Bar(a=1, b=2) print(bar.b) / * And if you want to allow only a certain attributes: class Bar(object): def __init__(self, **kwargs): allowed_keys = ['a', 'b', 'c'] self.__dict__.update((k, v) for k, v in kwargs.items() if k in allowed_keys) // #passing a dictionary to a function def my_function(city, standard, name, school=None): schoolname = school cityname = city standard = standard studentname = name print(cityname) data = {'standard': '7', 'name': 'abc', 'city': 'delhi'} my_function(**data) // #create times function >>>def times(x,y): >>> return x*y >>>product=times(2,4) >>>print product 8 // #in python lists,dictionaries and tuples are basic types. 
So they can be passed as arguments for functions // #checking the type of an object >>>a=1 >>>type(a) // REGEX \d matches any decimal digit, [0-9] \D matches any non-digit character [^0-9] \s matches any whitespace character [ \t\n\r\f\v] \S matches any non-whitespace character [^ \t\n\r\f\v] \w matches any alphanumeric character [a-zA-Z0-9_] \W matches any non-alphanumeric character [^a-zA-Z0-9_] // Performing Matches: match() #determine if the RE matches at the beginning of the string search() #scan through a string, looking for any location where this RE matches findall() #find all substrings where the RE matches, and returns them as a list finditer() #find all substrings where the RE matches, and return them as an iterator // REGEX >>>import re #module for regex >>>p=re.compile('^>') #compile pattern >>>print p.match("") #if match then returns a match object. if not then returns None None // #open a file and splitting by newlines: with open("out.txt") as f: data = f.read().splitlines() print "h\n" // #open a file and print out all lines starting with a pattern: with open("test.fasta") as f: for line in f: if line.startswith(">"): print line, // #grouping in the REGEX >>>dna="chr14:10000-20000" >>>p=re.compile("(chr\d+):(\d+)-(\d+)") >>>m=p.match(dna) >>>print m.group(0) #returns the whole match >>>chr=m.group(1) >>>start=m.group(2) >>>end=m.group(3) >>>print chr >>>print start >>>print end chr14:10000-20000 chr14 10000 20000 // #matching exactly 5 consecutive a's: a{5} // #parsing a file and skipping lines starting with a certain pattern import re p=re.compile('^#') #compile pattern infile=open('filename','r') while 1: line=infile.readline() if not line: break #readline() returns '' at end of file if p.match(line):continue // #Create a List that contains each Line of a File List = open("filename.txt").readlines() // #Structure of an exception: try: You do your operations here; ...................... except ExceptionI: If there is ExceptionI, then execute this block. except ExceptionII: If there is ExceptionII, then execute this block. 
...................... else: If there is no exception then execute this block. #Ex: #!/usr/bin/python try: fh = open("testfile", "w") fh.write("This is my test file for exception handling!!") except IOError: print "Error: can\'t find file or read data" else: print "Written content in the file successfully" fh.close() // #List comprehensions: Everything that can be expressed as a 'for' loop can be expressed with a list comprehension. For example (with pseudocode): new_things = [] for ITEM in old_things: if condition_based_on(ITEM): Can be expressed: new_things = ["something with " + ITEM for ITEM in old_things if condition_based_on(ITEM)] With a real example: numbers = [1, 2, 3, 4, 5] doubled_odds = [] for n in numbers: if n % 2 == 1: doubled_odds.append(n * 2) Can be transformed to: numbers = [1, 2, 3, 4, 5] doubled_odds = [n * 2 for n in numbers if n % 2 == 1] #if the for loop does not have a condition. Then it is even simpler: doubled_numbers = [] for n in numbers: doubled_numbers.append(n * 2) Gets to: doubled_numbers = [n * 2 for n in numbers] #For nested loops (for example, if we want to flatten a matrix): flattened = [] for row in matrix: for n in row: flattened.append(n) Then the comprehension would be: flattened = [n for row in matrix for n in row] // *Using regex on the elements of a list: *Good explanation at: http://www.cademuir.eu/blog/2011/10/20/python-searching-for-a-string-within-a-list-list-comprehension/ >>> import re >>> list=['a cat','a dog','a yacht','cats'] >>> regex=re.compile(".*(cat).*") >>> [m.group(0) for l in list for m in [regex.search(l)] if m] ['a cat', 'cats'] >>> [m.group(1) for l in list for m in [regex.search(l)] if m] ['cat', 'cat'] // # catch all exceptions try: ... except: # catch just one exception try: ... except IOError: ... # catch one exception, but provide the exception object try: ... except IOError, e: ... # catch more than one exception try: ... except (IOError, ValueError), e: ... 
// #raising exceptions, allows the programmer to force a specified exception to occur. >>>try: >>> raise NameError('HiThere') >>>except NameError: >>> print 'An exception flew by!' >>> raise // *TypeError exceptions: : *We are trying to access a string as it were a dict a="abc" try: value = a['a'] except TypeError,e: print "Not valid",e #e will contain more info in the error cause // >>>a="hola," >>>print a.strip(",") hola // #remove newline character in the right side line=line.rstrip("\n") // #remove all spaces in the right side line=line.rstrip() // #remove all spaces in any of the sides line=line.strip() // while : if: break #last equivalent if: continue if: pass // += #concatenate operator // #or logical operator if a=="a" or a=="b": // #and logical operator if a=="a" and a=="b" // #increment operator variable+=1 // #map, executes passed in function over each item in a list >>>def inc(x):return x+10 >>>L=map(inc,[1,2,3]) >>>print L // #list comprenhension (shortcut to create lists from other lists by specifying a formula to be applied to each element) >>>[x*x for x in [1,2,3]] [1, 4, 9] / *concat a string to all elements of a list using comprenhension >>>['concat_'+x for x in ['a','b','c']] ['concat_a', 'concat_b', 'concat_c'] / #another example: pow2 = [2 ** x for x in range(10)] // #getting user input name=(raw_input("Como te llamas?")) // #exceptions // #raising a IOError exception try: f=open('normoxia_70bp.aln.nativ','r') except IOError, e: print e // #casting to a int a="1" b=int(a) // #casting into a float a="1.245" b=float(a) // #string formatting >>> exclamation="Ni" >>> "The knights who say %s!" % exclamation 'The knights who say Ni!' 
// "%d %s %d you" % (1,'spam',4) '1 spam 4 you' // *If one want to repeat the same string several times, instead of doing: s='arbit' string='%s hello world %s hello world %s' %(s,s,s) *It is better to use (available from Python 2.6 and Python 3.x:): incoming = 'arbit' result = '{0} hello world {0} hello world {0}'.format(incoming) // #formatting a float (setting the number of decimal places) >>> a=1.23456789 >>> '%.2f' % a '1.23' // #Differences between %s and %r: %s invokes str(), whereas %r invokes repr(). Ex: x = "example" print "My %s"%x My example print "My %r"%x My 'example' // #Another way, using string format method. Printing out a table: table={'hola': 1, 'adios':2, 'bye':3} for name,phone in table.items(): print '{0:10} ==> {1:40d}'.format(name,phone) bye ==> 3 hola ==> 1 adios ==> 2 10 and 40 controls the amount of spaces in each cell, the d is because we are dealing with integers, if we would have strings, we should use 40s // #Calling a function for each object within a list of objects (in this case we invoke the mass function) sumofmass = sum(i.mass for i in objList) // #exitting the script sys.exit() // #making a python script executable #!/usr/bin/python print "KAKOTAS!!!" #chmod +x prueba.py // #issue warnings to STDER import warnings warnings.warn("Hello") // #process management >>>import subprocess >>> >>>subprocess.Popen(['/bin/echo','hola']) #open a process, args is a list in which list[0] is the command and list[>1] are the command arguments #NOTE. If we open processes in a loop, this subprocess module allows to open several processes concurrently. 
This functionality may crash the server / #gettng the stdout and stderr from subprocess import Popen, PIPE cmd = 'blah' p = Popen(cmd, stdout=PIPE, stderr=PIPE) stdout, stderr = p.communicate() / #run ls and store its output import subprocess ls_output = subprocess.check_output(['ls']) #to pass args, #the first elem is the command name and the rest are the args subprocess.check_output(['ls', '-l']) # #this will not work, the shell considers the entire string as something to be executed subprocess.call(['ls -l']) # #this would work: subprocess.call('ls -l', shell=True) // returncode=subprocess.call("ls -l", shell=True) #if returncode=0 then everything went well, if not then it will 1 // *running a process and accessing the error when something went wrong: import subprocess try: subprocess.check_output("ls -l",shell=True) except subprocess.CalledProcessError as e: print(e.output) // #another way of opening subprocesses is os.system(). It is easier to use than subprocess but user do not have the same level of control over the subprocesses. #If you have several arguments to be passed to the process, a List is not needed.The use of system is equivalent to Perl system() #System only open one process at a time >>>import os >>>os.system("blat database file -minIdentity=100 -out=pslx outputname.pslx") // #declaring and initializing several variables at a time (a,b)=('a','b') or (a,b)=('','') // #sort a list >>>array=[2,1,4,3,5,6] >>>array.sort() >>>print array [1, 2, 3, 4, 5, 6] // #getting the index of a cetain element in an array ["foo", "bar", "baz"].index("bar") 1 // >>>4%2 #modulus // #calculating median array=[2,1,4,3,5,6] array.sort() count=len(array) median=0.0 if count%2: #odd median=array[int(count/2)]; else: #even lower=float(array[(count/2)-1]) upper=float(array[count/2]) median=(float(array[count/2])+float(array[count/2-1]))/2 print median // #multiline printing >>>print """ >>>First line. >>>Second line. >>>Third line. >>>""" First line. Second line. 
Third line. // #installing python eggs: sudo easy_install JsonUtils or pip install python_module_name #this will install it in /homes/ernesto/.local/lib/python2.7/site-packages/ / #knowing where pip installs the modules: >>> import site; site.getsitepackages() / #Changing the default install location for pip: pip install --install-option="--prefix=/homes/ernesto/" packagename / #knowing list of installed packages and versions: pip list / #If you have a program working in a given computer, then get a freeze of all pip modules installed: pip freeze > requirements.txt #then, install all modules required on a new computer using: pip install -r requirements.txt / #upgrade a given package: pip install modulename --upgrade / #uninstall a package: pip uninstall packagename / #installing easy_install locally: wget https://bootstrap.pypa.io/ez_setup.py -O - | python - --user #will install it at: /homes/ernesto/.local/bin/ / #using easy_install for installing in a given dir: ~/.local/bin/easy_install-2.7 --install-dir /homes/ernesto/.local/lib/python2.7/site-packages pip / *If pip is not installed, you will need to install it by doing: sudo easy_install pip // #installing package from source #use the option --record files.txt to know what files were created python setup.py install --record files.txt --home=/nfs/research2/flicek/user/ernesto/ // #knowing the location of an installed package: pip show packagename // #classes in Python #!/usr/bin/python class Employee: 'Common base class for all employees' empCount = 0 def __init__(self, name, salary): self.name = name self.salary = salary Employee.empCount += 1 def displayCount(self): print "Total Employee %d" % Employee.empCount def displayEmployee(self): print "Name : ", self.name, ", Salary: ", self.salary "This would create first object of Employee class" emp1 = Employee("Zara", 2000) "This would create second object of Employee class" emp2 = Employee("Manni", 5000) emp1.displayEmployee() emp2.displayEmployee() print 
"Total Employee %d" % Employee.empCount // #reading in a Json file import json from pprint import pprint file=open('test.json','r') parsed_json = json.load(file) pprint(parsed_json) // *Creating your own module: Read tutorial at: http://www.tutorialspoint.com/python/python_modules.htm // *Creating your own package: Read at tutorial at: http://www.tutorialspoint.com/python/python_modules.htm // *The dir() function: The dir() built-in function returns a sorted list of strings containing the names defined by a module. For example: # Import built-in module math import math content = dir(math) print content // *Parsing XML One useful module is xmltodict. Read on it at: http://docs.python-guide.org/en/latest/scenarios/xml/ / #parsing and writing out a xml file: with open('analysis.xml') as fd: doc = xmltodict.parse(fd.read()) doc['ANALYSIS_SET']['ANALYSIS']['ANALYSIS_TYPE']['REFERENCE_ALIGNMENT']['ASSEMBLY']['STANDARD']['@refname']="GRCh37" print (xmltodict.unparse(doc,encoding='utf-8',pretty=True)) #Problem is that is not respect the element order when unparsing // *Counter in Python: >>> from collections import Counter >>> Counter(['apple','red','apple','red','red','pear']) Counter({'red': 3, 'apple': 2, 'pear': 1}) // #creating datetime objects: >>> import datetime >>> >>> x = datetime.datetime(2020, 5, 17) >>> print(x) 2020-05-17 00:00:00 / #or : >>> x = datetime.datetime(2020, 5, 17,22,30) >>> print(x) 2020-05-17 22:30:00 // *Working with dates: from datetime import datetime now = datetime.now() print "Now: ", now print "Today's date: ", now.strftime('%Y-%m-%d') print "year:", now.year print "month:", now.month print "day:", now.day print "hour:", now.hour print "minute:", now.minute print "second:", now.second / *Converting strings to dates: from datetime import datetime datetime_object = datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p') candle = datetime.strptime("2015-10-24 21:10:05", "%Y-%m-%d %H:%M:%S") / *datetime in isoformat print 
datetime.isoformat() 2015-10-19T21:00:00 / *Convert string into datetime from datetime import datetime date_object = datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p') / #dates arithmetics: import datetime d=datetime.datetime.strptime("2016-10-26", "%Y-%m-%d").date() one_day = datetime.timedelta(days=1) yesterday = d - one_day print yesterday / * Getting days, hours from timedelta >>> import datetime >>> x = datetime.datetime(2020, 5, 17,22,30) >>> y = datetime.datetime(2020, 5, 16,22,30) >>> d= x-y >>> d.days Returns 1 (days in this case) If the timedelta is not a whole number of days, then we will get seconds that we will need to convert to hours >>> x = datetime.datetime(2020, 5, 17,22,30) >>> y = datetime.datetime(2020, 5, 16,23,00) >>> d= x-y >>> d.days 0 >>> d.seconds 84600 >>> d.seconds/3600 In hours / *another way of dealing with dates: / *Adding single days: import pandas as pd ic="2016-12-11 22:00:00" D=pd.DateOffset(1) # pd.datetime is an alias for datetime.datetime candle = pd.datetime.strptime(ic, "%Y-%m-%d %H:%M:%S") start=candle-2*D end=candle+2*D // *Knowing the weekday: datetime.datetime.today().weekday() // *subtracting 2 times (differences between 2 dates): # Create datetime objects for each time (a and b) dateTimeA = datetime.datetime.combine(datetime.date.today(), a) dateTimeB = datetime.datetime.combine(datetime.date.today(), b) # Get the difference between datetimes (as timedelta) dateTimeDifference = dateTimeA - dateTimeB # Divide difference in seconds by number of seconds in hour (3600) dateTimeDifferenceInHours = dateTimeDifference.total_seconds() / 3600 // import pandas as pd # BDay is business day, not birthday... 
from pandas.tseries.offsets import BDay # pd.datetime is an alias for datetime.datetime today = pd.datetime.today() print today - BDay(4) / * creating times from datetime import time # time(hour = 0, minute = 0, second = 0) a = time() print("a =", a) # time(hour, minute and second) b = time(11, 34, 56) print("b =", b) # time(hour, minute and second) c = time(hour = 11, minute = 34, second = 56) print("c =", c) # time(hour, minute, second, microsecond) d = time(11, 34, 56, 234566) print("d =", d) / *Creating dates/times ranges: *Hourly import pandas as pd ic="2016-12-11 22:00:00" rng = pd.date_range('1/1/2011', periods=72, freq='H') print rng[:5] / *Iterating over time_ranges: for d in pd.date_range(start='2016-12-09',end='2016-12-15'): print d // *Initializing a time object with 0:00:00 import datetime t = datetime.time(0, 0, 0) * (hours,minutes,seconds) // *getting the current time and date: now = datetime.datetime.now() / #comparing 2 times: import datetime d.time()2 #will no raise anything, But if we do: i=1 assert i>2 We get: Traceback (most recent call last): File "test.py", line 3, in assert i>2 AssertionError // #assert on a single line for testing a condition >>> a=2 >>> assert a==1, "a variable is not 1" // # assert a datetime: assert slist.start() == datetime.datetime(2019, 3, 10, 21, 0) // #knowing the environment within a function an outside the function: a_global="hole" def foo(): print locals() print foo() print globals() #And the output you get: {} None {'a_global': 'hole', '__builtins__': , '__file__': 'test.py', '__package__': None, '__name__': '__main__', 'foo': , '__doc__': None} Whichs that the local environment within foo() is empty and the global environment contains the variable declared that is named 'a_global' // #modifying the contents of PATH to a certain value dict(os.environ,PATH="/homes/ernesto/lib") // #passing the desired PYTHONPATH at runtime: PYTHONPATH=/path/to/ python script.py // *duck typing: he idea is that it doesn't actually 
matter what type my data is - just whether or not I can do what I want with it. // *In Python, everything is an object. So strings can be used as arrays, because strings are objects that implement the __getitem__ method str="abc" print str[0] *will print a print str[1] * will print b // #Parameter checking in Python: 1 from types import * 2 class MyDB: 3 ... 4 def add(self, id, name): 5 assert type(id) is IntType, "id is not an integer: %r" % id 6 assert type(name) is StringType, "name is not a string: %r" % name #Check if something is a list or a string: #In Python3: lst=[1,2,3] assert not isinstance(lst, str) #In Python2 assert not isinstance(lst, basestring) // # Check if something is either a float or int a='s' if not isinstance(a, (int, float)): print("a is neither int nor float") // #checking if something is a list: if not isinstance(objs,list): print "h" // #checking if something is a number import numbers isinstance('a', numbers.Number) >>> False // *merging (concatenate) 2 lists: >>> a=[1,2,3,4] >>> b=[2,3,4,5] >>> a+b [1, 2, 3, 4, 2, 3, 4, 5] // *zip It is a built-in function to merge together 2 lists in the following way: >>> names = ['Bob','Jessica','Mary','John','Mel'] >>> births = [968, 155, 77, 578, 973] >>> zip(names,births) [('Bob', 968), ('Jessica', 155), ('Mary', 77), ('John', 578), ('Mel', 973)] // *Pandas: It is a module used for data analysis. / *First, creating a data.frame from a certain list: from pandas import DataFrame, read_csv / *Pandas' data structures: **Series: One-dimensional labeled array capable of holding any data type(integers,strings, objects, etc.) 
* iterating over Series: for i, value in df['column'].mean().iteritems(): print(i, value) *Initializing: s = pd.Series(data, index=index) data can be: a Python dict an ndarray a scalar value (like 5) *Initializing from an ndarray: s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) *Initializing from a dict: d = {'a' : 0., 'b' : 1., 'c' : 2.} pd.Series(d) // #initialize dict with keys,values from two lists keys = ['a','b','c','d'] values = [1,2,3,4] d = dict(zip(keys, values)) // *Accessing elements: d['a'] *Operations on series: In [43]: s[s>1] c 2.0 dtype: float64 In [43]: s.mean() Out[43]: 1.0 *They behave like dicts: s.keys() *Getting unique values from panda series: s.unique() // *removing characters from strings in a list lst = [("aaaa8"),("bb8"),("ccc8"),("dddddd8")] print([s.strip('8') for s in lst]) # remove the 8 from the string borders print([s.replace('8', '') for s in lst]) # remove all the 8s // * Useful snippets on dataframes: https://jeffdelaney.me/blog/useful-snippets-in-pandas/ // # creating a random dataframe df = pandas.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) // * exploring the dataframe df.shape # number of rows/columns in a tuple // #creating a dataframe from numpy arrays: data = np.array([['','Col1','Col2'], ['Row1',1,2], ['Row2',3,4]]) print(pd.DataFrame(data=data[1:,1:], index=data[1:,0], columns=data[0,1:])) And this will produce: output: Col1 Col2 Row1 1 2 Row2 3 4 // * using apply to apply a function over the column of a dataframe: df['A']=df['A'].apply(lambda x:x+1) // * Filter a dataframe based on the outcome of a function applyied on a certain column In [3]: df = pandas.DataFrame(np.random.randn(5, 3), columns=['a', 'b', 'c']) In [4]: df Out[4]: a b c 0 -0.001968 -1.877945 -1.515674 1 -0.540628 0.793913 -0.983315 2 -1.313574 1.946410 0.826350 3 0.015763 -0.267860 -2.228350 4 0.563111 1.195459 0.343168 In [6]: df[df.apply(lambda x: x['b'] > x['c'], axis=1)] Out[6]: a b c 1 -0.540628 0.793913 -0.983315 2 
-1.313574 1.946410 0.826350 3 0.015763 -0.267860 -2.228350 4 0.563111 1.195459 0.343168 // * Difference between map, apply and applymap Map: It iterates over each element of a series. df['column1'].map(lambda x: 10+x), this will add 10 to each element of column1. df['column2'].map(lambda x: 'AV'+x), this will concatenate "AV" at the beginning of each element of column2 (column format is string). Apply: As the name suggests, applies a function along any axis of the DataFrame. df[['column1','column2']].apply(sum), it will return the sum of all the values of column1 and column2. ApplyMap: This helps to apply a function to each element of dataframe. func = lambda x: x+2 df.applymap(func), it will add 2 to each element of dataframe (all columns of dataframe must be numeric type) // # The initial set of baby names and birth rates names = ['Bob','Jessica','Mary','John','Mel'] births = [968, 155, 77, 578, 973] BabyDataSet = list(zip(names,births)) df = DataFrame(data = BabyDataSet, columns=['Names', 'Births']) #selecting the first column of a dataframe: df[[0]] *adding a new column to a dataframe df['newcol']=toadd *peeking the dataframe: df.head() *Getting a basic summary on a quantitative variable: df['births'].describe() *Getting the sum of a column: Total = df['MyColumn'].sum() print (Total) 319 *Getting the mean of a column: df['births'].mean() *Median df['births'].median() *10th percentile df['births'].quantile(0.1) *90th percentile df['births'].quantile(0.9) # Now, export the data.frame to a csv file df.to_csv('births1880.csv',index=False,header=False) # read-in the data frame from csv file: df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv') # read-in the data frame from csv file specifying that the first row is the header df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv',header=0) # read-in the data.frame from csv file adding column names, and therefore an index 
df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv',names=['col1','col2','col3']) # read-in the data frame from tsv file: df=pd.DataFrame.from_csv('/Users/ernesto/projects/IGSR/16_12_16/cov_DF.txt', sep='\t') #read-in the data frame from file and skipping comments: df = pd.read_csv("DF.txt",comment='#') #read-in the data frame and specifying which columns are dates df = pd.read_csv('pizza.csv', parse_dates=['dates']) #read-in the data frame and use only some columns df = pd.read_csv('pizza.csv', usecols=['foo', 'bar']) # if first column is not picked, try with index_col=False DF=pd.DataFrame.from_csv('/Users/ernesto/projects/IGSR/16_12_16/cov_DF.txt', sep='\t',index_col=False) #read in the data frame without header df=read_csv('/Users/ernesto/Google_Drive/PYTHON_LEARN/births1880.csv',header=None) #knowing the data types in a dataframe df.dtypes #checking if a data.frame is empty: if df.empty: do something #checking if a list is empty: if not a: print("List is empty") #renaming column names in pandas data.frame: df.columns = ['a', 'b'] #pretty print a data.frame from tabulate import tabulate import pandas as pd df = pd.DataFrame({'col_two' : [0.0001, 1e-005 , 1e-006, 1e-007], 'column_3' : ['ABCD', 'ABCD', 'long string', 'ABCD']}) print tabulate(df, headers='keys', tablefmt='psql') +----+-----------+-------------+ | | col_two | column_3 | |----+-----------+-------------| | 0 | 0.0001 | ABCD | | 1 | 1e-05 | ABCD | | 2 | 1e-06 | long string | | 3 | 1e-07 | ABCD | +----+-----------+-------------+ #copy a data.frame surveys_copy = surveys_df.copy() #slice a data.frame via indexes: surveys_df.iloc[0:3, 1:4] #select 3 first rows and columns 1 to 3 (the end index is exclusive) #slice columns 1 to 3 and all the rows: surveys_df.iloc[:, 1:4] #accessing a cell surveys_df.iloc[1, 2] #slice and select different non-consecutive columns surveys_df.iloc[:, [0,1,4]] #accessing the actual value for a certain cell df['colname'].item() #iterating over a data.frame for index, row in 
data1.iterrows(): print(row['col1']) #changing the data types within a dataframe df=df.astype(int) >>> df 0 1 0 Bob 968 1 Jessica 155 2 Mary 77 3 John 578 4 Mel 973 #to select rows whose column value equals a scalar, some_value, use ==: df.loc[df['column_name'] == some_value] #to select applying more than one condition: df1 = df.loc[(df.a != -1) & (df.b != -1)] #selecting all rows that have null values for a certain column: df.loc[df['colname'].isnull()] #Getting the columns names from a data.frame: DF.columns And then we access the first column: df[0] # printing the 3 first rows of the data frame df[:3] # Selecting by more than 1 column: df[['col1','col2']] *counting the values of one column if the variable is categorical: df['col1'].value_counts() / #filling the NA values in a dataframe DF.fillna(value=0) / #plotting a data.frame, one column versus the other: df.plot(x='col1',y='col2') / #getting the rownames of a dataframe in list format: list(df.index) / #drawing a barplot df.plot(kind='bar') / # Applying a function over a dataframe, see tutorial: https://chrisalbon.com/python/pandas_apply_operations_to_dataframes.html / *Counting the occurrences of one variable by the occurrence of other, (similar to R's table function): print pd.crosstab(df['admit'], df['prestige'], rownames=['admit']) prestige 1 2 3 4 admit 0 28 97 93 55 1 33 54 28 12 / *With only one column: pd.crosstab(index=df['instrument'], columns="count") col_0 count instrument AUD_USD 33 EUR_USD 35 USD_CAD 14 / *How to make a pandas crosstab with percentages? 
pd.crosstab(df.A, df.B).apply(lambda r: r/r.sum(), axis=1) / #crosstab plus plot carat_table = pd.crosstab(index=diamonds["clarity"], columns="count") carat_table.plot(kind="bar", figsize=(8,8)) / *Return evenly spaced numbers over a specified interval from numpy import linspace x = linspace(-5,5,100) / *Pandas conditional creation of a dataframe based on the value of one column import pandas as pd import numpy as np df=pd.DataFrame({'Type':list('ABBC'), 'Set':list('ZZXY')}) df['color']=np.where(df['Set']=='Z', 'green', 'red') print(df) / import numpy as np normally_distributed = np.random.normal(size=10000) # Generate normal data* / *Pandas Dataframe columns are a Pandas Series when you pull them out, which you can then call .tolist() on to turn them into a python list dfList = df['one'].tolist() / #preventing poping windows when plotting: import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt / #creating a boxplot from quantitative variable using matplotlib: import matplotlib.pyplot as plt import numpy as np # basic plot plt.boxplot(data, labels=['set1','set2']) #labels=adding labels to each set / #saving boxplot to file import matplotlib.pyplot as plt import numpy as np # basic plot plt.boxplot(data) plt.savefig('/Users/ernesto/projects/IGSR/18_01_17/asdf.pdf',format='pdf') / #setting the axis labels and title plt.xlabel("x axis", size=14) #and size also plt.ylabel("y axis") plt.title("caca", size=20) / #setting the size of the tick labels: plt.tick_params(labelsize=20) / #creating a boxplot from a data.frame df.boxplot() / # calculating the IQR (interquartile range) and whiskers and median in a boxplot import numpy as np import matplotlib.pyplot as plt %matplotlib inline data = np.random.rand(100) plt.boxplot(data) median = np.median(data) upper_quartile = np.percentile(data, 75) lower_quartile = np.percentile(data, 25) iqr = upper_quartile - lower_quartile upper_whisker = data[data<=upper_quartile+1.5*iqr].max() lower_whisker = 
data[data>=lower_quartile-1.5*iqr].min() / * plotting a categorical variable (x variable) against a dependent variable (y variable): See: https://seaborn.pydata.org/tutorial/categorical.html / #rotating the X-labels: df.plot(rot=90) / #changing the aspect of a boxplot props = dict(boxes="DarkGreen", whiskers="DarkOrange", medians="DarkBlue", caps="Gray") ax=df.plot.box(grid=True,return_type='axes',color=props, patch_artist=True) / #setting the color df.plot(kind='bar',color="red") / #applying an y limit df.plot(ylim=[0,50]) // #setting the figure size df.plot(figsize=[10,10]) // #creating a plot from dataframe and saving ax = df.plot() fig = ax.get_figure() fig.savefig('asdf.png') fig.savefig('/Users/ernesto/projects/IGSR/files/asdf.pdf',format='pdf') #saving in pdf format / Setting the xticks labels: ax=subDF.plot() ax.set_xticklabels(['a','b','c']) / #reducing the number of x axis ticks and labels to a certain frequency (setting only every n tick) n = 10 ax = df.plot() ticks = ax.xaxis.get_ticklocs() ticklabels = [l.get_text() for l in ax.xaxis.get_ticklabels()] ax.xaxis.set_ticks(ticks[::n]) ax.xaxis.set_ticklabels(ticklabels[::n]) ax.figure.show() // *How to display all label values ax=DF.plot() ax.set_xticks(np.arange(len(DF.index))) ax.set_xticklabels(DF.index) // *Creating a composed plot import matplotlib.pyplot as plt plt.figure(1) # the first figure plt.subplot(311) # the first subplot in the first figure plt.plot([1, 2, 3]) plt.subplot(312) # the second subplot in the first figure in a new row plt.plot([4, 5, 6]) plt.subplot(313) # the third subplot in the first figure in a new row plt.plot([7, 8, 9]) *Where subplot(3,1,3) is nrow,ncol,fignum. 
The maximum value in fignum will depend on nrow*ncol // #useful reading on groupby and applying operations on gropus: https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_groups/ // *Group data in the DataFrame by a certain column: bytreatment = data.groupby('Treatment') *Then, print descriptive stats for each of the values in Treatment: >>>bytreatment['RelativeFitness'].describe() Treatment Dish count 32.000000 mean 1.456359 std 0.184792 min 0.955221 25% 1.429005 50% 1.510884 75% 1.581340 max 1.699276 Tube count 32.000000 mean 0.929589 std 0.050153 min 0.795107 25% 0.915050 50% 0.939089 75% 0.953505 max 1.000363 dtype: float64 / *Mean for each group: >>>bytreatment['RelativeFitness'].mean() Treatment Dish 1.456359 Tube 0.929589 Name: RelativeFitness, dtype: float64 / *Aggregating and applying different numpy functions: bytreatment['RelativeFitness'].agg([np.mean,np.std,len,np.sum]) / *Print a groupby dataframe: import pandas as pd df = pd.DataFrame({'A': ['one', 'one', 'two', 'three', 'three', 'one'], 'B': range(6)}) grouped_df = df.groupby('A') for key, item in grouped_df: print(grouped_df.get_group(key)) // * Plotting with matplotlib import numpy as np import matplotlib.pyplot as plt *create an array x=np.linspace(0,5,10) y=x*x fig,ax=plt.subplots() ax.plot(x,y) plt.show() // #NumPY: #Create a NumPy array: import numpy as np import pandas as pd data=np.array([['','Col1','Col2'],['Row1',1,2],['Row2',3,4]]) >>>data[1:,] #print from row 1 till the end: array([['Row1', '1', '2'], ['Row2', '3', '4']], dtype='|S4') >>>data[1,1] '1' // #readin a np array from file: good_set=np.fromfile("file.txt",dtype=float,sep="\n") // #calculating the mean on a numpy array: np.mean(array) // #checking if elements within array are greater than some value np.where(data>10) // # Transforming a dataset (logarithmic) np.log(data) // # Transforming a dataset (square root) np.sqrt(data) // #calculating max and min in an array: np.amax(data) np.amin(data) // #converting a 
numpy array to list: data.tolist() // #How to append elements to a numpy array A = np.array([]) for row in matrix: A = np.append(A, row) // #creating now a DataFrame from the Numpy array, df=pd.DataFrame(data=data[1:,1:],index=data[1:,0],columns=data[0,1:]) // #Using Jupyter. If you want to use inline matplotlib plots in this platform, use: %matplotlib inline // #selecting several cells shift+j // #create virtual environment for a project: cd my_project_folder $ virtualenv venv #where venv is the virtual env name #then, start using it: source venv/bin/activate #when you are done: deactivate #once the env is active, to list the modules that are local (in the env) only: pip list --local #Anaconda: #To install a package conda install packagename #To install a package from a certain channel conda install -c bioconda pybedtools #We can upgrade packages: conda update numpy #Verify which environment we are in right now: conda info --envs #create a new environment with some packages installed conda create --name bamqc numpy pandas #activate the created environment source activate bamqc #deactivate the environment source deactivate #installing a new environment with a new python version: conda create --name blahblah python=2.7 #installing a new version with all python packages included in anaconda conda create -n python2.7_env python=2.7 anaconda #list all envs conda env list #remove a given env conda remove --name bamqc --all // #correcting indentation errors, use python ~/bin/Python-2.7.12/Tools/scripts/reindent.py // # round all elements in a list alist = [0.30000000000000004, 0.5, 0.20000000000000001] my_rounded_list = [ round(elem, 2) for elem in alist ] Will return: [0.3, 0.5, 0.2] // #rounding up to the nearest 10: import math def roundup(x): return int(math.ceil(x / 10.0)) * 10 # iterating through object attributes: for attr, value in anobject.__dict__.iteritems(): print attr, value #ternary operator value_when_true if condition else value_when_false #For example: 'Yes' if
fruit == 'Apple' else 'No' *Enumerations in Python: >>> from enum import Enum >>> class Color(Enum): ... red = 1 ... green = 2 ... blue = 3 ... *Enumeration members have human readable string representations: >>> print(Color.red) Color.red *Enum members also have a property that contains just their item name: >>> print(Color.red.name) red *Enumerations support iteration, in definition order: >>> class Shake(Enum): ... vanilla = 7 ... chocolate = 4 ... cookies = 9 ... mint = 3 ... >>> for shake in Shake: ... print(shake) ... Shake.vanilla Shake.chocolate Shake.cookies Shake.mint Enumeration members are hashable, so they can be used in dictionaries and sets: >>> apples = {} >>> apples[Color.red] = 'red delicious' >>> apples[Color.green] = 'granny smith' >>> apples == {Color.red: 'red delicious', Color.green: 'granny smith'} True // #grep on the elements of a list: >>> names = ['aet2000','ppt2000', 'aet2001', 'ppt2001'] >>> filter(lambda x:'aet' in x, names) ['aet2000', 'aet2001'] # in python 2, #In python 3: list(filter(lambda x:'aet' in x, names)) // #function to check if a string represents a number: def is_number(s): try: float(s) return True except ValueError: return False // #Creating an iterator object class Fib: ① def __init__(self, max): ② self.max = max def __iter__(self): ③ self.a = 0 self.b = 1 return self def __next__(self): ④ fib = self.a if fib > self.max: raise StopIteration ⑤ self.a, self.b = self.b, self.a + self.b return fib class Fib: ① def __init__(self, max): ② self.max = max def __iter__(self): ③ self.a = 0 self.b = 1 return self def __next__(self): ④ fib = self.a if fib > self.max: raise StopIteration ⑤ self.a, self.b = self.b, self.a + self.b return fib >>> from fibonacci2 import Fib >>> for n in Fib(1000): ... print(n, end=' ') 0 1 1 2 3 5 8 13 21 34 55 89 144 233 377 610 987 // *Containers in python: Containers are any object that holds an arbitrary number of other objects. 
Generally, containers provide a way to access the contained objects and to iterate over them. Examples of containers include tuple, list, set, dict; these are the built-in containers. More container types are available in the collections module. // *Getting the absolute value of a number abs(a) // #glob in python >>> import glob >>> glob.glob('*.pl') or : >>> glob.glob("/path/to/file/TSI*") // #glob and sort for file in sorted(glob.glob("*.fastq*")): print(file) // #iterating over 2 lists at the same time: a=[1,2,3] b=[2,4,6] for i, j in zip(a, b): print(i,j) // #opening a gzipped file: import gzip with gzip.open('ALL.chip.omni_broad_sanger_combined.20140818.snps.genotypes.hg38.autosomes.maf0.01_call_rate_0.95.recoded.vcf.gz','r') as fin: for line in fin: print('got line', line) #in this case, the line returned is in bytes format #opening now and returning a text: with gzip.open(sys.argv[1], 'rt') as f: for line in f: if line.startswith("#CHROM"): print(line) #note that in 'rt' (text) mode every line is a str, so no 'b' prefix is needed; in binary mode ('r' or 'rb') every line is bytes and the test would be line.startswith(b"#CHROM") / #converting a bytes object into str: >>> b"abcde" b'abcde' # utf-8 is used here because it is a very common encoding, but you # need to use the encoding your data is actually in.
>>> b"abcde".decode("utf-8") 'abcde' / #encoding to bytes a certain str b = mystring.encode('utf-8') / #create a compressed gzip file import gzip content = "Lots of content here" with gzip.open('file.txt.gz', 'wb') as f: f.write(content.encode('utf-8')) #'wb' expects bytes in python 3, so the str is encoded first // #logging: A good tutorial at http://www.blog.pythonlibrary.org/2012/08/02/python-101-an-intro-to-logging/ // #logging with format: logging.basicConfig(filename="sample.log", filemode="w", level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s') // *Enumeration in Python from enum import Enum class Color(Enum): red = 1 green = 2 blue = 3 >>>print(Color.red.name) red >>>print(Color.red.value) 1 # supports iteration for color in Color: print color.value // *group a list into inclusive sequential n-tuples >>> lst = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] >>> zip(lst, lst[1:]) [('A', 'B'), ('B', 'C'), ('C', 'D'), ('D', 'E'), ('E', 'F'), ('F', 'G')] // #iterating and printing all instance attributes of an object class A(object): def __init__(self): self.myinstatt1 = 'one' self.myinstatt2 = 'two' def mymethod(self): pass a = A() for attr, value in a.__dict__.iteritems(): # this is in python2, in python 3 use dict.items() print attr, value // *checking if an element is in a list: >>> a_list=['a','b','c','d'] >>> 'c' in a_list True // *How to get list index and element simultaneously: for k,i in enumerate(mylist): #do something with index k #do something with element i #creating venn diagrams: #2way from matplotlib import pyplot as plt from matplotlib_venn import venn2, venn2_circles set1 = set(['A', 'B', 'C', 'D']) set2 = set(['B', 'C', 'D', 'E']) venn2([set1, set2], ('Set1', 'Set2')) #3way from matplotlib import pyplot as plt from matplotlib_venn import venn3, venn3_circles set1 = set(['A', 'B', 'C', 'D']) set2 = set(['B', 'C', 'D', 'E']) set3 = set(['C', 'D', 'E', 'F', 'G']) venn3([set1, set2, set3], ('Set1', 'Set2', 'Set3')) // #Venn with counts: from collections import Counter import matplotlib.pyplot as plt from matplotlib_venn import
venn2, venn3 %matplotlib inline sets = Counter() sets['01'] = 10 sets['11'] = 3 sets['10'] = 5 setLabels = ['set1', 'set2'] plt.figure() ax = plt.gca() v = venn2(subsets = sets, set_labels = setLabels, ax = ax) plt.title('Venn Diagram') plt.show() // #creating hist: n, bins, patches = plt.hist(data, 50, normed=0, facecolor='green', alpha=0.75, range=[0, 990]) // #adding y-label to a pyplot plt.ylabel('some numbers') // #adding title to pyplot plt.title('Histogram of IQ') // #adding a grid plt.grid(True) // #importing an excel spreadsheet in Python: import xlrd #---------------------------------------------------------------------- def open_file(path): """ Open and read an Excel file """ book = xlrd.open_workbook(path) # print number of sheets print book.nsheets # print sheet names print book.sheet_names() # get the first worksheet first_sheet = book.sheet_by_index(0) # read a row print first_sheet.row_values(0) # read a cell cell = first_sheet.cell(0,0) print cell print cell.value # read a row slice print first_sheet.row_slice(rowx=0, start_colx=0, end_colx=2) #---------------------------------------------------------------------- if __name__ == "__main__": path = "test.xls" open_file(path) #using panda to load an excel file into a dataframe # import modules import pandas as pd # Import the excel file and call it xls_file xls_file = pd.ExcelFile('../data/example.xls') xls_file # View the excel file's sheet names xls_file.sheet_names ['Sheet1'] # Load the xls file's Sheet1 as a dataframe df = xls_file.parse('Sheet1') df / #Load the xls file'e Sheet as a dataframe, skipping the first row: df = xls_file.parse('Final QC Results',skiprows=1) #Load the xls file'e Sheet as a dataframe, declaring a column as the index df = xls.parse('Sheet1', index_col='Sample') #Now declaring more than 1 column as the index: df = xls.parse('Sheet1', index_col=[0,1]) * Now enforcing the type of some of the columns: df = xls.parse('Sheet1', index_col=[0,1],converters={'A': str}) // >>> 
writer = pd.ExcelWriter('output.xlsx') >>> df1.to_excel(writer,'Sheet1') >>> df2.to_excel(writer,'Sheet2') >>> writer.save() // # In python 2.7, the integer division truncates the result. For example: >>> 3 / 2 1 # Use this instead: >>> from __future__ import division >>> 53740/3 17913.333333333332 // #checking if a str is a float or int: import sys def numeric_type(x): type="" try: a = float(x) if a.is_integer()==True: type="int" else: type="float" except ValueError: type="str" return type s=sys.argv[1] print numeric_type(s) import sys def numeric_type(x): type="" try: a = float(x) if a.is_integer()==True: type="int" else: type="float" return type except ValueError: return False s=sys.argv[1] print numeric_type(s) // #python one-liner: python -c "for r in range(10): print r" // #what is an __init__.py file: Read documentation at: https://pythontips.com/2013/07/28/what-is-__init__-py/ // *DUMMIFY (or Convert A Categorical Variable Into Dummy Variables): # import modules import pandas as pd # Create a dataframe raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy'], 'last_name': ['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'], 'sex': ['male', 'female', 'male', 'female', 'female']} df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'sex']) df first_name last_name sex 0 Jason Miller male 1 Molly Jacobson female 2 Tina Ali male 3 Jake Milner female 4 Amy Cooze female # Create a set of dummy variables from the sex variable df_sex = pd.get_dummies(df['sex']) # Join the dummy variables to the main dataframe df_new = pd.concat([df, df_sex], axis=1) df_new first_name last_name sex female male 0 Jason Miller male 0 1 1 Molly Jacobson female 1 0 2 Tina Ali male 0 1 3 Jake Milner female 1 0 4 Amy Cooze female 1 0 // #serializing a dataframe (store) df.to_pickle(file_name) # where to save it, usually as a .pkl #Then you can load it back using: df = pd.read_pickle(file_name) // #ipython. 
Embedding images: from IPython.display import Image Image("/Users/ernesto/Desktop/20170301124551634.png") // *Compute the skewness of a dataset scipy.stats.skew(a, axis=0, bias=True) // #generate a random string in python >>> import string >>> import random >>> def random_generator(size=6, chars=string.ascii_uppercase + string.digits): ... return ''.join(random.choice(chars) for x in range(size)) ... >>> random_generator() 'G5G74W' >>> random_generator(3, "6793YUIO") 'Y3U' // #Converting a string representation of a list into an actual list object >>> fruits = "['apple', 'orange', 'banana']" >>> import ast >>> fruits = ast.literal_eval(fruits) >>> fruits ['apple', 'orange', 'banana'] >>> fruits[1] 'orange' // #How to use the __init__.py Files named __init__.py are used to mark directories on disk as Python package directories. If you have the files mydir/spam/__init__.py mydir/spam/module.py and mydir is on your path, you can import the code in module.py as import spam.module or from spam import module #good explanation on what is a @classmethod https://stackoverflow.com/questions/12179271/meaning-of-classmethod-and-staticmethod-for-beginner # #compare 2 lists and return matches >>> a = [1, 2, 3, 4, 5] >>> b = [9, 8, 7, 6, 5] >>> set(a) & set(b) // #check if 2 lists contain the same elements (order and duplicates do not matter): >>> a = [1, 2, 3, 4, 5] >>> b = [9, 8, 7, 6, 5] >>> set(a)==set(b) // #creating a temporary file and write something to it and read it from it #the file is destroyed when temp.close() is called import os import tempfile temp = tempfile.TemporaryFile() try: temp.write('Some data') temp.seek(0) print temp.read() finally: temp.close() #creating a temporary file with a name (path) associated to it: import os import tempfile temp = tempfile.NamedTemporaryFile() try: print 'temp:', temp print 'temp.name:', temp.name finally: # Automatically cleans up the file temp.close() print 'Exists after close:', os.path.exists(temp.name) // #parse file with entries (each entry
on a newline) and create a list: crimefile = open('chros.txt', 'r') lines = crimefile.read().splitlines() // temp = tempfile.NamedTemporaryFile(dir='testdir/',delete=False,prefix='caca') // * Running pylint: Run pylint and generate a report: pylint --reports=y VcfFilter.py // #pytest ###################################### / #creating a simple test with a fixture @pytest.fixture def some_data(): return 42 def test_some_data(some_data): assert some_data == 42 #some_data() can be used by different test functions / #creating tmp files import os def test_create_file(tmpdir): p = tmpdir.mkdir("sub").join("hello.txt") p.write("content") assert p.read() == "content" assert len(tmpdir.listdir()) == 1 assert 0 / #in this fixture we execute code within the fixture before the test, then we pass 42 to the test with yield and finally we execute the code after the yield that is executed at the end: import pytest import warnings @pytest.fixture def some_data(): warnings.warn("before the test") yield 42 warnings.warn("executed after the test") def test_some_data(some_data): warnings.warn("executed in the test") assert some_data == 42 / pytest -s #will invoke pytest and write the output of the print commands to the terminal // #pytest check that raises and Exception: def test_passes(): with pytest.raises(Exception) as e_info: x = 1 / 0 ######################################### #dropping trailing 0s a="1.00000" (str(a)[-2:] == '.0' and str(a)[:-2] or str(a)) 1 // # asserting for equality of a returned list in pytest: returned_list=['a','b','c'] assert all([a == b for a, b in zip(returned_list, ['a','b','c'])]) // # how to check if a file is a directory or regular file in python? os.path.isfile("bob.txt") # Does bob.txt exist? Is it a file, or a directory? 
os.path.isdir("bob") // * Python: access class property from string x = getattr(self, source) // # parse a string representing a list into a real list >>> import ast >>> mylist = ast.literal_eval("['foo', ['cat', ['ant', 'bee'], 'dog'], 'bar', 'baz']") >>> mylist ['foo', ['cat', ['ant', 'bee'], 'dog'], 'bar', 'baz'] // #Create a List that contain each Line of a File List = open("filename.txt").readlines() // # creating a dir if it does not exist: if not os.path.exists(directory): os.makedirs(directory) // # convert a list into a string: list1 = ['1', '2', '3'] str1 = ''.join(list1) // #good tutorial on Python decorators: https://realpython.com/primer-on-python-decorators/ // # requests module # Get a website from the internet: # #Create a program called anexample.py: def simple_get(url): """ Attempts to get the content at `url` by making an HTTP GET request. If the content-type of response is some kind of HTML/XML, return the text content, otherwise return None """ try: with closing(get(url, stream=True)) as resp: if is_good_response(resp): return resp.content else: return None except RequestException as e: log_error('Error during requests to {0} : {1}'.format(url, str(e))) return None def is_good_response(resp): """ Returns true if the response seems to be HTML, false otherwise """ content_type = resp.headers['Content-Type'].lower() return (resp.status_code == 200 and content_type is not None and content_type.find('html') > -1) def log_error(e): """ It is always a good idea to log errors. This function just prints them, but you can make it do anything. """ print(e) #Then: >>> from anexample import simple_get >>> raw_html = simple_get('https://realpython.com/blog/') >>> len(raw_html) 33878 // #adding timestamp to filename: import time timestr = time.strftime("%Y%m%d-%H%M%S") print timestr #Packaging: #To create a release, your source code needs to be packaged into a single archive file. 
This can be done with the sdist command: python setup.py sdist This command will create the *.tar.gz file inside ./dist/ #Then we can install it by doing: pip install ./dist/package.tar.gz // #getters and setters (the pythonic way): # Excellent post at: https://www.programiz.com/python-programming/property # Note: __init__ assigns through the property (self.temperature) so that the setter validation also runs at construction time class Celsius: def __init__(self, temperature = 0): self.temperature = temperature def to_fahrenheit(self): return (self.temperature * 1.8) + 32 @property def temperature(self): print("Getting value") return self._temperature @temperature.setter def temperature(self, value): if value < -273: raise ValueError("Temperature below -273 is not possible") print("Setting value") self._temperature = value // #Sphinx. Generating documentation: / #Generate automatically doc for your source dir sphinx-apidoc -o /homes/ernesto/lib/igsr_analysis/docs/ /homes/ernesto/lib/igsr_analysis/ # In this case, -o sets where to put the generated doc, and the second parameter sets the path to the source code // #sorting a list of tuples by the second element then by the third unsorted = [('a', 4, 2), ('a', 4, 3), ('a', 7, 2), ('a', 7, 3), ('b', 4, 2), ('b', 4, 3), ('b', 7, 2), ('b', 7, 3)] print(sorted(unsorted, key=lambda element: (element[1], element[2]))) // # Getting information on the different numeric Python data types import numpy as np int_types = ["uint8", "int8", "int16"] for it in int_types: print(np.iinfo(it)) float_types = ["float16", "float32", "float64"] for ft in float_types: print(np.finfo(ft)) // # Memory profiling #Install required module: pip install -U memory_profiler #Create a test function and decorate it with @profile: @profile def my_func(): a = [1] * (10 ** 6) b = [2] * (2 * 10 ** 7) del b return a if __name__ == '__main__': my_func() #Finally run the test script in the following way: python -m memory_profiler test.py // # initializing several lists at the same time: alist, blist, clist, dlist, elist = ([] for i in range(5)) // # Generating tables in python from
prettytable import PrettyTable x = PrettyTable() # we set the header x.field_names = ["City name", "Area", "Population", "Annual Rainfall"] # we add values x.add_row(["Adelaide", 1295, 1158259, 600.5]) x.add_row(["Brisbane", 5905, 1857594, 1146.4]) print(x) +-----------+------+------------+-----------------+ | City name | Area | Population | Annual Rainfall | +-----------+------+------------+-----------------+ | Adelaide | 1295 | 1158259 | 600.5 | | Brisbane | 5905 | 1857594 | 1146.4 | | Darwin | 112 | 120900 | 1714.7 | | Hobart | 1357 | 205556 | 619.5 | | Sydney | 2058 | 4336374 | 1214.8 | | Melbourne | 1566 | 3806092 | 646.9 | | Perth | 5386 | 1554769 | 869.4 | +-----------+------+------------+-----------------+ // #checking if an int is within 2 numbers: if 10000 <= number <= 30000: // * check if something is list x = {'a', 'b', 'c', 'd'} if type(x) is list: print("h") // # How to check if an object has an attribute in Python: hasattr(self, 'start') True // #append elements to beginning of list : >>> a = ['a','b'] >>> k = ['nice', '-n', '10'] >>> a[0:0] = k >>> a ['nice', '-n', '10', 'a', 'b'] // #remove several elements from a list using its index item_list = ['item', 5, 'foo', 3.14, True] item_list = [e for e in item_list if e not in ('item', 5)] // # generating ranges in python #Create a sequence of numbers from 3 to 5, and print each item in the sequence: x = range(3, 6) for n in x: print(n) #Create a sequence of numbers from 3 to 19, but increment by 2 instead of 1: x = range(3, 20, 2) for n in x: print(n) * Article on how to manage configuration files in python: https://hackernoon.com/4-ways-to-manage-the-configuration-in-python-4623049e841b // # remove several elements from a list indices = [0, 2] somelist = [i for j, i in enumerate(somelist) if j not in indices] // # Iterating over every two elements in a list l = [1,2,3,4,5,6] def pairwise(iterable): "s -> (s0, s1), (s2, s3), (s4, s5), ..." 
a = iter(iterable) return zip(a, a) for x, y in pairwise(l): print("{0} + {1} = {2}".format(x, y, x + y)) // # Making a Python script executable: 1) Add this line as the first line in the script: #!/usr/bin/env python3. 2) At the unix command prompt, type the following to make myscript.py executable: $ chmod +x myscript.py. // * Previous and next values inside a loop Example extracted from: https://stackoverflow.com/questions/1011938/python-previous-and-next-values-inside-a-loop from itertools import tee, islice, chain, izip def previous_and_next(some_iterable): prevs, items, nexts = tee(some_iterable, 3) prevs = chain([None], prevs) nexts = chain(islice(nexts, 1, None), [None]) return izip(prevs, items, nexts) mylist = ['banana', 'orange', 'apple', 'kiwi', 'tomato'] for previous, item, nxt in previous_and_next(mylist): print "Item is now", item, "next is", nxt, "previous is", previous The results: Item is now banana next is orange previous is None Item is now orange next is apple previous is banana Item is now apple next is kiwi previous is orange Item is now kiwi next is tomato previous is apple Item is now tomato next is None previous is kiwi // # Excellent articule on decorators: https://realpython.com/primer-on-python-decorators/#simple-decorators // #knowing PYTHONPATH from python script import os try: user_paths = os.environ['PYTHONPATH'].split(os.pathsep) print(user_paths) except KeyError: user_paths = [] // # Excellent artickle on how to submit a Python project to PyPi https://dzone.com/articles/executable-package-pip-install // # How to use glob() to find files recursively? 
from pathlib import Path for filename in Path('src').rglob('*.c'): print(filename) // # Convert Python2 code to Python3 # Where test.py contains the code to modify, with -w the changes # are written back to the same file 2to3 -w test.py // # Excellent article on using the main entry point in Python https://realpython.com/python-main-function/ // # Python shebang or hash bang: #!/usr/bin/python: writing the absolute path #!/usr/bin/env python: using the operating system env command, which locates and executes Python by searching the PATH environment variable // # main entry point with args parsing: import argparse def main(): parser = argparse.ArgumentParser() parser.add_argument('file_in', help='input file') parser.add_argument('file_out', help='output file') args = parser.parse_args() execute_code(args.file_in, args.file_out) if __name__ == '__main__': main() // # Splitting a string by one or more whitespaces import re str = '63 41 92 81 69 70' #split string on runs of one or more spaces chunks = re.split(' +', str) print(chunks) // # Testing if a binary exists def is_tool(name): """Check whether `name` is on PATH and marked as executable.""" # from whichcraft import which from shutil import which return which(name) is not None