"""Locate a block of entries inside the text extracted from SLC_Calendar.pdf.

The PDF is converted to text with textract, split into lines and each line
into whitespace-separated tokens.  The script then reports:

* the start line: a line containing the token "Page" that is followed, at
  fixed line offsets, by the "3RD" marker, the name "BERNARDS-GOODMAN" and
  the date tokens "September 29 2017";
* the end line: the first later line whose second token is "Page" and whose
  following line does not carry that name as its second token.

The PDF is expected to exist locally.  An earlier, disabled revision fetched
it with wget from https://www.utcourts.gov/cal/data/SLC_Calendar.pdf.
"""
import sys

PDF_PATH = "SLC_Calendar.pdf"
PAGE_TOKEN = "Page"
HEADER_TAG = "3RD"
# NOTE(review): presumably the judge whose entries are wanted -- confirm.
TARGET_NAME = "BERNARDS-GOODMAN"
DATE_TOKENS = ("September", "29", "2017")


def _token(rows, row, col):
    """Return rows[row][col], or None when either index is out of range."""
    if 0 <= row < len(rows) and 0 <= col < len(rows[row]):
        return rows[row][col]
    return None


def _date_matches(rows, row):
    """Tell whether the first three tokens of line ``row`` equal DATE_TOKENS.

    Tokens are strings, so they are compared with strings (the previous
    comparison against the integers 29 and 2017 could never succeed).  A
    trailing comma is ignored; presumably the PDF text reads
    "September 29, 2017", which tokenises the day as "29,".
    """
    for col, expected in enumerate(DATE_TOKENS):
        found = _token(rows, row, col)
        if found is None or found.rstrip(",") != expected:
            return False
    return True


def tokenize_lines(text):
    """Split ``text`` into lines and each line into whitespace-separated tokens."""
    return [line.split() for line in text.splitlines()]


def find_start(rows):
    """Return the index of the line that opens the wanted block, or None.

    A line qualifies when it contains PAGE_TOKEN anywhere (including as its
    last token) and the lines 2, 3 and 7 positions below it carry HEADER_TAG
    (second token), TARGET_NAME (second token) and DATE_TOKENS (first three
    tokens) respectively.  Missing lines or tokens simply fail the match
    instead of raising IndexError.
    """
    for row, tokens in enumerate(rows):
        if PAGE_TOKEN not in tokens:
            continue
        if (_token(rows, row + 2, 1) == HEADER_TAG
                and _token(rows, row + 3, 1) == TARGET_NAME
                and _date_matches(rows, row + 7)):
            # Returning here stops the whole scan at the first match.
            return row
    return None


def find_end(rows, start):
    """Return the index of the line that closes the block, or None.

    The search begins on the line after ``start`` so that the start line can
    never be reported as its own end.  A line qualifies when its second token
    is PAGE_TOKEN and the next line's second token is not TARGET_NAME; a
    missing next line (end of document) also counts as "not TARGET_NAME".
    """
    for pos in range(start + 1, len(rows)):
        if (_token(rows, pos, 1) == PAGE_TOKEN
                and _token(rows, pos + 1, 1) != TARGET_NAME):
            return pos
    return None


def main():
    """Extract the PDF text and print the start/end line of the block.

    Returns 0 when both positions were found and 1 otherwise, so the value
    can be used as the process exit status.
    """
    # Imported here so the parsing helpers above remain usable (and testable)
    # on machines where textract is not installed.
    import textract

    print("Processing pdf into text...")
    pdf_text_raw = textract.process(PDF_PATH)
    if isinstance(pdf_text_raw, bytes):
        # textract hands back an encoded byte string; decode it so tokens
        # compare equal to the text literals used above.
        pdf_text_raw = pdf_text_raw.decode("utf-8")

    print("Formatting...")
    print("Structuring data...")
    pdf_text_array = tokenize_lines(pdf_text_raw)

    print("Searching for start of entries...")
    start = find_start(pdf_text_array)
    if start is None:
        print("Start of entries not found")
        return 1
    print("Found start position %s" % start)

    end = find_end(pdf_text_array, start)
    if end is None:
        print("End of entries not found")
        return 1
    print("Found end position %s" % end)
    return 0


if __name__ == "__main__":
    sys.exit(main())