"""Locate one judge's block of entries in the SLC court calendar PDF.

The PDF is converted to text with textract, split into lines, and each line
is split into whitespace-separated tokens.  The script then searches the
resulting rows for the page header that starts the judge's entries and for
the header that ends them, printing both row indices.
"""

# The calendar is published at the URL below.  Downloading is not done here;
# the file is expected to already exist at PDF_PATH (e.g. fetched with wget).
#   https://www.utcourts.gov/cal/data/SLC_Calendar.pdf
PDF_PATH = "SLC_Calendar.pdf"
JUDGE = "BERNARDS-GOODMAN"


def _token(rows, row, col):
    """Return rows[row][col], or None when that row or column does not exist.

    Blank lines split into empty lists and look-ahead offsets can run past
    the end of the document, so every lookup has to be bounds-checked.
    """
    if 0 <= row < len(rows) and 0 <= col < len(rows[row]):
        return rows[row][col]
    return None


def _date_token(rows, row, col):
    """Return the token at (row, col) without trailing commas ('' if absent)."""
    return (_token(rows, row, col) or "").rstrip(",")


def load_rows(pdf_path=PDF_PATH):
    """Extract the text of *pdf_path* and return it as a list of token lists.

    Each element of the result corresponds to one line of extracted text,
    split on whitespace (so a blank line becomes an empty list).
    """
    # Third-party dependency; imported here so the search helpers below can
    # be imported and used without textract being installed.
    import textract

    print("Processing pdf into text...")
    raw = textract.process(pdf_path)
    if isinstance(raw, bytes):
        # textract returns an encoded byte string; decode it so tokens compare
        # equal to ordinary string literals on Python 3 as well.
        raw = raw.decode("utf-8")

    print("Formatting...")
    lines = raw.splitlines()

    print("Structuring data...")
    return [line.split() for line in lines]


def find_start(rows, judge=JUDGE, month="September", day="29", year="2017"):
    """Return the index of the "Page" row that starts the judge's entries.

    A row matches when it begins with "Page", the row two below begins with
    "3RD", the row three below has *judge* as its second token, and the row
    seven below carries the date (*month* as first token, *day* as second,
    *year* as fourth).  Returns None when no row matches.

    The date parts are compared as strings because split() yields strings;
    comparing them with integers can never succeed.
    """
    for pos in range(len(rows)):
        if (
            _token(rows, pos, 0) == "Page"
            and _token(rows, pos + 2, 0) == "3RD"
            and _token(rows, pos + 3, 1) == judge
            and _token(rows, pos + 7, 0) == month
            and _date_token(rows, pos + 7, 1) == day
            # NOTE(review): the year is read from token index 3, as in the
            # original script -- confirm against the real date line layout.
            and _date_token(rows, pos + 7, 3) == year
        ):
            return pos
    return None


def find_end(rows, start, judge=JUDGE):
    """Return the index of the first row at or after *start* that ends the block.

    A row ends the block when its second token is "Page" and the second
    token of the following row is not *judge* (a missing token counts as
    "not the judge").  Returns None when no such row exists.

    NOTE(review): these offsets ("Page" as token 1, judge on the next row)
    are kept from the original script but differ from the header layout
    used by find_start ("Page" as token 0, judge three rows down) --
    confirm which one matches the PDF.
    """
    for pos in range(start, len(rows)):
        if _token(rows, pos, 1) == "Page" and _token(rows, pos + 1, 1) != judge:
            return pos
    return None


def main():
    """Run the extraction and print the start and end row of the entries."""
    rows = load_rows(PDF_PATH)

    print("Searching for start of entries...")
    start = find_start(rows)
    if start is None:
        print("Start of entries not found")
        return
    print("Found start position %d" % start)

    end = find_end(rows, start)
    if end is None:
        print("End of entries not found")
        return
    print("Found end position %d" % end)


if __name__ == "__main__":
    main()