"""Copy every page of a PDF whose text mentions a keyword into a new PDF.

Text is extracted with the external ``pdftotext`` command-line tool, which
must be available on ``PATH``.
"""

import os
import subprocess
import tempfile

import PyPDF2


def _page_text(page, workdir):
    """Return the text of one PyPDF2 page, as extracted by ``pdftotext``.

    The page is written to a single-page PDF inside *workdir* and converted
    to text there, so no scratch files are left in the current directory.

    Args:
        page: A page object obtained from a ``PyPDF2.PdfFileReader``.
        workdir: Path of an existing directory used for the scratch files.

    Returns:
        The extracted text of the page.

    Raises:
        subprocess.CalledProcessError: If ``pdftotext`` exits with a
            non-zero status.
        OSError: If ``pdftotext`` cannot be started or a scratch file
            cannot be written or read.
    """
    single_pdf = os.path.join(workdir, 'buffer.pdf')
    single_txt = os.path.join(workdir, 'buffer.txt')

    # pdftotext works on files, so the page has to become its own PDF.
    single_writer = PyPDF2.PdfFileWriter()
    single_writer.addPage(page)
    with open(single_pdf, 'wb') as handle:
        single_writer.write(handle)

    # check_call (rather than call) so a failed conversion is reported
    # instead of silently reusing the text left over from the previous page.
    subprocess.check_call(['pdftotext', single_pdf, single_txt])

    # pdftotext writes UTF-8 by default; do not depend on the locale.
    with open(single_txt, 'r', encoding='utf-8') as handle:
        return handle.read()


def filter_pages(source='presentations.pdf', target='questions.pdf',
                 keyword='question'):
    """Write the pages of *source* that mention *keyword* to *target*.

    The match is a case-insensitive substring search over each page's text.
    *target* is always written, even when no page matches.

    Args:
        source: Path of the PDF to read.
        target: Path of the PDF to create (overwritten if it exists).
        keyword: Text to look for on each page.

    Raises:
        subprocess.CalledProcessError: If ``pdftotext`` fails on a page.
        OSError: If a file cannot be opened or ``pdftotext`` is missing.
    """
    needle = keyword.lower()
    selected = PyPDF2.PdfFileWriter()

    # The input stays open until the output is written: the selected pages
    # still refer to the reader that was built on this file handle.
    with open(source, 'rb') as pdf_in:
        reader = PyPDF2.PdfFileReader(pdf_in)

        with tempfile.TemporaryDirectory() as workdir:
            for index in range(reader.numPages):
                page = reader.getPage(index)
                # NOTE: page.extractText() did not work for these PDFs,
                # hence the round trip through pdftotext.
                if needle in _page_text(page, workdir).lower():
                    selected.addPage(page)

        with open(target, 'wb') as pdf_out:
            selected.write(pdf_out)


if __name__ == '__main__':
    filter_pages()