Skip to content

Instantly share code, notes, and snippets.

@ritwikraha
Created December 5, 2023 06:48
Show Gist options
  • Save ritwikraha/cedaa0304099b68947ea14fdda538dff to your computer and use it in GitHub Desktop.
Save ritwikraha/cedaa0304099b68947ea14fdda538dff to your computer and use it in GitHub Desktop.

Revisions

  1. ritwikraha revised this gist Dec 5, 2023. 1 changed file with 12 additions and 1 deletion.
    13 changes: 12 additions & 1 deletion pdf-extractor.ipynb
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,8 @@
    "metadata": {
    "colab": {
    "provenance": [],
    "authorship_tag": "ABX9TyOM1MK/wsnFQZ9IDhe5wqdw"
    "authorship_tag": "ABX9TyOM1MK/wsnFQZ9IDhe5wqdw",
    "include_colab_link": true
    },
    "kernelspec": {
    "name": "python3",
    @@ -15,6 +16,16 @@
    }
    },
    "cells": [
    {
    "cell_type": "markdown",
    "metadata": {
    "id": "view-in-github",
    "colab_type": "text"
    },
    "source": [
    "<a href=\"https://colab.research.google.com/gist/ritwikraha/cedaa0304099b68947ea14fdda538dff/pdf-extractor.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 1,
  2. ritwikraha created this gist Dec 5, 2023.
    88 changes: 88 additions & 0 deletions pdf-extractor.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,88 @@
    {
    "nbformat": 4,
    "nbformat_minor": 0,
    "metadata": {
    "colab": {
    "provenance": [],
    "authorship_tag": "ABX9TyOM1MK/wsnFQZ9IDhe5wqdw"
    },
    "kernelspec": {
    "name": "python3",
    "display_name": "Python 3"
    },
    "language_info": {
    "name": "python"
    }
    },
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {
    "colab": {
    "base_uri": "https://localhost:8080/"
    },
    "id": "3W6PqL7fnKoE",
    "outputId": "d1e0d8b6-d930-440d-f6b5-e8b444d83d63"
    },
    "outputs": [
    {
    "output_type": "stream",
    "name": "stdout",
    "text": [
    "Collecting PyPDF2\n",
    " Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)\n",
    "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m232.6/232.6 kB\u001b[0m \u001b[31m2.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
    "\u001b[?25hInstalling collected packages: PyPDF2\n",
    "Successfully installed PyPDF2-3.0.1\n"
    ]
    }
    ],
    "source": [
    "# Pinned to the PyPDF2 release this notebook was last run with (3.0.1),\n",
    "# so a fresh runtime reproduces the same extraction behaviour.\n",
    "# %pip installs into the running kernel's environment, unlike !pip.\n",
    "%pip install PyPDF2==3.0.1"
    ]
    },
    {
    "cell_type": "code",
    "source": [
    "# 0-based, half-open page range to extract: indices 12..93,\n",
    "# which the extraction cell writes to the CSV as pages 13..94.\n",
    "start_page, end_page = 12, 94"
    ],
    "metadata": {
    "id": "GowGZUpWrUDo"
    },
    "execution_count": 4,
    "outputs": []
    },
    {
    "cell_type": "code",
    "source": [
    "import csv\n",
    "from PyPDF2 import PdfReader\n",
    "\n",
    "# Dump the text of pages [start_page, end_page) of puzzles.pdf into\n",
    "# questions.csv, one row per page. start_page / end_page are the 0-based\n",
    "# indices set in the previous cell.\n",
    "reader = PdfReader('puzzles.pdf')\n",
    "\n",
    "# Clamp the upper bound so a PDF with fewer than end_page pages stops at its\n",
    "# last page instead of raising IndexError part-way through the export.\n",
    "last_page = min(end_page, len(reader.pages))\n",
    "\n",
    "# newline='' leaves line-ending handling to the csv module, so page text that\n",
    "# contains line breaks is written as one quoted field.\n",
    "with open('questions.csv', 'w', newline='', encoding='utf-8') as file:\n",
    "    writer = csv.writer(file)\n",
    "\n",
    "    # Header row\n",
    "    writer.writerow(['page_number', 'questions'])\n",
    "\n",
    "    for page_number in range(start_page, last_page):\n",
    "        text = reader.pages[page_number].extract_text()\n",
    "\n",
    "        # The CSV reports 1-based page numbers; page_number is a 0-based index.\n",
    "        writer.writerow([page_number + 1, text])\n"
    ],
    "metadata": {
    "id": "6G7OW0_qnaTT"
    },
    "execution_count": 5,
    "outputs": []
    }
    ]
    }