masitings · January 26, 2024 09:47 · Jan 26, 2024 · Jan 26, 2024
diff --git a/yt_ocr.ipynb b/yt_ocr.ipynb
@@ -4,7 +4,8 @@
   "metadata": {
     "colab": {
       "provenance": [],
-      "authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4"
+      "authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4",
+      "include_colab_link": true
     },
     "kernelspec": {
       "name": "python3",
@@ -15,6 +16,16 @@
     }
   },
   "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "<a href=\"https://colab.research.google.com/gist/masitings/9dab604147dfe269d6f895f41f872991/yt_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,

diff --git a/yt_ocr.ipynb b/yt_ocr.ipynb
@@ -0,0 +1,415 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "YMViZtqR340x",
+        "outputId": "402ab93d-5d98-492a-9e21-004c4effa942"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Mounted at /content/drive\n"
+          ]
+        }
+      ],
+      "source": [
+        "from google.colab import drive\n",
+        "import os\n",
+        "\n",
+        "# Mount Google Drive at /content/drive; a later cell reads the sample image from\n",
+        "# /content/drive/MyDrive/ocr/dataset and uses os.path.join to build its path.\n",
+        "drive.mount('/content/drive')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Install the tesseract-ocr-ind apt package; the OCR cells below call Tesseract with the `ind` language.\n",
+        "!sudo apt install tesseract-ocr-ind\n",
+        "# pytesseract is the Python wrapper imported by the cells below.\n",
+        "!pip install pytesseract\n",
+        "# NOTE(review): no cell visible in this notebook imports protobuf -- confirm this install is still needed.\n",
+        "!pip install protobuf"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "JLCHyiEf4YyX",
+        "outputId": "f96fe312-5c40-48da-ee6d-4f53b8a39aec"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Reading package lists... Done\n",
+            "Building dependency tree... Done\n",
+            "Reading state information... Done\n",
+            "The following additional packages will be installed:\n",
+            "  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n",
+            "The following NEW packages will be installed:\n",
+            "  tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind tesseract-ocr-osd\n",
+            "0 upgraded, 4 newly installed, 0 to remove and 30 not upgraded.\n",
+            "Need to get 5,353 kB of archives.\n",
+            "After this operation, 16.8 MB of additional disk space will be used.\n",
+            "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n",
+            "Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n",
+            "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n",
+            "Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ind all 1:4.00~git30-7274cfa-1.1 [537 kB]\n",
+            "Fetched 5,353 kB in 1s (8,532 kB/s)\n",
+            "debconf: unable to initialize frontend: Dialog\n",
+            "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 4.)\n",
+            "debconf: falling back to frontend: Readline\n",
+            "debconf: unable to initialize frontend: Readline\n",
+            "debconf: (This frontend requires a controlling tty.)\n",
+            "debconf: falling back to frontend: Teletype\n",
+            "dpkg-preconfigure: unable to re-open stdin: \n",
+            "Selecting previously unselected package tesseract-ocr-eng.\n",
+            "(Reading database ... 121671 files and directories currently installed.)\n",
+            "Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
+            "Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
+            "Selecting previously unselected package tesseract-ocr-osd.\n",
+            "Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
+            "Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
+            "Selecting previously unselected package tesseract-ocr.\n",
+            "Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n",
+            "Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n",
+            "Selecting previously unselected package tesseract-ocr-ind.\n",
+            "Preparing to unpack .../tesseract-ocr-ind_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
+            "Unpacking tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
+            "Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
+            "Setting up tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
+            "Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
+            "Setting up tesseract-ocr (4.1.1-2.1build1) ...\n",
+            "Processing triggers for man-db (2.10.2-1) ...\n",
+            "Collecting pytesseract\n",
+            "  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
+            "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.2)\n",
+            "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (9.4.0)\n",
+            "Installing collected packages: pytesseract\n",
+            "Successfully installed pytesseract-0.3.10\n",
+            "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (3.20.3)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import cv2\n",
+        "import numpy as np\n",
+        "import pytesseract\n",
+        "import pandas as pd\n",
+        "from PIL import Image\n",
+        "import matplotlib.pyplot as plt"
+      ],
+      "metadata": {
+        "id": "EyApGvkG4wsy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Location of the sample ID-card image on the mounted Google Drive.\n",
+        "FILE_PATH = '/content/drive/MyDrive/ocr/dataset'\n",
+        "filePath = os.path.join(FILE_PATH, 'ktp.png')\n",
+        "\n",
+        "# cv2.imread returns None instead of raising when the file cannot be read,\n",
+        "# so fail here with a clear message rather than inside cvtColor.\n",
+        "img = cv2.imread(filePath)\n",
+        "if img is None:\n",
+        "    raise FileNotFoundError(f\"Could not read image: {filePath}\")\n",
+        "\n",
+        "gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
+        "# THRESH_TRUNC caps pixel values above the threshold (127) and keeps darker pixels as they are.\n",
+        "th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)\n",
+        "\n",
+        "result = pytesseract.image_to_string(threshed, lang=\"ind\")\n",
+        "\n",
+        "# Print the OCR text line by line, patching the misreads handled below.\n",
+        "for word in result.split(\"\\n\"):\n",
+        "    # Normalise the ”— character pair to ':' (presumably an OCR misread of the separator).\n",
+        "    word = word.replace(\"”—\", \":\")\n",
+        "    if \"NIK\" in word:\n",
+        "        # On the NIK line, turn the look-alike characters '?' and 'D' into the digits 7 and 0.\n",
+        "        word = word.replace(\"?\", \"7\").replace(\"D\", \"0\")\n",
+        "\n",
+        "    print(word)\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "WTWGqHTT5NAh",
+        "outputId": "07bff621-dd0a-4d71-80b4-1f8ac1bfa47a"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "em\n",
+            "\n",
+            "PROVINSI DAERAH ISTIMEWA YOGYAKARTA\n",
+            "KABUPATEN SLEMAN\n",
+            "\n",
+            " \n",
+            "\n",
+            "NIK : 34711140209790001\n",
+            "\n",
+            "Nama :RIYANTO. SE\n",
+            "\n",
+            "Tempat/Tgl Lahir : GROBOGAN. 02-09-1979\n",
+            "\n",
+            "Jenis Kelamin : LAKI-LAKI Gol Darah : 0\n",
+            "\n",
+            "Alamat PRM PURI DOMAS D-3. SEMPU\n",
+            "RTRW 1001 1024\n",
+            "\n",
+            "Kel/Desa : WEDOMARTANI!\n",
+            "Kecamatan : NGEMPLAK\n",
+            "\n",
+            "Agama \"ISLAM\n",
+            "Status Bean KAWIN SLEMAN\n",
+            "Pekerjaan : PEDAGANG 05-06-2012\n",
+            "\n",
+            "Kewarganegaraan: WNI HI —\n",
+            "Berlaku Hingga :02-09-2017 NIA\n",
+            "\n",
+            "   \n",
+            " \n",
+            "\n",
+            " \n",
+            "\f\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "class KTPInformation(object):\n",
+        "    \"\"\"Plain container for the fields parsed from a KTP (Indonesian ID card).\n",
+        "\n",
+        "    Every field starts as an empty string except ``berlaku_hingga``, which\n",
+        "    defaults to \"SEUMUR HIDUP\". Only names assigned on ``self`` end up in\n",
+        "    ``__dict__``, which is what gets serialised to JSON later in the notebook.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self):\n",
+        "        self.nik = \"\"\n",
+        "        self.nama = \"\"\n",
+        "        self.tempat_lahir = \"\"\n",
+        "        self.tanggal_lahir = \"\"\n",
+        "        self.jenis_kelamin = \"\"\n",
+        "        self.golongan_darah = \"\"\n",
+        "        self.alamat = \"\"\n",
+        "        self.rt = \"\"\n",
+        "        self.rw = \"\"\n",
+        "        self.kelurahan_atau_desa = \"\"\n",
+        "        self.kecamatan = \"\"\n",
+        "        self.agama = \"\"\n",
+        "        self.status_perkawinan = \"\"\n",
+        "        self.pekerjaan = \"\"\n",
+        "        self.kewarganegaraan = \"\"\n",
+        "        # Must be assigned on self: a bare local is discarded when __init__ returns\n",
+        "        # and would never show up in __dict__.\n",
+        "        self.berlaku_hingga = \"SEUMUR HIDUP\""
+      ],
+      "metadata": {
+        "id": "TLhP9FJH7m5-"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import cv2\n",
+        "import json\n",
+        "import re\n",
+        "import numpy as np\n",
+        "import pytesseract\n",
+        "import matplotlib.pyplot as plt\n",
+        "from PIL import Image\n",
+        "\n",
+        "class KTPOCR(object):\n",
+        "    \"\"\"OCR an Indonesian ID card (KTP) image and parse the text into fields.\n",
+        "\n",
+        "    Constructing an instance reads the image, runs Tesseract with the ``ind``\n",
+        "    language and stores the parsed values on ``self.result`` (a\n",
+        "    ``KTPInformation``). A field whose value cannot be recognised is left\n",
+        "    empty or unchanged instead of raising, because OCR output is noisy.\n",
+        "    \"\"\"\n",
+        "\n",
+        "    def __init__(self, image):\n",
+        "        \"\"\"Load the image at path ``image``, preprocess it and run the pipeline.\"\"\"\n",
+        "        # NOTE(review): cv2.imread returns None for an unreadable path, in which\n",
+        "        # case cvtColor below raises cv2.error.\n",
+        "        self.image = cv2.imread(image)\n",
+        "        self.gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)\n",
+        "        self.th, self.threshed = cv2.threshold(self.gray, 127, 255, cv2.THRESH_TRUNC)\n",
+        "        self.result = KTPInformation()\n",
+        "        self.master_process()\n",
+        "\n",
+        "    def process(self, image):\n",
+        "        \"\"\"Return the raw text Tesseract reads from ``self.threshed``.\n",
+        "\n",
+        "        ``image`` is kept in the signature for existing callers but is not used.\n",
+        "        \"\"\"\n",
+        "        return pytesseract.image_to_string(self.threshed, lang=\"ind\")\n",
+        "\n",
+        "    def word_to_number_converter(self, word):\n",
+        "        \"\"\"Map look-alike characters (e.g. 'O', 'l', '?') to digits via a fixed table.\n",
+        "\n",
+        "        Characters that are not in the table pass through unchanged.\n",
+        "        \"\"\"\n",
+        "        word_dict = {\n",
+        "            \"L\": \"1\",\n",
+        "            \"l\": \"1\",\n",
+        "            \"O\": \"0\",\n",
+        "            \"o\": \"0\",\n",
+        "            \"?\": \"7\",\n",
+        "            \"A\": \"4\",\n",
+        "            \"Z\": \"2\",\n",
+        "            \"z\": \"2\",\n",
+        "            \"S\": \"5\",\n",
+        "            \"s\": \"5\",\n",
+        "            \"b\": \"6\",\n",
+        "            \"B\": \"8\",\n",
+        "            \"G\": \"6\"\n",
+        "        }\n",
+        "        return \"\".join(word_dict.get(letter, letter) for letter in word)\n",
+        "\n",
+        "    @staticmethod\n",
+        "    def _first_match(pattern, text):\n",
+        "        \"\"\"Return the first match of ``pattern`` in ``text``, or \"\" when there is none.\"\"\"\n",
+        "        match = re.search(pattern, text)\n",
+        "        return match.group(0) if match else \"\"\n",
+        "\n",
+        "    @staticmethod\n",
+        "    def _value_after(label, text):\n",
+        "        \"\"\"Return what follows ``label`` in ``text``, without the ':' separator or padding.\"\"\"\n",
+        "        return text.split(label, 1)[-1].lstrip(\" :\").strip()\n",
+        "\n",
+        "    def extract(self, extracted_result):\n",
+        "        \"\"\"Parse the OCR text line by line and store recognised fields on ``self.result``.\n",
+        "\n",
+        "        Each line is routed by the first label it contains, in the order of the\n",
+        "        checks below.\n",
+        "        \"\"\"\n",
+        "        for raw_line in extracted_result.split(\"\\n\"):\n",
+        "            word = self.pun_rem(raw_line)\n",
+        "\n",
+        "            if \"NIK\" in word:\n",
+        "                # Convert the raw line: pun_rem drops \"?\", which the converter\n",
+        "                # has to see in order to map it to \"7\".\n",
+        "                converted = self.word_to_number_converter(raw_line.split(\":\")[-1])\n",
+        "                # The NIK is numeric, so discard whatever is still not a digit.\n",
+        "                self.result.nik = re.sub(r\"[^0-9]\", \"\", converted)\n",
+        "                continue\n",
+        "\n",
+        "            if \"Nama\" in word:\n",
+        "                self.result.nama = self._value_after(\"Nama\", word)\n",
+        "                continue\n",
+        "\n",
+        "            if \"Lahir\" in word:\n",
+        "                value = word.split(\":\")[-1]\n",
+        "                # The date is DD-MM-YYYY; the rest of the value is the birthplace.\n",
+        "                self.result.tanggal_lahir = self._first_match(r\"[0-9]{2}-[0-9]{2}-[0-9]{4}\", value)\n",
+        "                self.result.tempat_lahir = value.replace(self.result.tanggal_lahir, \"\").strip()\n",
+        "                continue\n",
+        "\n",
+        "            if \"Kelamin\" in word or \"Gol\" in word:\n",
+        "                gender = self._first_match(\"LAKI-LAKI|LAKI|LELAKI|PEREMPUAN\", word)\n",
+        "                if gender:\n",
+        "                    self.result.jenis_kelamin = gender\n",
+        "\n",
+        "                if \"Darah\" in word:\n",
+        "                    tokens = word.split(\"Darah\", 1)[1].replace(\":\", \" \").split()\n",
+        "                    blood = next((t for t in tokens if t in (\"AB\", \"A\", \"B\", \"O\", \"0\")), \"\")\n",
+        "                    if blood:\n",
+        "                        # OCR reads blood group O as the digit zero.\n",
+        "                        self.result.golongan_darah = \"O\" if blood == \"0\" else blood\n",
+        "                continue\n",
+        "\n",
+        "            if \"Alamat\" in word:\n",
+        "                # Drop the leading label word and the separator after it.\n",
+        "                self.result.alamat = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
+        "                continue\n",
+        "\n",
+        "            if \"RW\" in word:\n",
+        "                # The first two digit groups are RT and RW; keep their last three digits.\n",
+        "                numbers = re.findall(r\"[0-9]+\", word)\n",
+        "                if len(numbers) >= 2:\n",
+        "                    self.result.rt = numbers[0][-3:]\n",
+        "                    self.result.rw = numbers[1][-3:]\n",
+        "                continue\n",
+        "\n",
+        "            if \"Desa\" in word:\n",
+        "                # The card prints this label as \"Kel/Desa\".\n",
+        "                self.result.kelurahan_atau_desa = self._value_after(\"Desa\", word)\n",
+        "                continue\n",
+        "\n",
+        "            if \"Kecamatan\" in word:\n",
+        "                self.result.kecamatan = self._value_after(\"Kecamatan\", word)\n",
+        "                continue\n",
+        "\n",
+        "            if \"Agama\" in word:\n",
+        "                self.result.agama = self._first_match(\"ISLAM|KRISTEN|KATOLIK|HINDU|BUDDHA|KONG HU CU\", word)\n",
+        "                continue\n",
+        "\n",
+        "            if \"Status\" in word:\n",
+        "                self.result.status_perkawinan = self._first_match(\n",
+        "                    \"BELUM KAWIN|KAWIN|CERAI HIDUP|CERAI MATI|DUDA CERAI|DUDA MATI|JANDA CERAI|JANDA MATI\",\n",
+        "                    word,\n",
+        "                )\n",
+        "                continue\n",
+        "\n",
+        "            if \"Pekerjaan\" in word:\n",
+        "                # Only the first word after the label is kept.\n",
+        "                value = re.sub(r'^\\W*\\w+\\W*', '', word)\n",
+        "                self.result.pekerjaan = value.split(\" \")[0]\n",
+        "                continue\n",
+        "\n",
+        "            if \"Kewarganegaraan\" in word:\n",
+        "                self.result.kewarganegaraan = self._first_match(\"WNI|WNA\", word)\n",
+        "                continue\n",
+        "\n",
+        "    def pun_rem(self, text):\n",
+        "        \"\"\"Return ``text`` without the punctuation characters listed below.\"\"\"\n",
+        "        # Raw string so the backslash is a literal character, not an escape.\n",
+        "        punctuations = r'''!()[]{}'\"\\<>?@#$%^&*_~'''\n",
+        "        return \"\".join(char for char in text if char not in punctuations)\n",
+        "\n",
+        "    def master_process(self):\n",
+        "        \"\"\"Run OCR and parse its output into ``self.result``.\"\"\"\n",
+        "        raw_text = self.process(self.image)\n",
+        "        self.extract(raw_text)\n",
+        "\n",
+        "    def to_json(self):\n",
+        "        \"\"\"Return the parsed fields as a JSON string indented by four spaces.\"\"\"\n",
+        "        return json.dumps(self.result.__dict__, indent=4)"
+      ],
+      "metadata": {
+        "id": "7rZCI9I47oC7"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Run the full pipeline on the sample image and print the parsed fields as JSON.\n",
+        "images = KTPOCR(filePath)\n",
+        "print(images.to_json())"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "MFfCNv4L8MPZ",
+        "outputId": "a6a0f2b0-13e7-4574-a3b9-489fc0b3c00f"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "{\n",
+            "    \"nik\": \"34711140209790001\",\n",
+            "    \"nama\": \"RIYANTO. SE\",\n",
+            "    \"tempat_lahir\": \" GROBOGAN. \",\n",
+            "    \"tanggal_lahir\": \"02-09-1979\",\n",
+            "    \"jenis_kelamin\": \"LAKI-LAKI\",\n",
+            "    \"golongan_darah\": \"\",\n",
+            "    \"alamat\": \"PRM PURI DOMAS D-3. SEMPU\",\n",
+            "    \"rt\": \"001\",\n",
+            "    \"rw\": \"024\",\n",
+            "    \"kelurahan_atau_desa\": \"\",\n",
+            "    \"kecamatan\": \"\",\n",
+            "    \"agama\": \"ISLAM\",\n",
+            "    \"status_perkawinan\": \"KAWIN\",\n",
+            "    \"pekerjaan\": \"PEDAGANG\",\n",
+            "    \"kewarganegaraan\": \"WNI\"\n",
+            "}\n"
+          ]
+        }
+      ]
+    }
+  ]
+}