Skip to content

Instantly share code, notes, and snippets.

@masitings
Created January 26, 2024 09:47
Show Gist options
  • Save masitings/9dab604147dfe269d6f895f41f872991 to your computer and use it in GitHub Desktop.
Save masitings/9dab604147dfe269d6f895f41f872991 to your computer and use it in GitHub Desktop.

Revisions

  1. masitings revised this gist Jan 26, 2024. 1 changed file with 12 additions and 1 deletion.
    13 changes: 12 additions & 1 deletion yt_ocr.ipynb
    Original file line number Diff line number Diff line change
    @@ -4,7 +4,8 @@
    "metadata": {
    "colab": {
    "provenance": [],
    "authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4"
    "authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4",
    "include_colab_link": true
    },
    "kernelspec": {
    "name": "python3",
    @@ -15,6 +16,16 @@
    }
    },
    "cells": [
    {
    "cell_type": "markdown",
    "metadata": {
    "id": "view-in-github",
    "colab_type": "text"
    },
    "source": [
    "<a href=\"https://colab.research.google.com/gist/masitings/9dab604147dfe269d6f895f41f872991/yt_ocr.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
  2. masitings created this gist Jan 26, 2024.
    415 changes: 415 additions & 0 deletions yt_ocr.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,415 @@
    {
    "nbformat": 4,
    "nbformat_minor": 0,
    "metadata": {
    "colab": {
    "provenance": [],
    "authorship_tag": "ABX9TyNEBoY32Js79QI9AsomXvp4"
    },
    "kernelspec": {
    "name": "python3",
    "display_name": "Python 3"
    },
    "language_info": {
    "name": "python"
    }
    },
    "cells": [
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
    "colab": {
    "base_uri": "https://localhost:8080/"
    },
    "id": "YMViZtqR340x",
    "outputId": "402ab93d-5d98-492a-9e21-004c4effa942"
    },
    "outputs": [
    {
    "output_type": "stream",
    "name": "stdout",
    "text": [
    "Mounted at /content/drive\n"
    ]
    }
    ],
    "source": [
    "from google.colab import drive\n",
    "import os\n",
    "\n",
    "# Attach Google Drive at /content/drive.\n",
    "drive.mount(mountpoint='/content/drive')"
    ]
    },
    {
    "cell_type": "code",
    "source": [
    "# -y answers the apt confirmation prompt so the cell runs unattended.\n",
    "!sudo apt-get install -y tesseract-ocr-ind\n",
    "# %pip installs into the running kernel's environment; pytesseract is pinned to the version recorded in this cell's output.\n",
    "%pip install pytesseract==0.3.10\n",
    "%pip install protobuf"
    ],
    "metadata": {
    "colab": {
    "base_uri": "https://localhost:8080/"
    },
    "id": "JLCHyiEf4YyX",
    "outputId": "f96fe312-5c40-48da-ee6d-4f53b8a39aec"
    },
    "execution_count": null,
    "outputs": [
    {
    "output_type": "stream",
    "name": "stdout",
    "text": [
    "Reading package lists... Done\n",
    "Building dependency tree... Done\n",
    "Reading state information... Done\n",
    "The following additional packages will be installed:\n",
    " tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd\n",
    "The following NEW packages will be installed:\n",
    " tesseract-ocr tesseract-ocr-eng tesseract-ocr-ind tesseract-ocr-osd\n",
    "0 upgraded, 4 newly installed, 0 to remove and 30 not upgraded.\n",
    "Need to get 5,353 kB of archives.\n",
    "After this operation, 16.8 MB of additional disk space will be used.\n",
    "Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]\n",
    "Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]\n",
    "Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]\n",
    "Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-ind all 1:4.00~git30-7274cfa-1.1 [537 kB]\n",
    "Fetched 5,353 kB in 1s (8,532 kB/s)\n",
    "debconf: unable to initialize frontend: Dialog\n",
    "debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 4.)\n",
    "debconf: falling back to frontend: Readline\n",
    "debconf: unable to initialize frontend: Readline\n",
    "debconf: (This frontend requires a controlling tty.)\n",
    "debconf: falling back to frontend: Teletype\n",
    "dpkg-preconfigure: unable to re-open stdin: \n",
    "Selecting previously unselected package tesseract-ocr-eng.\n",
    "(Reading database ... 121671 files and directories currently installed.)\n",
    "Preparing to unpack .../tesseract-ocr-eng_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
    "Unpacking tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
    "Selecting previously unselected package tesseract-ocr-osd.\n",
    "Preparing to unpack .../tesseract-ocr-osd_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
    "Unpacking tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
    "Selecting previously unselected package tesseract-ocr.\n",
    "Preparing to unpack .../tesseract-ocr_4.1.1-2.1build1_amd64.deb ...\n",
    "Unpacking tesseract-ocr (4.1.1-2.1build1) ...\n",
    "Selecting previously unselected package tesseract-ocr-ind.\n",
    "Preparing to unpack .../tesseract-ocr-ind_1%3a4.00~git30-7274cfa-1.1_all.deb ...\n",
    "Unpacking tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
    "Setting up tesseract-ocr-eng (1:4.00~git30-7274cfa-1.1) ...\n",
    "Setting up tesseract-ocr-ind (1:4.00~git30-7274cfa-1.1) ...\n",
    "Setting up tesseract-ocr-osd (1:4.00~git30-7274cfa-1.1) ...\n",
    "Setting up tesseract-ocr (4.1.1-2.1build1) ...\n",
    "Processing triggers for man-db (2.10.2-1) ...\n",
    "Collecting pytesseract\n",
    " Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)\n",
    "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.2)\n",
    "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (9.4.0)\n",
    "Installing collected packages: pytesseract\n",
    "Successfully installed pytesseract-0.3.10\n",
    "Requirement already satisfied: protobuf in /usr/local/lib/python3.10/dist-packages (3.20.3)\n"
    ]
    }
    ]
    },
    {
    "cell_type": "code",
    "source": [
    "import cv2\n",
    "import numpy as np\n",
    "import pytesseract\n",
    "import pandas as pd\n",
    "from PIL import Image\n",
    "import matplotlib.pyplot as plt"
    ],
    "metadata": {
    "id": "EyApGvkG4wsy"
    },
    "execution_count": null,
    "outputs": []
    },
    {
    "cell_type": "code",
    "source": [
    "# Quick look at the raw OCR text of the sample card.\n",
    "FILE_PATH = '/content/drive/MyDrive/ocr/dataset'\n",
    "filePath = os.path.join(FILE_PATH, 'ktp.png')\n",
    "\n",
    "img = cv2.imread(filePath)\n",
    "if img is None:\n",
    "    # cv2.imread returns None instead of raising when the file cannot be read.\n",
    "    raise FileNotFoundError(f\"Could not read image: {filePath}\")\n",
    "gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
    "th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_TRUNC)\n",
    "\n",
    "result = pytesseract.image_to_string(threshed, lang=\"ind\")\n",
    "\n",
    "for word in result.split(\"\\n\"):\n",
    "    if \"”—\" in word:\n",
    "        word = word.replace(\"”—\", \":\")\n",
    "    if \"NIK\" in word:\n",
    "        # Digit look-alike fixes apply to the NIK line only; the recorded\n",
    "        # output leaves \"D\" untouched on every other line.\n",
    "        word = word.replace(\"?\", \"7\").replace(\"D\", \"0\")\n",
    "\n",
    "    print(word)\n"
    ],
    "metadata": {
    "colab": {
    "base_uri": "https://localhost:8080/"
    },
    "id": "WTWGqHTT5NAh",
    "outputId": "07bff621-dd0a-4d71-80b4-1f8ac1bfa47a"
    },
    "execution_count": null,
    "outputs": [
    {
    "output_type": "stream",
    "name": "stdout",
    "text": [
    "em\n",
    "\n",
    "PROVINSI DAERAH ISTIMEWA YOGYAKARTA\n",
    "KABUPATEN SLEMAN\n",
    "\n",
    " \n",
    "\n",
    "NIK : 34711140209790001\n",
    "\n",
    "Nama :RIYANTO. SE\n",
    "\n",
    "Tempat/Tgl Lahir : GROBOGAN. 02-09-1979\n",
    "\n",
    "Jenis Kelamin : LAKI-LAKI Gol Darah : 0\n",
    "\n",
    "Alamat PRM PURI DOMAS D-3. SEMPU\n",
    "RTRW 1001 1024\n",
    "\n",
    "Kel/Desa : WEDOMARTANI!\n",
    "Kecamatan : NGEMPLAK\n",
    "\n",
    "Agama \"ISLAM\n",
    "Status Bean KAWIN SLEMAN\n",
    "Pekerjaan : PEDAGANG 05-06-2012\n",
    "\n",
    "Kewarganegaraan: WNI HI —\n",
    "Berlaku Hingga :02-09-2017 NIA\n",
    "\n",
    " \n",
    " \n",
    "\n",
    " \n",
    "\f\n"
    ]
    }
    ]
    },
    {
    "cell_type": "code",
    "source": [
    "class KTPInformation(object):\n",
    "    \"\"\"Plain container for the fields parsed from a KTP card.\n",
    "\n",
    "    Every field starts as an empty string except ``berlaku_hingga``,\n",
    "    which defaults to \"SEUMUR HIDUP\".\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        self.nik = \"\"\n",
    "        self.nama = \"\"\n",
    "        self.tempat_lahir = \"\"\n",
    "        self.tanggal_lahir = \"\"\n",
    "        self.jenis_kelamin = \"\"\n",
    "        self.golongan_darah = \"\"\n",
    "        self.alamat = \"\"\n",
    "        self.rt = \"\"\n",
    "        self.rw = \"\"\n",
    "        self.kelurahan_atau_desa = \"\"\n",
    "        self.kecamatan = \"\"\n",
    "        self.agama = \"\"\n",
    "        self.status_perkawinan = \"\"\n",
    "        self.pekerjaan = \"\"\n",
    "        self.kewarganegaraan = \"\"\n",
    "        # Must be set on self: a bare local here is discarded when\n",
    "        # __init__ returns and never reaches __dict__ / the JSON output.\n",
    "        self.berlaku_hingga = \"SEUMUR HIDUP\""
    ],
    "metadata": {
    "id": "TLhP9FJH7m5-"
    },
    "execution_count": null,
    "outputs": []
    },
    {
    "cell_type": "code",
    "source": [
    "import cv2\n",
    "import json\n",
    "import re\n",
    "import numpy as np\n",
    "import pytesseract\n",
    "import matplotlib.pyplot as plt\n",
    "from PIL import Image\n",
    "\n",
    "class KTPOCR(object):\n",
    "    \"\"\"OCR a KTP image with Tesseract and parse the text into fields.\n",
    "\n",
    "    Creating an instance reads the image, runs OCR and fills\n",
    "    ``self.result``; ``to_json()`` serialises the parsed fields.\n",
    "    \"\"\"\n",
    "\n",
    "    # Characters removed from every OCR line before label matching.\n",
    "    _PUNCTUATION = r'''!()[]{}'\"\\<>?@#$%^&*_~'''\n",
    "\n",
    "    # Look-alike characters and the digit each one is replaced with.\n",
    "    _DIGIT_LOOKALIKES = {\n",
    "        \"L\": \"1\", \"l\": \"1\",\n",
    "        \"O\": \"0\", \"o\": \"0\",\n",
    "        \"?\": \"7\",\n",
    "        \"A\": \"4\",\n",
    "        \"Z\": \"2\", \"z\": \"2\",\n",
    "        \"S\": \"5\", \"s\": \"5\",\n",
    "        \"b\": \"6\", \"B\": \"8\",\n",
    "        \"G\": \"6\",\n",
    "    }\n",
    "\n",
    "    def __init__(self, image):\n",
    "        \"\"\"Load the image at path ``image``, pre-process it and run the pipeline.\n",
    "\n",
    "        Raises:\n",
    "            FileNotFoundError: if OpenCV cannot read ``image``.\n",
    "        \"\"\"\n",
    "        self.image = cv2.imread(image)\n",
    "        if self.image is None:\n",
    "            # cv2.imread returns None instead of raising on an unreadable path.\n",
    "            raise FileNotFoundError(\"Could not read image: {}\".format(image))\n",
    "        self.gray = cv2.cvtColor(self.image, cv2.COLOR_BGR2GRAY)\n",
    "        self.th, self.threshed = cv2.threshold(self.gray, 127, 255, cv2.THRESH_TRUNC)\n",
    "        self.result = KTPInformation()\n",
    "        self.master_process()\n",
    "\n",
    "    def process(self, image=None):\n",
    "        \"\"\"Return the raw OCR text of the thresholded image.\n",
    "\n",
    "        ``image`` is accepted for backward compatibility but is not used;\n",
    "        OCR always runs on ``self.threshed``.\n",
    "        \"\"\"\n",
    "        return pytesseract.image_to_string(self.threshed, lang=\"ind\")\n",
    "\n",
    "    def word_to_number_converter(self, word):\n",
    "        \"\"\"Return ``word`` with digit look-alike characters replaced by digits.\"\"\"\n",
    "        return \"\".join(self._DIGIT_LOOKALIKES.get(ch, ch) for ch in word)\n",
    "\n",
    "    @staticmethod\n",
    "    def _field_value(line):\n",
    "        \"\"\"Return the text after a line's label, stripped of whitespace.\n",
    "\n",
    "        The label ends at the first ':'; without one, the first\n",
    "        whitespace-delimited token is treated as the label.\n",
    "        \"\"\"\n",
    "        _, colon, value = line.partition(\":\")\n",
    "        if not colon:\n",
    "            tokens = line.strip().split(None, 1)\n",
    "            value = tokens[1] if len(tokens) > 1 else \"\"\n",
    "        return value.strip()\n",
    "\n",
    "    @staticmethod\n",
    "    def _first_match(pattern, text):\n",
    "        \"\"\"Return the first match of ``pattern`` in ``text``, or '' if none.\"\"\"\n",
    "        found = re.search(pattern, text)\n",
    "        return found.group(0) if found else \"\"\n",
    "\n",
    "    def extract(self, extracted_result):\n",
    "        \"\"\"Parse raw OCR text line by line into ``self.result``.\n",
    "\n",
    "        Each line is matched by the label it contains. A value that cannot\n",
    "        be parsed is stored as an empty string instead of raising.\n",
    "        \"\"\"\n",
    "        for raw_line in extracted_result.split(\"\\n\"):\n",
    "            word = self.pun_rem(raw_line)\n",
    "\n",
    "            if \"NIK\" in word:\n",
    "                # Convert look-alikes on the raw text first: pun_rem drops\n",
    "                # \"?\", which the converter is meant to turn into \"7\".\n",
    "                nik_text = self._field_value(raw_line).replace(\" \", \"\")\n",
    "                self.result.nik = self.pun_rem(self.word_to_number_converter(nik_text))\n",
    "                continue\n",
    "\n",
    "            if \"Nama\" in word:\n",
    "                self.result.nama = self._field_value(word)\n",
    "                continue\n",
    "\n",
    "            if \"Lahir\" in word:\n",
    "                value = self._field_value(word)\n",
    "                birth_date = self._first_match(r\"[0-9]{2}-[0-9]{2}-[0-9]{4}\", value)\n",
    "                place = value.replace(birth_date, \"\") if birth_date else value\n",
    "                self.result.tanggal_lahir = birth_date\n",
    "                # Drop the separator left between place and date\n",
    "                # (the recorded OCR output has \".\" there).\n",
    "                self.result.tempat_lahir = place.strip(\" ,.\")\n",
    "                continue\n",
    "\n",
    "            if \"Gol\" in word:\n",
    "                self.result.jenis_kelamin = self._first_match(r\"(LAKI-LAKI|LAKI|LELAKI|PEREMPUAN)\", word)\n",
    "                blood = re.search(r\"Darah\\W*(AB|A|B|O|0)\\b\", word)\n",
    "                if blood:\n",
    "                    # OCR may read the letter O as the digit 0.\n",
    "                    self.result.golongan_darah = \"O\" if blood.group(1) == \"0\" else blood.group(1)\n",
    "                else:\n",
    "                    self.result.golongan_darah = \"\"\n",
    "                continue\n",
    "\n",
    "            if \"Alamat\" in word:\n",
    "                self.result.alamat = self._field_value(word)\n",
    "                continue\n",
    "\n",
    "            if \"RW\" in word:\n",
    "                # Take the first two digit groups; keep the last three digits of\n",
    "                # each because OCR can glue a stray leading digit onto them.\n",
    "                numbers = re.findall(r\"[0-9]+\", word)\n",
    "                if len(numbers) >= 2:\n",
    "                    self.result.rt = numbers[0][-3:]\n",
    "                    self.result.rw = numbers[1][-3:]\n",
    "                continue\n",
    "\n",
    "            if \"Desa\" in word or \"Kel/\" in word:\n",
    "                self.result.kelurahan_atau_desa = self._field_value(word)\n",
    "                continue\n",
    "\n",
    "            if \"Kecamatan\" in word:\n",
    "                self.result.kecamatan = self._field_value(word)\n",
    "                continue\n",
    "\n",
    "            if \"Agama\" in word:\n",
    "                self.result.agama = self._first_match(r\"(ISLAM|KRISTEN|KATOLIK|HINDU|BUDDHA|KONG HU CU)\", word)\n",
    "                continue\n",
    "\n",
    "            if \"Status\" in word:\n",
    "                self.result.status_perkawinan = self._first_match(\n",
    "                    r\"(BELUM KAWIN|KAWIN|CERAI HIDUP|CERAI MATI|DUDA CERAI|DUDA MATI|JANDA CERAI|JANDA MATI)\",\n",
    "                    word,\n",
    "                )\n",
    "                continue\n",
    "\n",
    "            if \"Pekerjaan\" in word:\n",
    "                # Only the first token is kept; the rest of the line can hold\n",
    "                # unrelated text (the recorded output has a date after the job).\n",
    "                tokens = self._field_value(word).split()\n",
    "                self.result.pekerjaan = tokens[0] if tokens else \"\"\n",
    "                continue\n",
    "\n",
    "            if \"Kewarganegaraan\" in word:\n",
    "                self.result.kewarganegaraan = self._first_match(r\"(WNI|WNA)\", word)\n",
    "                continue\n",
    "\n",
    "    def pun_rem(self, text):\n",
    "        \"\"\"Return ``text`` with every character in ``_PUNCTUATION`` removed.\"\"\"\n",
    "        return \"\".join(ch for ch in text if ch not in self._PUNCTUATION)\n",
    "\n",
    "    def master_process(self):\n",
    "        \"\"\"OCR the loaded image and parse the resulting text.\"\"\"\n",
    "        self.extract(self.process(self.image))\n",
    "\n",
    "    def to_json(self):\n",
    "        \"\"\"Return the parsed fields as a JSON object string (4-space indent).\"\"\"\n",
    "        return json.dumps(vars(self.result), indent=4)"
    ],
    "metadata": {
    "id": "7rZCI9I47oC7"
    },
    "execution_count": null,
    "outputs": []
    },
    {
    "cell_type": "code",
    "source": [
    "# Run the OCR pipeline on the sample card and print the parsed fields as JSON.\n",
    "images = KTPOCR(filePath)\n",
    "print(images.to_json())"
    ],
    "metadata": {
    "colab": {
    "base_uri": "https://localhost:8080/"
    },
    "id": "MFfCNv4L8MPZ",
    "outputId": "a6a0f2b0-13e7-4574-a3b9-489fc0b3c00f"
    },
    "execution_count": null,
    "outputs": [
    {
    "output_type": "stream",
    "name": "stdout",
    "text": [
    "{\n",
    " \"nik\": \"34711140209790001\",\n",
    " \"nama\": \"RIYANTO. SE\",\n",
    " \"tempat_lahir\": \" GROBOGAN. \",\n",
    " \"tanggal_lahir\": \"02-09-1979\",\n",
    " \"jenis_kelamin\": \"LAKI-LAKI\",\n",
    " \"golongan_darah\": \"\",\n",
    " \"alamat\": \"PRM PURI DOMAS D-3. SEMPU\",\n",
    " \"rt\": \"001\",\n",
    " \"rw\": \"024\",\n",
    " \"kelurahan_atau_desa\": \"\",\n",
    " \"kecamatan\": \"\",\n",
    " \"agama\": \"ISLAM\",\n",
    " \"status_perkawinan\": \"KAWIN\",\n",
    " \"pekerjaan\": \"PEDAGANG\",\n",
    " \"kewarganegaraan\": \"WNI\"\n",
    "}\n"
    ]
    }
    ]
    }
    ]
    }