Skip to content

Instantly share code, notes, and snippets.

@Zsailer
Last active May 11, 2024 15:17
Show Gist options
  • Save Zsailer/6da0dc3c97ec873685b7fe58e52d36d7 to your computer and use it in GitHub Desktop.
Save Zsailer/6da0dc3c97ec873685b7fe58e52d36d7 to your computer and use it in GitHub Desktop.

Revisions

  1. Zsailer revised this gist Aug 27, 2019. 1 changed file with 88 additions and 27 deletions.
    115 changes: 88 additions & 27 deletions schemaorg-pydantic.ipynb
    Original file line number Diff line number Diff line change
    @@ -9,18 +9,19 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
    "import pydantic\n",
    "from pydantic import BaseModel, Schema\n",
    "from pydantic.main import MetaModel\n",
    "from schemaorg import main as schemaorg"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
    @@ -78,7 +79,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
    @@ -130,7 +131,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
    @@ -178,9 +179,41 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "outputs": [
    {
    "data": {
    "text/plain": [
    "{'title': 'Thing',\n",
    " 'description': 'The most generic type of item.',\n",
    " 'type': 'object',\n",
    " 'properties': {'additionalType': {'title': 'additionalType',\n",
    " 'description': \"An additional type for the item, typically used for adding more specific types from external vocabularies in microdata syntax. This is a relationship between something and a class that the thing is in. In RDFa syntax, it is better to use the native RDFa syntax - the 'typeof' attribute - for multiple types. Schema.org tools may have only weaker understanding of extra types, in particular those defined externally.\",\n",
    " 'type': 'string'},\n",
    " 'alternateName': {'title': 'alternateName',\n",
    " 'description': 'An alias for the item.',\n",
    " 'type': 'string'},\n",
    " 'description': {'title': 'Description',\n",
    " 'description': 'A description of the item.',\n",
    " 'type': 'string'},\n",
    " 'disambiguatingDescription': {'title': 'Disambiguatingdescription',\n",
    " 'description': 'A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.',\n",
    " 'type': 'string'}},\n",
    " 'required': ['additionalType',\n",
    " 'alternateName',\n",
    " 'description',\n",
    " 'disambiguatingDescription'],\n",
    " '$id': 'https://schema.org/Thing',\n",
    " '$schema': 'https://schema.org',\n",
    " 'version': 3.9}"
    ]
    },
    "execution_count": 5,
    "metadata": {},
    "output_type": "execute_result"
    }
    ],
    "source": [
    "Thing.schema()"
    ]
    @@ -201,11 +234,22 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "outputs": [
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "Object did not validate.\n"
    ]
    }
    ],
    "source": [
    "thing = Thing()"
    "try: \n",
    " thing = Thing()\n",
    "except pydantic.ValidationError:\n",
    " print(\"Object did not validate.\")"
    ]
    },
    {
    @@ -217,7 +261,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
    @@ -245,7 +289,7 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
    @@ -283,38 +327,55 @@
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "outputs": [
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "Specification base set to http://www.schema.org\n",
    "Using Version 3.5\n",
    "Found http://www.schema.org/Thing\n",
    "Thing: found 12 properties\n",
    "Specification base set to http://www.schema.org\n",
    "Using Version 3.5\n",
    "Found http://www.schema.org/Event\n",
    "Event: found 47 properties\n"
    ]
    }
    ],
    "source": [
    "class Thing(metaclass=SchemaOrg): pass\n",
    "class Event(metaclass=SchemaOrg): pass"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "Thing.schema()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "outputs": [
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "Specification base set to http://www.schema.org\n",
    "Using Version 3.5\n",
    "Found http://www.schema.org/Person\n",
    "Person: found 69 properties\n"
    ]
    }
    ],
    "source": [
    "Event.schema()"
    "class Person(metaclass=SchemaOrg): pass"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3.7",
    "display_name": "omnipotent (Python 3.7)",
    "language": "python",
    "name": "python3"
    "name": "omnipotent"
    },
    "language_info": {
    "codemirror_mode": {
  2. Zsailer revised this gist Aug 27, 2019. 1 changed file with 6 additions and 22 deletions.
    28 changes: 6 additions & 22 deletions schemaorg-pydantic.ipynb
    Original file line number Diff line number Diff line change
    @@ -13,7 +13,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
    "from pydantic import BaseModel, Schema"
    "from pydantic import BaseModel, Schema\n",
    "from pydantic.main import MetaModel\n",
    "from schemaorg import main as schemaorg"
    ]
    },
    {
    @@ -80,8 +82,6 @@
    "metadata": {},
    "outputs": [],
    "source": [
    "from pydantic.main import MetaModel\n",
    "\n",
    "\n",
    "class MetaSchema(MetaModel):\n",
    " \"\"\"Metaclass that checks for three required class attributes:\n",
    @@ -243,15 +243,6 @@
    "## Autogenerate pydantic objects from schema.org"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "from schemaorg import main as schemaorg"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    @@ -317,20 +308,13 @@
    "source": [
    "Event.schema()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "omnipotent (Python 3.7)",
    "display_name": "Python 3.7",
    "language": "python",
    "name": "omnipotent"
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    @@ -347,4 +331,4 @@
    },
    "nbformat": 4,
    "nbformat_minor": 4
    }
    }
  3. Zsailer revised this gist Aug 27, 2019. 1 changed file with 3 additions and 3 deletions.
    6 changes: 3 additions & 3 deletions schemaorg-pydantic.ipynb
    Original file line number Diff line number Diff line change
    @@ -328,9 +328,9 @@
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3.7",
    "display_name": "omnipotent (Python 3.7)",
    "language": "python",
    "name": "python3"
    "name": "omnipotent"
    },
    "language_info": {
    "codemirror_mode": {
    @@ -347,4 +347,4 @@
    },
    "nbformat": 4,
    "nbformat_minor": 4
    }
    }
  4. Zsailer revised this gist Aug 27, 2019. No changes.
  5. Zsailer revised this gist Aug 27, 2019. 1 changed file with 36 additions and 39 deletions.
    75 changes: 36 additions & 39 deletions schemaorg-pydantic.ipynb
    Original file line number Diff line number Diff line change
    @@ -240,7 +240,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Attempting to autogenerate pydantic objects from schema.org"
    "## Autogenerate pydantic objects from schema.org"
    ]
    },
    {
    @@ -258,38 +258,36 @@
    "metadata": {},
    "outputs": [],
    "source": [
    "obj._properties"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "obj = schemaorg.Schema(\"Thing\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "def schematize(obj_name):\n",
    " # Upload schemaorg object.\n",
    " obj = schemaorg.Schema(obj_name)\n",
    " dct = dict(\n",
    " _title=obj_name,\n",
    " __doc__=obj.comment,\n",
    " _id=obj.id,\n",
    " _version=obj.version,\n",
    " )\n",
    " for key, val in obj.items():\n",
    " dct[key] = val\n",
    "class SchemaOrg(MetaSchema):\n",
    " # Metaclass that builds a class's attributes from the schema.org type\n",
    " # whose name equals the class name (looked up with schemaorg.Schema).\n",
    " \n",
    " def __new__(cls, name, base, dct):\n",
    " # NOTE(review): `annotations` is assigned here but never read below.\n",
    " annotations = {}\n",
    " \n",
    " data = schemaorg.Schema(name)\n",
    " \n",
    " # NOTE(review): this rebinds `dct`, so whatever namespace was passed in\n",
    " # for the class body is discarded.\n",
    " dct = dict(\n",
    " _title=name,\n",
    " _id=data.id,\n",
    " _version=float(data.version),\n",
    " _schema=data.base,\n",
    " __doc__=data.comment,\n",
    " __annotations__={}\n",
    " )\n",
    "\n",
    " "
    " # Currently, sets all class variables to type==str for\n",
    " # demonstration purposes.\n",
    " # Need to develop datatypes for Schema.org objects.\n",
    " for key, val in data._properties.items():\n",
    " dct[key] = Schema(\n",
    " ...,\n",
    " description=val['comment'],\n",
    " title=val['label']\n",
    " )\n",
    " dct['__annotations__'][key] = str\n",
    "\n",
    " # BaseModel is prepended to the bases given in the class statement.\n",
    " base = (BaseModel,) + base\n",
    " \n",
    " return super(SchemaOrg, cls).__new__(cls, name, base, dct)\n"
    ]
    },
    {
    @@ -298,7 +296,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
    "x = type('Thing', (MetaSchema,), {})"
    "class Thing(metaclass=SchemaOrg): pass\n",
    "class Event(metaclass=SchemaOrg): pass"
    ]
    },
    {
    @@ -307,7 +306,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
    "x.__name__"
    "Thing.schema()"
    ]
    },
    {
    @@ -316,24 +315,22 @@
    "metadata": {},
    "outputs": [],
    "source": [
    "obj._properties.keys()"
    "Event.schema()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "Thing.schema()"
    ]
    "source": []
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "display_name": "Python 3.7",
    "language": "python",
    "name": "python"
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
  6. Zsailer created this gist Aug 27, 2019.
    353 changes: 353 additions & 0 deletions schemaorg-pydantic.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,353 @@
    {
    "cells": [
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "# Define and validate schema.org structured data in Python with Pydantic "
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "import pydantic\n",
    "from pydantic import BaseModel, Schema"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "class Thing(BaseModel):\n",
    "    \"\"\"The most generic type of item.\"\"\"\n",
    "\n",
    "    # Extra top-level schema keys ($schema, $id) and the schema title have to\n",
    "    # be supplied through the inner Config class.\n",
    "    class Config:\n",
    "        title = 'Thing'\n",
    "        schema_extra = {\n",
    "            '$schema': 'https://schema.org',\n",
    "            '$id': 'https://schema.org/Thing',\n",
    "        }\n",
    "\n",
    "    # Every field is required (`...`) and carries an explicit title equal to\n",
    "    # its property name, so all four fields are titled the same way.\n",
    "    additionalType: str = Schema(\n",
    "        ...,\n",
    "        title='additionalType',\n",
    "        description=(\n",
    "            \"An additional type for the item, typically \"\n",
    "            \"used for adding more specific types from \"\n",
    "            \"external vocabularies in microdata syntax. \"\n",
    "            \"This is a relationship between something and \"\n",
    "            \"a class that the thing is in. In RDFa syntax, \"\n",
    "            \"it is better to use the native RDFa syntax - \"\n",
    "            \"the 'typeof' attribute - for multiple types. \"\n",
    "            \"Schema.org tools may have only weaker \"\n",
    "            \"understanding of extra types, in particular \"\n",
    "            \"those defined externally.\"\n",
    "        )\n",
    "    )\n",
    "\n",
    "    alternateName: str = Schema(\n",
    "        ...,\n",
    "        title='alternateName',\n",
    "        description=\"An alias for the item.\"\n",
    "    )\n",
    "\n",
    "    description: str = Schema(\n",
    "        ...,\n",
    "        title='description',\n",
    "        description=\"A description of the item.\"\n",
    "    )\n",
    "\n",
    "    disambiguatingDescription: str = Schema(\n",
    "        ...,\n",
    "        title='disambiguatingDescription',\n",
    "        description=(\n",
    "            \"A sub property of description. A short description \"\n",
    "            \"of the item used to disambiguate from other, similar \"\n",
    "            \"items. Information from other properties (in \"\n",
    "            \"particular, name) may be necessary for the \"\n",
    "            \"description to be useful for disambiguation.\"\n",
    "        )\n",
    "    )"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "I want to enforce the attributes set in the `Config` inner class, *and* I think the inner-class syntax is a bit ugly. Let's see if we can make it go away."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "from pydantic.main import MetaModel\n",
    "\n",
    "\n",
    "class MetaSchema(MetaModel):\n",
    "    \"\"\"Metaclass that checks for four required class attributes:\n",
    "        1. _id: the ID of the schema (emitted as `$id`).\n",
    "        2. _version: the version of the current schema (emitted as `version`).\n",
    "        3. _title: the name of the schema (used as the schema title).\n",
    "        4. _schema: the value emitted as `$schema`.\n",
    "\n",
    "    These attributes are mapped to pydantic.BaseModel's `Config` inner class\n",
    "    for proper schema generation+validation.\n",
    "\n",
    "    Raises:\n",
    "        AttributeError: if any of the four attributes is absent from the\n",
    "            class body.\n",
    "        TypeError: if an attribute is neither None nor of the expected type\n",
    "            (str for _id/_title/_schema, float for _version).\n",
    "    \"\"\"\n",
    "    def __new__(cls, name, base, dct):\n",
    "        # Check that required keys are found.\n",
    "        if not all((key in dct for key in ['_id', '_title', '_version', '_schema'])):\n",
    "            raise AttributeError('Required class attributes are missing from the {} class.'.format(name))\n",
    "\n",
    "        # Check that keys are the proper types. Exact type matches are used,\n",
    "        # so an int _version is rejected; None is accepted for each key.\n",
    "        if not all((\n",
    "            type(dct['_id']) in (str, type(None)),\n",
    "            type(dct['_version']) in (float, type(None)),\n",
    "            type(dct['_title']) in (str, type(None)),\n",
    "            type(dct['_schema']) in (str, type(None)),\n",
    "        )):\n",
    "            raise TypeError(\n",
    "                'Check the class attribute types: \"_id\", \"_title\" and \"_schema\" '\n",
    "                'must be strings and \"_version\" must be a float (None is also allowed).'\n",
    "            )\n",
    "\n",
    "        # Add a Config inner class to this Pydantic model.\n",
    "        class Config:\n",
    "            title = dct['_title']\n",
    "            schema_extra = {\n",
    "                '$id': dct['_id'],\n",
    "                '$schema': dct['_schema'],\n",
    "                'version': dct['_version']\n",
    "            }\n",
    "\n",
    "        dct['Config'] = Config\n",
    "        return super(MetaSchema, cls).__new__(cls, name, base, dct)\n",
    "\n",
    "\n",
    "# BaseModel is referenced by its imported name: only\n",
    "# `from pydantic import BaseModel, Schema` is guaranteed to have run, so\n",
    "# `pydantic.BaseModel` would raise NameError on a fresh kernel.\n",
    "class JsonSchema(BaseModel, metaclass=MetaSchema):\n",
    "    \"\"\"A pydantic base Model for JSON schemas.\"\"\"\n",
    "    # Placeholders that satisfy MetaSchema's checks; subclasses override them.\n",
    "    _id: str = None\n",
    "    _version: float = None\n",
    "    _title: str = None\n",
    "    _schema: str = None"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "class Thing(JsonSchema):\n",
    "    \"\"\"The most generic type of item.\"\"\"\n",
    "    # Define top level attributes; the MetaSchema metaclass copies them into\n",
    "    # the Config inner class.\n",
    "    _id = 'https://schema.org/Thing'\n",
    "    _version = 3.9\n",
    "    _title = 'Thing'\n",
    "    _schema = 'https://schema.org'\n",
    "\n",
    "    # Every field is required (`...`) and carries an explicit title equal to\n",
    "    # its property name, so all four fields are titled the same way.\n",
    "    additionalType: str = Schema(\n",
    "        ...,\n",
    "        title='additionalType',\n",
    "        description=(\n",
    "            \"An additional type for the item, typically \"\n",
    "            \"used for adding more specific types from \"\n",
    "            \"external vocabularies in microdata syntax. \"\n",
    "            \"This is a relationship between something and \"\n",
    "            \"a class that the thing is in. In RDFa syntax, \"\n",
    "            \"it is better to use the native RDFa syntax - \"\n",
    "            \"the 'typeof' attribute - for multiple types. \"\n",
    "            \"Schema.org tools may have only weaker \"\n",
    "            \"understanding of extra types, in particular \"\n",
    "            \"those defined externally.\"\n",
    "        )\n",
    "    )\n",
    "\n",
    "    alternateName: str = Schema(\n",
    "        ...,\n",
    "        title='alternateName',\n",
    "        description=\"An alias for the item.\"\n",
    "    )\n",
    "\n",
    "    description: str = Schema(\n",
    "        ...,\n",
    "        title='description',\n",
    "        description=\"A description of the item.\"\n",
    "    )\n",
    "\n",
    "    disambiguatingDescription: str = Schema(\n",
    "        ...,\n",
    "        title='disambiguatingDescription',\n",
    "        description=(\n",
    "            \"A sub property of description. A short description \"\n",
    "            \"of the item used to disambiguate from other, similar \"\n",
    "            \"items. Information from other properties (in \"\n",
    "            \"particular, name) may be necessary for the \"\n",
    "            \"description to be useful for disambiguation.\"\n",
    "        )\n",
    "    )"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "Thing.schema()"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Validate a new object"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "What happens when we create an invalid object?"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Thing() supplies none of the required fields, so construction fails.\n",
    "# Catch the error and show it so the notebook still runs top to bottom.\n",
    "try:\n",
    "    thing = Thing()\n",
    "except pydantic.ValidationError as err:\n",
    "    print(err)"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "Now let's try a valid object..."
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "thing = Thing(\n",
    " alternateName='New Thing',\n",
    " description='This is a new thing',\n",
    " disambiguatingDescription='This thing is unique.',\n",
    " additionalType='No additional type'\n",
    ")"
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "No error was raised."
    ]
    },
    {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
    "## Attempting to autogenerate pydantic objects from schema.org"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "from schemaorg import main as schemaorg"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "# Create the object first so the next cell can inspect it on a fresh kernel.\n",
    "obj = schemaorg.Schema(\"Thing\")"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "obj._properties"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "def schematize(obj_name):\n",
    "    \"\"\"Collect the class attributes for a schema.org type.\n",
    "\n",
    "    Looks up ``obj_name`` with ``schemaorg.Schema`` and returns a dict that\n",
    "    holds ``_title``, ``__doc__``, ``_id`` and ``_version`` plus one entry\n",
    "    per item of the object's ``_properties``.\n",
    "    \"\"\"\n",
    "    # Load the schemaorg object.\n",
    "    obj = schemaorg.Schema(obj_name)\n",
    "    dct = dict(\n",
    "        _title=obj_name,\n",
    "        __doc__=obj.comment,\n",
    "        _id=obj.id,\n",
    "        _version=obj.version,\n",
    "    )\n",
    "    # Properties are read from the `_properties` mapping.\n",
    "    for key, val in obj._properties.items():\n",
    "        dct[key] = val\n",
    "    # Hand the collected attributes back instead of discarding them.\n",
    "    return dct"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "x = type('Thing', (MetaSchema,), {})"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "x.__name__"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "obj._properties.keys()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
    "Thing.schema()"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.7.1"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 4
    }