{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Define and validate schema.org structured data in Python with Pydantic " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pydantic\n", "from pydantic import BaseModel, Schema\n", "from pydantic.main import MetaModel\n", "from schemaorg import main as schemaorg" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "class Thing(BaseModel):\n", " \"\"\"The most generic type of item.\"\"\"\n", " \n", " # Need to define extra items at the top level \n", " class Config:\n", " title = 'Thing'\n", " schema_extra = {\n", " '$schema': 'https://schema.org',\n", " '$id': 'https://schema.org/Thing',\n", " }\n", " \n", " additionalType: str = Schema(\n", " ...,\n", " title='additionalType',\n", " description=(\n", " \"An additional type for the item, typically \"\n", " \"used for adding more specific types from \"\n", " \"external vocabularies in microdata syntax. \"\n", " \"This is a relationship between something and \"\n", " \"a class that the thing is in. In RDFa syntax, \"\n", " \"it is better to use the native RDFa syntax - \"\n", " \"the 'typeof' attribute - for multiple types. \"\n", " \"Schema.org tools may have only weaker \"\n", " \"understanding of extra types, in particular \"\n", " \"those defined externally.\"\n", " )\n", " )\n", " \n", " alternateName: str = Schema(\n", " ...,\n", " title='alternateName',\n", " description=\"An alias for the item.\"\n", " )\n", " \n", " description: str = Schema(\n", " ...,\n", " description=\"A description of the item.\"\n", " )\n", " \n", " disambiguatingDescription: str = Schema(\n", " ...,\n", " description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n", " )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "I want to enforce the attributes in the `Config` accessor, *and* I think the accessor syntax is a bit ugly. Let's see if we can make it go away." ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "\n", "class MetaSchema(MetaModel):\n", " \"\"\"Metaclass that checks for three required class attributes:\n", " 1. _id: the ID of the event\n", " 2. _version: the version of the current schema.\n", " 3. _title: the name of the schema.\n", "\n", " These attribute are mapped to pydantic.BaseModel's `Config` inner class\n", " for proper schema generation+validation.\n", " \"\"\"\n", " def __new__(cls, name, base, dct):\n", " # Check that required keys are found.\n", " if not all((key in dct for key in ['_id', '_title', '_version', '_schema'])):\n", " raise AttributeError('Required class attributes are missing from the {} class.'.format(name))\n", "\n", " # Check that keys are the proper types.\n", " if not all((\n", " type(dct['_id']) in (str, type(None)),\n", " type(dct['_version']) in (float, type(None)),\n", " type(dct['_title']) in (str, type(None)),\n", " type(dct['_schema']) in (str, type(None)),\n", " )):\n", " raise TypeError('Check the class attributes types: \"_id\" must be a string, '\n", " '\"_version\" must be an integer, and \"_title\" must be a string.')\n", "\n", " # Add a Config inner class to this Pydantic model.\n", " class Config:\n", " title = dct['_title']\n", " schema_extra = {\n", " '$id': dct['_id'],\n", " '$schema': dct['_schema'],\n", " 'version': dct['_version']\n", " }\n", "\n", " dct['Config'] = Config\n", " return super(MetaSchema, cls).__new__(cls, name, base, dct)\n", "\n", "\n", "class JsonSchema(pydantic.BaseModel, metaclass=MetaSchema):\n", " \"\"\"A pydantic base Model for JSON schemas.\"\"\"\n", " _id: str = None\n", " _version: float = None\n", " _title: str = None\n", " _schema: str = None" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "class Thing(JsonSchema):\n", " \"\"\"The most generic type of item.\"\"\"\n", " # Define top level attributes\n", " _id = 'https://schema.org/Thing'\n", " _version = 3.9\n", " _title = 'Thing'\n", " _schema = 'https://schema.org'\n", "\n", " additionalType: str = Schema(\n", " ...,\n", " title='additionalType',\n", " description=(\n", " \"An additional type for the item, typically \"\n", " \"used for adding more specific types from \"\n", " \"external vocabularies in microdata syntax. \"\n", " \"This is a relationship between something and \"\n", " \"a class that the thing is in. In RDFa syntax, \"\n", " \"it is better to use the native RDFa syntax - \"\n", " \"the 'typeof' attribute - for multiple types. \"\n", " \"Schema.org tools may have only weaker \"\n", " \"understanding of extra types, in particular \"\n", " \"those defined externally.\"\n", " )\n", " )\n", " \n", " alternateName: str = Schema(\n", " ...,\n", " title='alternateName',\n", " description=\"An alias for the item.\"\n", " )\n", " \n", " description: str = Schema(\n", " ...,\n", " description=\"A description of the item.\"\n", " )\n", " \n", " disambiguatingDescription: str = Schema(\n", " ...,\n", " description=\"A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.\"\n", " )" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'title': 'Thing',\n", " 'description': 'The most generic type of item.',\n", " 'type': 'object',\n", " 'properties': {'additionalType': {'title': 'additionalType',\n", " 'description': \"An additional type for the item, typically used for adding more specific types from external vocabularies in microdata syntax. This is a relationship between something and a class that the thing is in. In RDFa syntax, it is better to use the native RDFa syntax - the 'typeof' attribute - for multiple types. Schema.org tools may have only weaker understanding of extra types, in particular those defined externally.\",\n", " 'type': 'string'},\n", " 'alternateName': {'title': 'alternateName',\n", " 'description': 'An alias for the item.',\n", " 'type': 'string'},\n", " 'description': {'title': 'Description',\n", " 'description': 'A description of the item.',\n", " 'type': 'string'},\n", " 'disambiguatingDescription': {'title': 'Disambiguatingdescription',\n", " 'description': 'A sub property of description. A short description of the item used to disambiguate from other, similar items. Information from other properties (in particular, name) may be necessary for the description to be useful for disambiguation.',\n", " 'type': 'string'}},\n", " 'required': ['additionalType',\n", " 'alternateName',\n", " 'description',\n", " 'disambiguatingDescription'],\n", " '$id': 'https://schema.org/Thing',\n", " '$schema': 'https://schema.org',\n", " 'version': 3.9}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Thing.schema()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Validate a new object" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happens when we create an invalid object?" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Object did not validate.\n" ] } ], "source": [ "try: \n", " thing = Thing()\n", "except pydantic.ValidationError:\n", " print(\"Object did not validate.\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's try a valid object..." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "thing = Thing(\n", " alternateName='New Thing',\n", " description='This is a new thing',\n", " disambiguatingDescription='This thing is unique.',\n", " additionalType='No additional type'\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "No error was raised." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Autogenerate pydantic objects from schema.org" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "class SchemaOrg(MetaSchema):\n", " \n", " def __new__(cls, name, base, dct):\n", " annotations = {}\n", " \n", " data = schemaorg.Schema(name)\n", " \n", " dct = dict(\n", " _title=name,\n", " _id=data.id,\n", " _version=float(data.version),\n", " _schema=data.base,\n", " __doc__=data.comment,\n", " __annotations__={}\n", " )\n", "\n", " # Currently, sets all class variables to type==str for\n", " # demostration purposes.\n", " # Need to develop datatypes for Schema.org objects.\n", " for key, val in data._properties.items():\n", " dct[key] = Schema(\n", " ...,\n", " description=val['comment'],\n", " title=val['label']\n", " )\n", " dct['__annotations__'][key] = str\n", "\n", " base = (BaseModel,) + base\n", " \n", " return super(SchemaOrg, cls).__new__(cls, name, base, dct)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Specification base set to http://www.schema.org\n", "Using Version 3.5\n", "Found http://www.schema.org/Thing\n", "Thing: found 12 properties\n", "Specification base set to http://www.schema.org\n", "Using Version 3.5\n", "Found http://www.schema.org/Event\n", "Event: found 47 properties\n" ] } ], "source": [ "class Thing(metaclass=SchemaOrg): pass\n", "class Event(metaclass=SchemaOrg): pass" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Specification base set to http://www.schema.org\n", "Using Version 3.5\n", "Found http://www.schema.org/Person\n", "Person: found 69 properties\n" ] } ], "source": [ "class Person(metaclass=SchemaOrg): pass" ] } ], "metadata": { "kernelspec": { "display_name": "omnipotent (Python 3.7)", "language": "python", "name": "omnipotent" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 4 }