# Define and validate schema.org structured data in Python with Pydantic 

In [1]:
import pydantic
from pydantic import BaseModel, Schema
from pydantic.main import MetaModel
from schemaorg import main as schemaorg

In [2]:
class Thing(BaseModel):
    """The most generic type of item."""

    # Every field below is required: `...` is passed as the default.
    additionalType: str = Schema(
        ...,
        description=(
            "An additional type for the item, typically used for adding more "
            "specific types from external vocabularies in microdata syntax. "
            "This is a relationship between something and a class that the "
            "thing is in. In RDFa syntax, it is better to use the native RDFa "
            "syntax - the 'typeof' attribute - for multiple types. "
            "Schema.org tools may have only weaker understanding of extra "
            "types, in particular those defined externally."
        ),
        title='additionalType',
    )

    alternateName: str = Schema(
        ...,
        description="An alias for the item.",
        title='alternateName',
    )

    description: str = Schema(..., description="A description of the item.")

    disambiguatingDescription: str = Schema(
        ...,
        description=(
            "A sub property of description. A short description of the item "
            "used to disambiguate from other, similar items. Information "
            "from other properties (in particular, name) may be necessary "
            "for the description to be useful for disambiguation."
        ),
    )

    class Config:
        # Top-level schema entries have to be declared here, on the inner
        # Config class, rather than as model fields.
        title = 'Thing'
        schema_extra = {
            '$schema': 'https://schema.org',
            '$id': 'https://schema.org/Thing',
        }

I want to enforce that these top-level attributes are always set in the inner `Config` class, *and* I think the inner-class syntax is a bit ugly. Let's see if we can make it go away by using a metaclass.

In [3]:

class MetaSchema(MetaModel):
    """Metaclass that checks for four required class attributes:
    1. _id: the ID of the schema.
    2. _version: the version of the current schema.
    3. _title: the name of the schema.
    4. _schema: the value emitted under the '$schema' key.

    Each attribute must be present in the class body. Its value must be
    either None or of the expected type (str for _id, _title and _schema;
    float for _version).

    These attributes are mapped to pydantic.BaseModel's `Config` inner class
    for proper schema generation+validation. Any `Config` declared in the
    class body is replaced by the generated one.
    """

    def __new__(cls, name, base, dct):
        """Validate the required attributes, attach `Config`, build the class.

        Args:
            name: name of the class being created.
            base: tuple of base classes.
            dct: class-body namespace; a 'Config' entry is written into it.

        Raises:
            AttributeError: if any required attribute is absent from `dct`.
            TypeError: if a required attribute is neither None nor an
                instance of its expected type.
        """
        # Required attribute name -> type accepted for a non-None value.
        expected_types = {
            '_id': str,
            '_title': str,
            '_version': float,
            '_schema': str,
        }

        # Check that required keys are found.
        if any(key not in dct for key in expected_types):
            raise AttributeError('Required class attributes are missing from the {} class.'.format(name))

        # Check that keys are the proper types (None is always allowed).
        for key, expected in expected_types.items():
            value = dct[key]
            if value is not None and not isinstance(value, expected):
                raise TypeError('Check the class attributes types: "_id" must be a string, '
                                '"_version" must be a float, "_title" must be a string, '
                                'and "_schema" must be a string.')

        # Add a Config inner class to this Pydantic model.
        class Config:
            title = dct['_title']
            schema_extra = {
                '$id': dct['_id'],
                '$schema': dct['_schema'],
                'version': dct['_version']
            }

        dct['Config'] = Config
        return super().__new__(cls, name, base, dct)


class JsonSchema(BaseModel, metaclass=MetaSchema):
    """Base pydantic model for JSON schemas.

    MetaSchema requires these four attributes to exist on every class it
    builds; they are left as None here.
    """
    _id: str = None       # copied to schema_extra['$id'] by MetaSchema
    _version: float = None  # copied to schema_extra['version'] by MetaSchema
    _title: str = None    # copied to Config.title by MetaSchema
    _schema: str = None   # copied to schema_extra['$schema'] by MetaSchema

In [4]:
class Thing(JsonSchema):
    """The most generic type of item."""

    # Top-level schema attributes, checked by the MetaSchema metaclass.
    _id = 'https://schema.org/Thing'
    _version = 3.9
    _title = 'Thing'
    _schema = 'https://schema.org'

    # Every field below is required: `...` is passed as the default.
    additionalType: str = Schema(
        ...,
        description=(
            "An additional type for the item, typically used for adding more "
            "specific types from external vocabularies in microdata syntax. "
            "This is a relationship between something and a class that the "
            "thing is in. In RDFa syntax, it is better to use the native RDFa "
            "syntax - the 'typeof' attribute - for multiple types. "
            "Schema.org tools may have only weaker understanding of extra "
            "types, in particular those defined externally."
        ),
        title='additionalType',
    )

    alternateName: str = Schema(
        ...,
        description="An alias for the item.",
        title='alternateName',
    )

    description: str = Schema(..., description="A description of the item.")

    disambiguatingDescription: str = Schema(
        ...,
        description=(
            "A sub property of description. A short description of the item "
            "used to disambiguate from other, similar items. Information "
            "from other properties (in particular, name) may be necessary "
            "for the description to be useful for disambiguation."
        ),
    )

In [5]:
# Display the schema dict pydantic generates for the Thing model.
Thing.schema()

{'title': 'Thing',
 'description': 'The most generic type of item.',
 'type': 'object',
 'properties': {'additionalType': {'title': 'additionalType',
 'description': "An additional type for the item, typically used for adding more specific types from external vocabularies in microdata syntax. This is a relationship between something and a class that the thing is in. In RDFa syntax, it is better to use the native RDFa syntax - the 'typeof' attribute - for multiple types. Schema.org tools may have only weaker understanding of extra types, in particular those defined externally.",
 'type': 'string'},
 'alternateName': {'title': 'alternateName',
 'description': 'An alias for the item.',
 'type': 'string'},
 'description': {'title': 'Description',
 'description': 'A description of the item.',
 'type': 'string'},
 'disambiguatingDescription': {'title': 'Disambiguatingdescription',
 'description': 'A sub property of description. A short description of the item used to disambiguate from other,

## Validate a new object

What happens when we create an invalid object?

In [6]:
# Thing() is called with no field values; every field on Thing is declared
# with `...`, so this is expected to fail validation.
try: 
 thing = Thing()
except pydantic.ValidationError:
 # Only a short notice is printed; the error details are not shown.
 print("Object did not validate.")

Object did not validate.


Now let's try a valid object...

In [7]:
# Supply a string for each of the four fields declared on Thing.
thing = Thing(
 alternateName='New Thing',
 description='This is a new thing',
 disambiguatingDescription='This thing is unique.',
 additionalType='No additional type'
)

No error was raised.

## Autogenerate pydantic objects from schema.org

In [8]:
class SchemaOrg(MetaSchema):
    """Metaclass that fills in a pydantic model from a `schemaorg.Schema` lookup.

    The name of the class being created is passed to `schemaorg.Schema`. The
    returned object's `id`, `version`, `base` and `comment` become the
    `_id`, `_version`, `_schema` and `__doc__` of the new class, and each
    entry of its `_properties` mapping becomes a required `str` field.
    """

    def __new__(cls, name, base, dct):
        """Build the model class for the schema.org type called `name`.

        Args:
            name: class name; also the type name given to `schemaorg.Schema`.
            base: tuple of base classes; `BaseModel` is prepended to it.
            dct: class-body namespace. Its entries are kept, but any key
                generated here (metadata, `__doc__`, property fields)
                overrides an entry of the same name.
        """
        data = schemaorg.Schema(name)

        # Start from the class-body namespace so that `__module__`,
        # `__qualname__` and anything declared in the body are kept
        # instead of being silently discarded.
        namespace = dict(dct)
        annotations = dict(dct.get('__annotations__', {}))
        namespace.update(
            _title=name,
            _id=data.id,
            _version=float(data.version),
            _schema=data.base,
            __doc__=data.comment,
            __annotations__=annotations,
        )

        # Currently, sets all class variables to type==str for
        # demonstration purposes.
        # Need to develop datatypes for Schema.org objects.
        # NOTE(review): `_properties` is a private attribute of the
        # schemaorg object; each value is read via 'comment' and 'label'.
        for key, val in data._properties.items():
            namespace[key] = Schema(
                ...,
                description=val['comment'],
                title=val['label']
            )
            annotations[key] = str

        base = (BaseModel,) + base

        return super().__new__(cls, name, base, namespace)


In [9]:
# The bodies are intentionally empty: the SchemaOrg metaclass generates the
# class contents from the class name.
class Thing(metaclass=SchemaOrg):
    pass


class Event(metaclass=SchemaOrg):
    pass

Specification base set to http://www.schema.org
Using Version 3.5
Found http://www.schema.org/Thing
Thing: found 12 properties
Specification base set to http://www.schema.org
Using Version 3.5
Found http://www.schema.org/Event
Event: found 47 properties


In [10]:
# Empty body: the SchemaOrg metaclass generates the class contents from the
# class name.
class Person(metaclass=SchemaOrg):
    pass

Specification base set to http://www.schema.org
Using Version 3.5
Found http://www.schema.org/Person
Person: found 69 properties
